In [14]:
from bs4 import BeautifulSoup
import pandas as pd
from PIL import Image
import requests
from io import BytesIO
import numpy as np

In [15]:
def process_img(url, size=(180, 90)):
    """Download an image and return it as a fixed-size RGB pixel array.

    Parameters
    ----------
    url : str
        Address of the image to download.
    size : tuple of int, optional
        Target ``(width, height)`` in pixels handed to ``Image.resize``.
        Defaults to ``(180, 90)``.

    Returns
    -------
    numpy.ndarray
        The pixels produced by ``np.array`` on the resized RGB image. With
        the default size this has shape ``(90, 180, 3)`` because Pillow sizes
        are (width, height) while the array is laid out rows-first.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    # A timeout prevents one stalled download from hanging the whole scrape.
    response = requests.get(url, timeout=30)
    # Fail loudly on an error page instead of passing HTML to the image decoder.
    response.raise_for_status()

    img = Image.open(BytesIO(response.content))

    # Normalise palette / RGBA / greyscale sources to three channels so every
    # returned array has the same number of channels.
    img = img.convert("RGB")

    # Resize so all images share the same dimensions.
    img = img.resize(size)

    return np.array(img)

In [16]:
def get_flags():
    """Scrape Wikipedia's gallery of sovereign state flags into a DataFrame.

    Every ``<li class="gallerybox">`` entry on the gallery page contributes
    one row: the country name is taken from the caption link's ``title``
    attribute (the text after ``"Flag of "``, title-cased) and the flag image
    is downloaded and converted by ``process_img``.

    Progress is printed after every 25 flags, followed by ``Done!``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``country`` with a single ``flag`` column holding the
        pixel array returned by ``process_img`` for each flag.

    Raises
    ------
    requests.HTTPError
        If the gallery page request returns a 4xx/5xx status.
    """
    response = requests.get(
        "https://en.wikipedia.org/wiki/Gallery_of_sovereign_state_flags",
        timeout=30,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    flags = soup.find_all("li", class_="gallerybox")

    # Collect rows in a plain list and build the frame once at the end.
    # DataFrame.append was removed in pandas 2.0, and appending inside a loop
    # copies the whole frame on every iteration.
    records = []

    for i, flag in enumerate(flags, start=1):
        # The page's src attributes are protocol-relative ("//upload..."),
        # hence the explicit scheme prefix.
        img_url = "https:" + flag.find("img").get("src")
        img = process_img(img_url)

        # Caption link titles look like "Flag of <country>".
        country = flag.find("div", class_="gallerytext").find("a").get("title")
        country = country.split("Flag of ")[1].title()

        records.append({"country": country, "flag": img})

        if i % 25 == 0:
            print("finished", i, "flags")

    print("Done!")
    # Passing the columns explicitly keeps the schema intact even when the
    # page yields no gallery entries.
    flag_df = pd.DataFrame(records, columns=["country", "flag"])
    return flag_df.set_index("country")

In [17]:
# Build the flag table. This downloads the gallery page plus one image per
# flag, so the cell needs network access and takes a while to finish.
flag_df = get_flags()

finished 25 flags
finished 50 flags
finished 75 flags
finished 100 flags
finished 125 flags
finished 150 flags
finished 175 flags
finished 200 flags
Done!


In [18]:
# Preview the first rows: the index is the country name and each "flag" cell
# holds that flag's pixel array (pandas abbreviates it in the display).
flag_df.head()

Unnamed: 0_level_0,flag
country,Unnamed: 1_level_1
Afghanistan,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."
Albania,"[[[255, 0, 0], [255, 0, 0], [255, 0, 0], [255,..."
Algeria,"[[[0, 98, 51], [0, 98, 51], [0, 98, 51], [0, 9..."
Andorra,"[[[0, 24, 168], [0, 24, 168], [0, 24, 168], [0..."
Angola,"[[[204, 9, 47], [204, 9, 47], [204, 9, 47], [2..."


In [19]:
# Convert each pixel array to a complete nested list before writing.
# Writing the ndarray objects directly stores their string form, which numpy
# abbreviates with "..." once an array exceeds its print threshold (1000
# elements by default), so the saved pixels could not be reconstructed.
# To load a cell back: np.array(ast.literal_eval(cell)).
flag_df.assign(
    flag=flag_df["flag"].map(lambda pixels: pixels.tolist())
).to_csv("flag_df.csv")