### city_embedding.ipynb

Contains the code for web-scraping and generating text embeddings for the CityEmbed dataset.

In [19]:
import requests
from bs4 import BeautifulSoup
import openai
import os
from openai.embeddings_utils import get_embedding, cosine_similarity
from tqdm import tqdm
import pandas as pd
import math
import numpy as np

In [2]:
# OpenAI key (needs to be provided)
os.environ["OPENAI_API_KEY"] = "XXXXXXXXXXXXXXXXXXX"

In [3]:
# using "List of cities by GDP" Wikipedia page
url = 'https://en.wikipedia.org/wiki/List_of_cities_by_GDP'

In [7]:
# requesting content
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# finding all tables in the HTML
tables = soup.find_all("table", class_="wikitable")

# selecting the second table
if len(tables) >= 2:
    table = tables[1]
else:
    raise ValueError("The page doesn't contain at least two tables.")

# extracting header
headers = [header.text.strip() for header in table.find_all("th")]

# adding new column
headers.append("city_wikipedia")

# extracting rows
rows = []
for row in table.find_all("tr")[1:]:
    data = [cell.text.strip() for cell in row.find_all("td")]

    # adding Wikipedia link
    city_link = row.find("a", href=True)
    city_wikipedia = "https://en.wikipedia.org" + city_link["href"] if city_link else None
    data.append(city_wikipedia)

    rows.append(data)
    
# saving as dataframe
cities = pd.DataFrame(rows, columns=headers)

In [8]:
cities

Unnamed: 0,Rank,City proper/metropolitan area,Country/region,UNSD\nsub‑region[4],Official est. GDP\nup to date\n(billion US$),Brookings[7] 2014 est.\nPPP-adjusted GDP\n(billion US$),Metropolitan population,Official est. GDP per capita,city_wikipedia
0,129,Aachen-Liège-Maastricht,Germany Belgium Netherlands,Western Europe,,99.728,"3,500,000 (2014)[8] metropolitan population fo...",28493.71,https://en.wikipedia.org/wiki/Aachen
1,300+,"Abbotsford, British Columbia",Canada,North America,6.141 (2019)[10],,"202,497 (2019)[11]",30321.44,"https://en.wikipedia.org/wiki/Abbotsford,_Brit..."
2,300+,Aberdeen,United Kingdom,Northern Europe,23.0 (2020)[12],,"489,840 (2020)[13]",46957.94,https://en.wikipedia.org/wiki/Aberdeen
3,300+,Abidjan,Ivory Coast,Africa,27 (2017)[14],,"5,950,000 (2022)[15]",4537.82,https://en.wikipedia.org/wiki/Abidjan
4,106,Abu Dhabi,United Arab Emirates,Western Asia,119 (2015)[16],178.256,"1,660,000 (2022)[15]",71686.75,https://en.wikipedia.org/wiki/Abu_Dhabi
...,...,...,...,...,...,...,...,...,...
446,94,Zhengzhou,China,Eastern Asia,144 (2018)[30],155.696,"8,950,000 (2022)[15]",16089.39,https://en.wikipedia.org/wiki/Zhengzhou
447,228,Zhongshan,China,Eastern Asia,51.1 (2017)[30],68.682,"4,418,060 (2020)[190]",11566.16,https://en.wikipedia.org/wiki/Zhongshan
448,287,Zhuhai,China,Eastern Asia,38.0 (2017)[30],41.338,"2,325,000 (2022)[15]",16344.09,https://en.wikipedia.org/wiki/Zhuhai
449,179,Zibo,China,Eastern Asia,70.8 (2017)[30],100.274,"3,725,000 (2022)[15]",19006.71,https://en.wikipedia.org/wiki/Zibo


In [9]:
def get_intro_text(wikipedia_url):
    response = requests.get(wikipedia_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # finding div containing paragraphs
    content_div = soup.find('div', class_='mw-body-content')

    # getting first <h2> element
    first_h2 = content_div.find('h2')

    # checking if an <h2> element was found
    if first_h2:
        # extracting all the paragraph elements before the first <h2>
        paragraphs = []
        for sibling in first_h2.find_all_previous():
            if sibling.name == 'p':
                paragraphs.append(sibling.get_text())
            elif sibling.name == 'h2':
                break

        # reversing the order of the paragraphs and join them into a single string
        intro_text = ''.join(reversed(paragraphs))
        
        # cleaning
        intro_text = intro_text.strip().replace('\n','')
        return intro_text
    
    else:
        return np.NaN

In [11]:
# printing sample output for Tokyo
get_intro_text('https://en.wikipedia.org/wiki/Tokyo')

'Tokyo (/ˈtoʊkioʊ/;[7] Japanese: 東京, Tōkyō, [toːkʲoː] (listen)), officially the Tokyo Metropolis (東京都, Tōkyō-to), is the capital and most populous city of Japan.[8] Formerly known as Edo, its metropolitan area (13,452 square kilometers or 5,194 square miles) is the most populous in the world, with an estimated 37.468\xa0million residents as of 2018[update];[9] the city proper has a population of 13.99\xa0million people.[4] Located at the head of Tokyo Bay, the prefecture forms part of the Kantō region on the central coast of Honshu, Japan\'s largest island. Tokyo serves as Japan\'s economic center and is the seat of both the Japanese government and the Emperor of Japan.Originally a fishing village named Edo, the city became politically prominent in 1603, when it became the seat of the Tokugawa shogunate. By the mid-18th century, Edo was one of the most populous cities in the world with a population of over one\xa0million people. Following the Meiji Restoration of 1868, the imperial cap

In [10]:
tqdm.pandas()

# getting wikipedia intro for each city
cities['wikipedia_intro'] = cities['city_wikipedia'].progress_apply(get_intro_text)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 451/451 [03:19<00:00,  2.26it/s]


In [12]:
def get_embedding_batch(df):
    items = df['wikipedia_intro'].tolist()

    # create batches of 1,000
    batch_size = 1000
    num_batches = math.ceil(len(items) / batch_size)
    embeddings = []
    
    for i in range(num_batches):
        batch_start = i * batch_size
        batch_end = min(batch_start + batch_size, len(items))
        batch = items[batch_start:batch_end]
        o = 0
        
        while True:
            try:
                out = openai.Embedding.create(
                model="text-embedding-ada-002",
                    input=batch)
                break
            
            except:
                if o > 3:
                    out = ["Error"] * 25
                    break
                    
                time.sleep(60)
                o += 1
                print("Error.")
        
        try:
            embeddings.extend(out['data'])
        
        except:
            embeddings.extend(out)
        
        print(f"{i}/{num_batches}")
        
    # add embeddings to the "ada_embeddings" column
    df['ada_embeddings'] = embeddings

    return df

In [13]:
# getting embeddings
cities = get_embedding_batch(cities)

0/1


In [14]:
# extracting embeddings from JSON
cities['embedding'] = cities['ada_embeddings'].apply(lambda x: x.embedding)

In [16]:
# renaming columns
cities['city'] = cities['City proper/metropolitan area']
cities['country'] = cities['Country/region']
cities['region'] = cities['UNSD\nsub‑region[4]']
cities['gdp_pc'] = cities['Official est. GDP per capita']
cities['wikipedia_url'] = cities['city_wikipedia']

# saving reduced dataframe
cities = cities[['city', 'country', 'region', 'gdp_pc', 'wikipedia_url', 'wikipedia_intro', 'embedding']]

In [17]:
cities

Unnamed: 0,city,country,region,gdp_pc,wikipedia_url,wikipedia_intro,embedding
0,Aachen-Liège-Maastricht,Germany Belgium Netherlands,Western Europe,28493.71,https://en.wikipedia.org/wiki/Aachen,Aachen (/ˈɑːxən/ AH-khən; German: [ˈaːxn̩] (li...,"[0.025970444083213806, 0.010382960550487041, 0..."
1,"Abbotsford, British Columbia",Canada,North America,30321.44,"https://en.wikipedia.org/wiki/Abbotsford,_Brit...",Abbotsford is a city located in British Columb...,"[0.002416889648884535, -0.01665973663330078, -..."
2,Aberdeen,United Kingdom,Northern Europe,46957.94,https://en.wikipedia.org/wiki/Aberdeen,Aberdeen (/ˌæbərˈdiːn/ (listen); Scots: Aiberd...,"[0.008810719475150108, -0.020789921283721924, ..."
3,Abidjan,Ivory Coast,Africa,4537.82,https://en.wikipedia.org/wiki/Abidjan,"Abidjan (/ˌæbɪˈdʒɑːn/ AB-ih-JAHN, French: [abi...","[-0.01380961760878563, -0.01290480513125658, 0..."
4,Abu Dhabi,United Arab Emirates,Western Asia,71686.75,https://en.wikipedia.org/wiki/Abu_Dhabi,"Abu Dhabi (UK: /ˈæbuːˈdæbi/, US: /ˈɑːbuːˈdɑːbi...","[-0.006597555708140135, -0.014655590988695621,..."
...,...,...,...,...,...,...,...
446,Zhengzhou,China,Eastern Asia,16089.39,https://en.wikipedia.org/wiki/Zhengzhou,"Zhengzhou (/dʒɛŋˈdʒoʊ, dʒʌŋ-/;[3] simplified C...","[0.0016842980403453112, -0.023999657481908798,..."
447,Zhongshan,China,Eastern Asia,11566.16,https://en.wikipedia.org/wiki/Zhongshan,Zhongshan ([ʈʂʊ́ŋ ʂán]; Chinese: 中山) is a pre...,"[0.0200781412422657, -0.00019213088671676815, ..."
448,Zhuhai,China,Eastern Asia,16344.09,https://en.wikipedia.org/wiki/Zhuhai,"Zhuhai (/ˈdʒuːˈhaɪ/,[3] Chinese: 珠海; pinyin: Z...","[0.0067623937502503395, -0.011905440129339695,..."
449,Zibo,China,Eastern Asia,19006.71,https://en.wikipedia.org/wiki/Zibo,"Zibo (Chinese: 淄博, tsee-PWOH) is a prefecture-...","[-0.015531720593571663, -0.024248503148555756,..."


In [18]:
# saving as csv
cities.to_csv('city_embedding_data.csv')