<a href="https://colab.research.google.com/github/rcugarte/cityembed/blob/main/visualizations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CityEmbed Visualizations

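This notebook projects the CityEmbed city embeddings into two dimensions with UMAP and plots them interactively, first for every city in the dataset and then for cities in the United States only. See the full GitHub repository [here](https://github.com/rcugarte/cityembed).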

### Data Preparation

In [20]:
import pandas as pd
import ast
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.io as pio
import umap.umap_ as umap
import matplotlib.pyplot as plt
import plotly.colors as pc
import plotly.express as px

In [6]:
# location of the city embedding table in the project's GitHub repository
url = 'https://raw.githubusercontent.com/rcugarte/cityembed/master/city_embedding_data.csv'

# download the CSV and parse it into a DataFrame
cities = pd.read_csv(filepath_or_buffer=url)

In [7]:
# preview the loaded table; the rendered output is truncated to the first and last rows
cities

Unnamed: 0.1,Unnamed: 0,city,country,region,gdp_pc,wikipedia_url,wikipedia_intro,embedding
0,0,Aachen-Liège-Maastricht,Germany Belgium Netherlands,Western Europe,28493.71,https://en.wikipedia.org/wiki/Aachen,Aachen (/ˈɑːxən/ AH-khən; German: [ˈaːxn̩] (li...,"[0.025970444083213806, 0.010382960550487041, 0..."
1,1,"Abbotsford, British Columbia",Canada,North America,30321.44,"https://en.wikipedia.org/wiki/Abbotsford,_Brit...",Abbotsford is a city located in British Columb...,"[0.002416889648884535, -0.01665973663330078, -..."
2,2,Aberdeen,United Kingdom,Northern Europe,46957.94,https://en.wikipedia.org/wiki/Aberdeen,Aberdeen (/ˌæbərˈdiːn/ (listen); Scots: Aiberd...,"[0.008810719475150108, -0.020789921283721924, ..."
3,3,Abidjan,Ivory Coast,Africa,4537.82,https://en.wikipedia.org/wiki/Abidjan,"Abidjan (/ˌæbɪˈdʒɑːn/ AB-ih-JAHN, French: [abi...","[-0.01380961760878563, -0.01290480513125658, 0..."
4,4,Abu Dhabi,United Arab Emirates,Western Asia,71686.75,https://en.wikipedia.org/wiki/Abu_Dhabi,"Abu Dhabi (UK: /ˈæbuːˈdæbi/, US: /ˈɑːbuːˈdɑːbi...","[-0.006597555708140135, -0.014655590988695621,..."
...,...,...,...,...,...,...,...,...
446,446,Zhengzhou,China,Eastern Asia,16089.39,https://en.wikipedia.org/wiki/Zhengzhou,"Zhengzhou (/dʒɛŋˈdʒoʊ, dʒʌŋ-/;[3] simplified C...","[0.0016842980403453112, -0.023999657481908798,..."
447,447,Zhongshan,China,Eastern Asia,11566.16,https://en.wikipedia.org/wiki/Zhongshan,Zhongshan ([ʈʂʊ́ŋ ʂán]; Chinese: 中山) is a pre...,"[0.0200781412422657, -0.00019213088671676815, ..."
448,448,Zhuhai,China,Eastern Asia,16344.09,https://en.wikipedia.org/wiki/Zhuhai,"Zhuhai (/ˈdʒuːˈhaɪ/,[3] Chinese: 珠海; pinyin: Z...","[0.0067623937502503395, -0.011905440129339695,..."
449,449,Zibo,China,Eastern Asia,19006.71,https://en.wikipedia.org/wiki/Zibo,"Zibo (Chinese: 淄博, tsee-PWOH) is a prefecture-...","[-0.015531720593571663, -0.024248503148555756,..."


In [11]:
# The CSV stores each embedding as the text of a Python list literal, so parse
# it into a real list. Only string cells are parsed: values that are already
# lists are passed through unchanged, which makes this cell safe to re-run
# (ast.literal_eval raises ValueError when handed a list instead of a string).
cities['embedding'] = cities['embedding'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

### Global UMAP Visualization

In [14]:
# turn the embedding column and the city-name column into NumPy arrays
embeddings, names = (
    np.array(cities[column].to_list()) for column in ('embedding', 'city')
)

In [21]:
# perform UMAP on the embeddings; UMAP is stochastic, so random_state is fixed
# to make the layout reproducible across kernel restarts
umap_result = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    metric='euclidean',
    random_state=42,
).fit_transform(embeddings)

# map categories to integer codes (pd.factorize uses the sentinel code -1 for
# missing values by default)
cat_codes, cat_names = pd.factorize(cities['region'])

# map every category code to a color. The dictionary is keyed by the code
# itself and only the palette index wraps around, so regions beyond the palette
# length still get a color instead of silently mapping to NaN.
colors = px.colors.qualitative.Plotly
color_dict = {code: colors[code % len(colors)] for code in range(len(cat_names))}

# one color per city; rows without a region (code -1) get a neutral fallback
point_colors = [color_dict.get(int(code), 'lightgray') for code in cat_codes]

# create a scatter plot of the UMAP results
scatter = go.Scatter(
    x=umap_result[:, 0],
    y=umap_result[:, 1],
    mode='markers',
    marker=dict(
        size=5,
        sizemode='diameter',
        color=point_colors,
        opacity=0.8,
        showscale=False
    ),
    text=names,
    hoverinfo='text',
    showlegend=False
)

# set plot title and axis labels
layout = go.Layout(
    title="UMAP City Embeddings Visualization (Global)",
    xaxis=dict(title="UMAP Component 1"),
    yaxis=dict(title="UMAP Component 2"),
    hovermode='closest',
    legend=dict(title='Region')
)

# create the figure
fig = go.Figure(data=[scatter], layout=layout)

# add legend with discrete colors and labels: one empty placeholder trace per
# region, since the data trace itself is hidden from the legend
for code, name in enumerate(cat_names):
    fig.add_trace(
        go.Scatter(
            x=[None],
            y=[None],
            mode='markers',
            marker=dict(size=10, color=color_dict[code]),
            showlegend=True,
            name=name,
        )
    )

# show the plot
pio.show(fig)

### United States UMAP Visualization

In [23]:
# keep only the rows whose country field is exactly 'United States'
us_cities = cities.loc[cities['country'].eq('United States')]

In [24]:
# preview the US-only subset; the rendered output is truncated to the first and last rows
us_cities

Unnamed: 0.1,Unnamed: 0,city,country,region,gdp_pc,wikipedia_url,wikipedia_intro,embedding
7,7,Akron,United States,North America,58655.89,https://en.wikipedia.org/wiki/Akron,Akron (/ˈækrən/) is the fifth-largest city in ...,"[0.013709683902561665, -0.0066631874069571495,..."
8,8,Albany,United States,North America,82287.50,"https://en.wikipedia.org/wiki/Albany,_New_York",Albany (/ˈɔːlbəni/ (listen) AWL-bən-ee) is the...,"[0.02120584063231945, -0.0038955307099968195, ..."
9,9,Albuquerque,United States,North America,53884.58,https://en.wikipedia.org/wiki/Albuquerque,Albuquerque (/ˈælbəˌkɜːrki/ (listen) AL-bə-kur...,"[-0.01403997652232647, 0.0021814152132719755, ..."
11,11,Allentown,United States,North America,58892.19,"https://en.wikipedia.org/wiki/Allentown,_Penns...","Allentown (Pennsylvania Dutch: Allenschteddel,...","[-0.009336583316326141, 0.000967484200373292, ..."
19,19,Atlanta,United States,North America,77118.51,https://en.wikipedia.org/wiki/Atlanta,Atlanta (/ætˈlæntə/ at-LAN-tə) is the capital ...,"[-0.01681656762957573, -0.0009408590267412364,..."
...,...,...,...,...,...,...,...,...
416,416,Tulsa,United States,North America,58975.30,https://en.wikipedia.org/wiki/Tulsa,Tulsa (/ˈtʌlsə/) is the second-largest city in...,"[-0.01718195527791977, -0.01315036416053772, 0..."
426,426,Virginia Beach,United States,North America,59373.56,"https://en.wikipedia.org/wiki/Virginia_Beach,_...",Virginia Beach is an independent city located ...,"[-0.025677744299173355, -0.029091402888298035,..."
431,431,"Washington, D.C.",United States,North America,95592.91,"https://en.wikipedia.org/wiki/Washington,_D.C.","Washington, D.C., formally the District of Col...","[-0.00948111992329359, 0.004734338726848364, -..."
433,433,Wichita,United States,North America,62646.72,"https://en.wikipedia.org/wiki/Wichita,_Kansas",Wichita (/ˈwɪtʃɪtɔː/ WITCH-ih-taw)[10] is the ...,"[-0.014312542043626308, -0.006363625638186932,..."


In [25]:
# turn the US subset's embedding column and city-name column into NumPy arrays
us_embeddings, us_names = (
    np.array(us_cities[column].to_list()) for column in ('embedding', 'city')
)

In [29]:
# perform UMAP on the US embeddings; UMAP is stochastic, so random_state is
# fixed to make the layout reproducible across kernel restarts
umap_result = umap.UMAP(
    n_neighbors=5,
    min_dist=0.1,
    metric='euclidean',
    random_state=42,
).fit_transform(us_embeddings)

# create a scatter plot of the UMAP results (single color: every point is a US city)
scatter = go.Scatter(
    x=umap_result[:, 0],
    y=umap_result[:, 1],
    mode='markers',
    marker=dict(
        size=5,
        sizemode='diameter',
        color='red',
        opacity=0.8,
        showscale=False
    ),
    text=us_names,
    hoverinfo='text',
    showlegend=False
)

# set plot title and axis labels
layout = go.Layout(
    title="UMAP City Embeddings Visualization (United States)",
    xaxis=dict(title="UMAP Component 1"),
    yaxis=dict(title="UMAP Component 2"),
    hovermode='closest',
    legend=dict(title='Region')
)

# create the figure
fig = go.Figure(data=[scatter], layout=layout)

# show the plot
pio.show(fig)