Background

* [Cost of living in San Francisco (Numbeo)](https://www.numbeo.com/cost-of-living/in/San-Francisco)
* [Indexes explained](https://www.numbeo.com/cost-of-living/cpi_explained.jsp)
* [Numbeo API documentation](https://www.numbeo.com/api/doc.jsp)

In [1]:
import pandas as pd
import numpy as np

from bokeh.io import output_notebook
from bokeh.plotting import show, ColumnDataSource, figure
from bokeh.charts import Scatter
from bokeh.models import LabelSet

# Configure bokeh so that later show() calls render their output inline in this notebook.
output_notebook()

In [2]:
# Load the cost-of-living index table (2019 snapshot, going by the file name).
# The path is relative, so the notebook must run from the directory that contains data/.
cities = pd.read_csv("data/cost_of_living_index_2019.csv")

In [3]:
# The raw "City" label is split on ", ": the first piece is kept as the city and
# the last piece becomes the country (presumably labels look like
# "<city>, [<state>, ]<country>" -- confirm against the CSV).
#
# The guard makes the cell idempotent: once "City" has been overwritten with the
# bare city name, running the split again would copy that city name into "Country".
if "Country" not in cities.columns:
    # Derive the country first, while "City" still holds the full label.
    # The vectorized .str accessor leaves missing labels as NaN instead of raising.
    cities["Country"] = cities["City"].str.split(", ").str[-1]
    cities["City"] = cities["City"].str.split(", ").str[0]

In [4]:
# Preview the first five rows to check the City / Country split.
cities.head(n=5)

Unnamed: 0,Region,City,Cost of Living Index,Rent Index,Cost of Living Plus Rent Index,Groceries Index,Restaurant Price Index,Local Purchasing Power Index,Country
0,Africa,Harare,59.36,13.57,37.28,53.64,42.28,27.69,Zimbabwe
1,Africa,Pretoria,47.17,15.29,31.8,33.88,40.99,81.05,South Africa
2,Africa,Johannesburg,45.06,17.0,31.53,35.57,41.78,80.86,South Africa
3,Africa,Nairobi,43.68,13.76,29.25,39.25,35.57,25.99,Kenya
4,Africa,Cape Town,42.71,25.29,34.31,35.01,41.47,82.41,South Africa


In [5]:
# List the available columns; the plotting cells below refer to them by name.
cities.columns

Index(['Region', 'City', 'Cost of Living Index', 'Rent Index',
       'Cost of Living Plus Rent Index', 'Groceries Index',
       'Restaurant Price Index', 'Local Purchasing Power Index', 'Country'],
      dtype='object')

In [6]:
# https://stackoverflow.com/questions/43983039/color-points-in-scatter-plot-of-bokeh
# http://bokeh.pydata.org/en/latest/docs/user_guide/annotations.html#legends
import bokeh.models as bmo
from bokeh.palettes import d3

# Distinct regions, in order of first appearance; computed once and reused below.
regions = list(cities['Region'].unique())

# d3['Category10'] is keyed by palette size and only provides sizes 3..10, so
# indexing it with the raw region count fails with a bare KeyError for one or
# two regions. Slicing the 10-color palette gives the same colors for 3..10
# regions and also works for fewer.
if len(regions) > 10:
    raise ValueError(
        "Category10 provides at most 10 colors but the data has "
        "%d regions; choose a larger palette." % len(regions)
    )

# use whatever palette you want...
palette = d3['Category10'][10][:len(regions)]

# Maps each value of the 'Region' column to its palette color on the client side.
color_map = bmo.CategoricalColorMapper(
                factors=regions,
                palette=palette)

In [7]:
# Rent index against cost-of-living index on log-log axes: one marker per row
# of `cities`, colored by region and annotated with the city name.
p = figure(width=1000, height=1000, tools='save',
           x_axis_type="log", y_axis_type="log")

# A single data source feeds both the markers and their labels, so the table
# is only embedded in the bokeh document once.
cities_source = ColumnDataSource(cities)

p.circle(
    x='Cost of Living Index',
    y='Rent Index',
    size=5,
    source=cities_source,
    color={'field': 'Region', 'transform': color_map},
    legend='Region',
    alpha=1.,
)
p.xaxis[0].axis_label = 'Cost of Living Index'
p.yaxis[0].axis_label = 'Rent Index'

# City-name labels, nudged 5 px up and to the right of each marker.
labels = LabelSet(
            x='Cost of Living Index',
            y='Rent Index',
            text='City',
            level='glyph',
            x_offset=5,
            y_offset=5,
            text_font_size="4pt",
            source=cities_source,
            render_mode='canvas')

p.add_layout(labels)

show(p)

In [29]:
def plot(x_col, y_col, cities=cities, log=True):
    """Show a 1000x1000 bokeh scatter of ``y_col`` against ``x_col``.

    Every row of ``cities`` becomes one marker, colored by its ``'Region'``
    value through the notebook-level ``color_map`` and annotated with its
    ``'City'`` value.

    Parameters
    ----------
    x_col, y_col : str
        Column names plotted on the x and y axes; also used as the axis labels.
    cities : table accepted by ``ColumnDataSource``
        Data to plot; it is expected to provide ``x_col``, ``y_col``,
        ``'Region'`` and ``'City'``. The default is the object that the global
        ``cities`` referred to when this ``def`` was executed: columns added to
        that object in place are picked up, but rebinding the global name to a
        new frame is not.
    log : bool
        If true (default) both axes are logarithmic, otherwise linear.

    Returns
    -------
    None
        The figure is rendered with ``show`` as a side effect.
    """
    # Only the axis types differ between the log and linear variants.
    axis_types = {"x_axis_type": "log", "y_axis_type": "log"} if log else {}
    p = figure(width=1000, height=1000, **axis_types)

    # One source shared by markers and labels, so the data is embedded once.
    source = ColumnDataSource(cities)

    p.circle(
        x=x_col,
        y=y_col,
        size=5,
        source=source,
        color={'field': 'Region', 'transform': color_map},
        legend='Region',
        alpha=1.,
    )
    p.xaxis[0].axis_label = x_col
    p.yaxis[0].axis_label = y_col

    # City-name labels, nudged 2 px up and to the right of each marker.
    labels = LabelSet(
                x=x_col,
                y=y_col,
                text='City',
                level='glyph',
                x_offset=2,
                y_offset=2,
                text_font_size="6pt",
                source=source,
                render_mode='canvas')

    p.add_layout(labels)

    show(p)

In [9]:
# Restaurant prices against grocery prices (log-log axes by default).
plot(x_col='Groceries Index', y_col='Restaurant Price Index')

In [10]:
# Restaurant price index expressed as a multiple of the groceries index,
# stored as a new column on `cities` and plotted against the groceries index.
cities['Relative restaurant to groceries'] = cities['Restaurant Price Index'].div(
    cities['Groceries Index']
)

plot(x_col='Groceries Index', y_col='Relative restaurant to groceries')

In [11]:
# Local purchasing power against the combined cost-of-living-plus-rent index.
plot(x_col='Cost of Living Plus Rent Index', y_col='Local Purchasing Power Index')

In [12]:
# Rent index expressed as a multiple of the cost-of-living index,
# stored as a new column on `cities` and plotted against the cost-of-living index.
cities['Relative rent to cost of living'] = cities['Rent Index'].div(
    cities['Cost of Living Index']
)

plot(x_col='Cost of Living Index', y_col='Relative rent to cost of living')

In [13]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [15]:
# Re-check the column list, which now also contains the two derived ratio columns.
cities.columns

Index(['Region', 'City', 'Cost of Living Index', 'Rent Index',
       'Cost of Living Plus Rent Index', 'Groceries Index',
       'Restaurant Price Index', 'Local Purchasing Power Index', 'Country',
       'Relative restaurant to groceries', 'Relative rent to cost of living'],
      dtype='object')

In [30]:
# Feature table for the embeddings: the six index columns, each divided by 100
# (presumably the index baseline -- confirm) and put on a log2 scale.
city_data = (
    cities[[
        'Cost of Living Index',
        'Rent Index',
        'Cost of Living Plus Rent Index',
        'Groceries Index',
        'Restaurant Price Index',
        'Local Purchasing Power Index',
    ]]
    .div(100.)
    .pipe(np.log2)
)

In [26]:
# Reduce the log-scaled index table to its first two principal components.
pca = PCA(n_components=2)
# pcaed holds one row per row of city_data and one column per component.
pcaed = pca.fit_transform(city_data)

In [27]:
# Attach the two principal-component scores to `cities` in place: plot() bound
# this very object as its default frame, so rebinding the name would hide the
# new columns from it.
cities["pc1"], cities["pc2"] = pcaed[:, 0], pcaed[:, 1]

In [31]:
# Principal-component scores can be negative, so use linear axes.
plot(x_col='pc1', y_col='pc2', log=False)

In [34]:
# 2-D t-SNE embedding of the log-scaled index table.
# t-SNE starts from a random initialization, so the seed is pinned to make the
# figure reproducible across kernel restarts.
tsne = TSNE(perplexity=30., random_state=0)
tsned = tsne.fit_transform(city_data)

# Store the embedding on `cities` in place so plot() sees the new columns.
cities["tsne1"] = tsned[:, 0]
cities["tsne2"] = tsned[:, 1]

# Embedding coordinates can be negative, so use linear axes.
plot('tsne1', 'tsne2', log=False)

In [None]:
# Open question: would the embedding be more informative on normalized data?

In [43]:
# Raw values of the six index columns.
city_data = cities[[
    'Cost of Living Index',
    'Rent Index',
    'Cost of Living Plus Rent Index',
    'Groceries Index',
    'Restaurant Price Index',
    'Local Purchasing Power Index',
]]

# Normalized variant: every index is divided row-wise by the city's combined
# cost-of-living-plus-rent index, then everything is put on a log2 scale.
city_data_n = (
    city_data
    .div(city_data['Cost of Living Plus Rent Index'], axis='index')
    # The division turned the combined index itself into a constant 1;
    # restore its raw values so the overall level is still represented.
    .assign(**{'Cost of Living Plus Rent Index': city_data['Cost of Living Plus Rent Index']})
    .pipe(np.log2)
)

In [45]:
# 2-D t-SNE embedding of the normalized table, with a smaller perplexity.
# t-SNE starts from a random initialization, so the seed is pinned to make the
# figure reproducible across kernel restarts.
tsne = TSNE(perplexity=10., random_state=0)
tsned = tsne.fit_transform(city_data_n)

# Note: this overwrites any existing "tsne1" / "tsne2" columns on `cities`.
cities["tsne1"] = tsned[:, 0]
cities["tsne2"] = tsned[:, 1]

# Embedding coordinates can be negative, so use linear axes.
plot('tsne1', 'tsne2', log=False)