In [0]:
import pandas as pd
import json

In [8]:
# Read in data with carbon footprints of 500 cities (we will end with 336).
# The source file is tab-separated; the separator is spelled as the '\t'
# escape so it stays visible and survives editors that turn tabs into spaces.
footprint_data = pd.read_csv(
    'http://citycarbonfootprints.info/GGMCF_top500cities.txt',
    encoding='utf8',
    sep='\t',
    header=0,
)
footprint_data.head()

Unnamed: 0,Urban Cluster,Country,Footprint/cap (t CO2),Population,Footprint (Mt CO2),1 StdDev,ClusterID,Global ranking,Domestic ranking
0,Seoul,South Korea,13.0 &plusmn;2.4,21254000,276.1 &plusmn;51.8,51.8,13127,1,1
1,Guangzhou,China,6.1 &plusmn;1.0,44309000,272.0 &plusmn;46.2,46.2,12827,2,1
2,New York,USA,17.1 &plusmn;5.5,13648000,233.5 &plusmn;75.4,75.4,864,3,1
3,Hong Kong SAR,China,34.6 &plusmn;6.3,6029000,208.5 &plusmn;37.8,37.8,13847,4,1
4,Los Angeles,USA,14.6 &plusmn;3.2,13482000,196.4 &plusmn;43.7,43.7,15,5,2


In [0]:
def wrangle(df):

    '''Cleans the footprint_data df.

    Keeps only the city-name and total-footprint columns, renames them to
    ``urban_cluster`` and ``carbon_footprint``, and reduces each footprint
    entry such as ``'276.1 &plusmn;51.8'`` to its leading value ``'276.1'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``'Urban Cluster'`` and
        ``'Footprint (Mt CO2)'`` columns.

    Returns
    -------
    pandas.DataFrame
        A new two-column frame; the input frame is not modified.
        ``carbon_footprint`` is still text at this point, and missing
        entries stay missing instead of raising.

    Raises
    ------
    KeyError
        If either required column is absent from ``df``.
    '''

    # Select and rename by column name, so the result does not depend on
    # how many other columns the input has or on their order
    column_names = {
        'Urban Cluster': 'urban_cluster',
        'Footprint (Mt CO2)': 'carbon_footprint',
    }
    df = df[list(column_names)].rename(columns=column_names)

    # Keep only the text before the '&plusmn;' marker (the value itself,
    # without the uncertainty) and strip surrounding whitespace.
    # The vectorised .str methods propagate NaN, whereas calling
    # row.split(...) on a missing value would raise AttributeError.
    df['carbon_footprint'] = (
        df['carbon_footprint']
        .str.split('&plusmn;')
        .str[0]
        .str.strip()
    )

    return df

In [4]:
# Apply the footprint cleaning step; df_1 holds the two renamed columns
# (urban_cluster, carbon_footprint) that the merge cell joins on and keeps.
df_1 = wrangle(footprint_data)
df_1.head()

Unnamed: 0,urban_cluster,carbon_footprint
0,Seoul,276.1
1,Guangzhou,272.0
2,New York,233.5
3,Hong Kong SAR,208.5
4,Los Angeles,196.4


In [9]:
# Read in lat/lon data of thousands of cities.
# 'worldcities.csv' is a relative path, so the file has to be present in the
# notebook's working directory.
# NOTE(review): the file's origin/version is not recorded here — TODO add provenance.
lat_lon_data = pd.read_csv('worldcities.csv')
lat_lon_data.head()

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.685,139.7514,Japan,JP,JPN,Tōkyō,primary,35676000.0,1392685764
1,New York,New York,40.6943,-73.9249,United States,US,USA,New York,,19354922.0,1840034016
2,Mexico City,Mexico City,19.4424,-99.131,Mexico,MX,MEX,Ciudad de México,primary,19028000.0,1484247881
3,Mumbai,Mumbai,19.017,72.857,India,IN,IND,Mahārāshtra,admin,18978000.0,1356226629
4,São Paulo,Sao Paulo,-23.5587,-46.625,Brazil,BR,BRA,São Paulo,admin,18845000.0,1076532519


In [0]:
def wrangle_lat_lon(df):

    '''Renames the ``city`` column to ``urban_cluster``.

    The new name matches the city column of the cleaned footprint frame, so
    the two frames can be merged on ``urban_cluster``.

    Parameters
    ----------
    df : pandas.DataFrame
        City coordinates frame; expected to carry a ``city`` column.

    Returns
    -------
    pandas.DataFrame
        A renamed copy. The frame passed in keeps its original column names,
        so re-running the calling cell always starts from the same input.
    '''

    # Return a renamed copy rather than renaming in place: inplace=True
    # would silently alter the caller's frame as a side effect
    return df.rename(columns={'city': 'urban_cluster'})

In [12]:
# Rename `city` to `urban_cluster` so both frames share the merge key
df_2 = wrangle_lat_lon(lat_lon_data)
df_2.head()

Unnamed: 0,urban_cluster,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.685,139.7514,Japan,JP,JPN,Tōkyō,primary,35676000.0,1392685764
1,New York,New York,40.6943,-73.9249,United States,US,USA,New York,,19354922.0,1840034016
2,Mexico City,Mexico City,19.4424,-99.131,Mexico,MX,MEX,Ciudad de México,primary,19028000.0,1484247881
3,Mumbai,Mumbai,19.017,72.857,India,IN,IND,Mahārāshtra,admin,18978000.0,1356226629
4,São Paulo,Sao Paulo,-23.5587,-46.625,Brazil,BR,BRA,São Paulo,admin,18845000.0,1076532519


In [13]:
# Merge our two dataframes: attach coordinates to the footprint rows.
# how='right' keeps footprint cities that have no coordinate match (their
# lat/lng come out null). The key is the city name alone, so one footprint
# row can match several same-named cities; wrangle_merged drops the extras.
merged = pd.merge(
    left=df_2[['urban_cluster', 'lat', 'lng']],
    right=df_1,
    how='right',
    on='urban_cluster',
)
merged.head()

Unnamed: 0,urban_cluster,lat,lng,carbon_footprint
0,New York,40.6943,-73.9249,233.5
1,Mexico City,19.4424,-99.131,55.7
2,Mumbai,19.017,72.857,32.1
3,Shanghai,31.2165,121.4365,181.0
4,Kolkata,22.495,88.3247,42.9


In [0]:
def wrangle_merged(df):

    '''Wrangles the final df.

    Keeps the first row for each ``urban_cluster``, discards rows whose
    ``lat`` is null, and casts ``carbon_footprint`` to float.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged frame with ``urban_cluster``, ``lat`` and
        ``carbon_footprint`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched and the surviving rows keep
        their original index labels.
    '''

    # One row per city name (first occurrence wins)
    unique_cities = df.drop_duplicates(subset='urban_cluster')

    # Only cities that received coordinates; copy so the column assignment
    # below works on an independent frame
    has_latitude = unique_cities['lat'].notnull()
    located = unique_cities.loc[has_latitude].copy()

    # Footprints arrive as text; make them numeric
    located['carbon_footprint'] = located['carbon_footprint'].astype(float)

    return located

In [16]:
# Deduplicate, drop cities without coordinates and cast footprints to float
final = wrangle_merged(merged)

# Row/column count of the cleaned frame
print('Shape:', final.shape)

final.head()

Shape: (336, 4)


Unnamed: 0,urban_cluster,lat,lng,carbon_footprint
0,New York,40.6943,-73.9249,233.5
1,Mexico City,19.4424,-99.131,55.7
2,Mumbai,19.017,72.857,32.1
3,Shanghai,31.2165,121.4365,181.0
4,Kolkata,22.495,88.3247,42.9


In [20]:
# List the distinct city names that remain in the final frame
final['urban_cluster'].unique()

array(['New York', 'Mexico City', 'Mumbai', 'Shanghai', 'Kolkata',
       'Los Angeles', 'Dhaka', 'Buenos Aires', 'Karachi',
       'Rio de Janeiro', 'Beijing', 'Manila', 'Moscow', 'Istanbul',
       'Paris', 'Seoul', 'Lagos', 'Jakarta', 'Guangzhou', 'Chicago',
       'London', 'Lima', 'Tehran', 'Wuhan', 'Tianjin', 'Chennai',
       'Taipei', 'Lahore', 'Chongqing', 'Miami', 'Hyderabad', 'Dallas',
       'Santiago', 'Philadelphia', 'Belo Horizonte', 'Madrid', 'Houston',
       'Ho Chi Minh City', 'Atlanta', 'Toronto', 'Luanda', 'Baghdad',
       'Barcelona', 'Shenyang', 'Pune', 'Boston', 'Sydney',
       'Saint Petersburg', 'Riyadh', 'Hanoi', 'Guadalajara', 'Melbourne',
       'Alexandria', 'Chengdu', 'Phoenix', 'Porto Alegre', 'Ankara',
       'Monterrey', 'Nanjing', 'Guiyang', 'Recife', 'Seattle', 'Harbin',
       'Fortaleza', 'Detroit', 'Johannesburg', 'Berlin', 'Algiers',
       'Rome', 'Athens', 'Nagoya', 'Cape Town', 'San Diego', 'Changchun',
       'Casablanca', 'Dalian', 'Tel Av

In [0]:
# Drop keys, just get values: orient='values' serialises each row as a bare
# array with no column names. With no path argument, to_json returns the
# JSON text as a string.
# NOTE(review): `output` is not referenced in the cells visible here.
output = final.to_json(orient='values')

In [0]:
# Output to json for FE (relative paths, values only — no column names)
# NOTE(review): compression='gzip' is requested for the first file, yet its
# name ends in '.json' — confirm the consumer expects gzip-compressed content.
final.to_json('globe_data_compressed.json', orient='values', compression='gzip')
final.to_json('globe_data_uncompressed.json', orient='values')