In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import psycopg2
import sqlalchemy
import geopandas as gpd
import geoalchemy2
import configparser

## Fetching data from database

In [3]:
config = configparser.ConfigParser()
config.read("../../config/config.ini")    
db_params = dict(config['DB'])

conn_string = 'postgresql://{user}:{password}@{host}:{port}/{dbname}'.format(**db_params)
db = sqlalchemy.create_engine(conn_string)
conn = db.connect()
metadata = sqlalchemy.MetaData()

In [3]:
accessibility_table = sqlalchemy.Table('accessibility_stats', metadata, autoload=True, autoload_with=db)
#Equivalent to 'SELECT * FROM accessibility_stats'
query = sqlalchemy.select([accessibility_table]) 
accessibility = conn.execute(query).fetchall()

In [4]:
h3_demo_table = sqlalchemy.Table('h3demographics', metadata, autoload=True, autoload_with=db)
#Equivalent to 'SELECT * FROM h3demographics'
query = sqlalchemy.select([h3_demo_table]) 
h3_demo = conn.execute(query).fetchall()

In [5]:
conn.close()

## City accessibility comparision

In [6]:
acc_df = pd.DataFrame(accessibility)
acc_df

Unnamed: 0,id,cityid,categorytype,poi_category,timeofday,h3id,accessibility
0,1739617,3,Race,Schools and pre-schools,evening,8929a5659dbffff,2.107254
1,1739618,3,Race,Schools and pre-schools,evening,8929a565a03ffff,3.623966
2,1739619,3,Race,Schools and pre-schools,evening,8929a565a07ffff,1.772482
3,1739620,3,Race,Schools and pre-schools,evening,8929a565a0bffff,1.772482
4,1739621,3,Race,Schools and pre-schools,evening,8929a565a0fffff,1.772482
...,...,...,...,...,...,...,...
4615485,1739612,3,Race,Schools and pre-schools,evening,8929a56594fffff,0.620469
4615486,1739613,3,Race,Schools and pre-schools,evening,8929a565953ffff,0.620469
4615487,1739614,3,Race,Schools and pre-schools,evening,8929a56595bffff,1.509975
4615488,1739615,3,Race,Schools and pre-schools,evening,8929a5659c3ffff,2.107254


In [7]:
h3_demographic_df = pd.DataFrame(h3_demo)
h3_demographic_df

Unnamed: 0,cityid,categorytype,groupname,h3id,population,id
0,4,Age and Sex,Under 18 years female,8926645a963ffff,0.960630,1898645
1,4,Age and Sex,18 to 45 years female,8926645a963ffff,3.307087,1898646
2,4,Age and Sex,45 to 65 years female,8926645a963ffff,2.062992,1898647
3,4,Age and Sex,65 years and over female,8926645a963ffff,3.519685,1898648
4,4,Race,White,8926645a963ffff,15.574803,1898649
...,...,...,...,...,...,...
3419970,4,Race,American Indian and Alaska Native,8926645a963ffff,0.000000,1898640
3419971,4,Age and Sex,Under 18 years male,8926645a963ffff,1.133858,1898641
3419972,4,Age and Sex,18 to 45 years male,8926645a963ffff,1.834646,1898642
3419973,4,Age and Sex,45 to 65 years male,8926645a963ffff,1.543307,1898643


In [8]:
population_filter = h3_demographic_df["categorytype"] == "Age and Sex"
h3_demo_fltrd = h3_demographic_df[population_filter]
h3_demo_fltrd

Unnamed: 0,cityid,categorytype,groupname,h3id,population,id
0,4,Age and Sex,Under 18 years female,8926645a963ffff,0.960630,1898645
1,4,Age and Sex,18 to 45 years female,8926645a963ffff,3.307087,1898646
2,4,Age and Sex,45 to 65 years female,8926645a963ffff,2.062992,1898647
3,4,Age and Sex,65 years and over female,8926645a963ffff,3.519685,1898648
21,4,Age and Sex,Under 18 years male,8926645a937ffff,1.133858,1898666
...,...,...,...,...,...,...
3419953,4,Age and Sex,65 years and over female,8926645a967ffff,3.519685,1898623
3419971,4,Age and Sex,Under 18 years male,8926645a963ffff,1.133858,1898641
3419972,4,Age and Sex,18 to 45 years male,8926645a963ffff,1.834646,1898642
3419973,4,Age and Sex,45 to 65 years male,8926645a963ffff,1.543307,1898643


In [9]:
h3_pop_df = h3_demo_fltrd.groupby(["cityid", "h3id"]).sum()["population"].reset_index()[["h3id", "population"]]
h3_pop_df

Unnamed: 0,h3id,population
0,8944c100907ffff,256.800000
1,8944c100917ffff,266.777778
2,8944c100923ffff,256.800000
3,8944c100927ffff,123.909091
4,8944c10092fffff,256.800000
...,...,...
136794,8926cbd75cbffff,581.250000
136795,8926cbd75cfffff,229.666667
136796,8926cbd75d3ffff,70.400000
136797,8926cbd75d7ffff,639.000000


In [10]:
acc_df_mrgd = pd.merge(acc_df, h3_pop_df, how='inner', on = 'h3id')
acc_df_mrgd

Unnamed: 0,id,cityid,categorytype,poi_category,timeofday,h3id,accessibility,population
0,1739617,3,Race,Schools and pre-schools,evening,8929a5659dbffff,2.107254,0.000000
1,1762442,3,Age and Sex,Schools and pre-schools,evening,8929a5659dbffff,2.107254,0.000000
2,1785267,3,Income,Schools and pre-schools,evening,8929a5659dbffff,4.848244,0.000000
3,1808092,3,Origin,Schools and pre-schools,evening,8929a5659dbffff,2.107254,0.000000
4,1830917,3,Vehicle Availability,Schools and pre-schools,evening,8929a5659dbffff,4.848244,0.000000
...,...,...,...,...,...,...,...,...
4615485,3441508,4,Race,Grocery stores and supermarkets,evening,892664c8b73ffff,0.079719,42.516129
4615486,3452073,4,Age and Sex,Grocery stores and supermarkets,evening,892664c8b73ffff,0.079719,42.516129
4615487,3462638,4,Income,Grocery stores and supermarkets,evening,892664c8b73ffff,0.266190,42.516129
4615488,3473203,4,Origin,Grocery stores and supermarkets,evening,892664c8b73ffff,0.079719,42.516129


In [11]:
category_filter = acc_df_mrgd["categorytype"] == "Age and Sex"
time_filter = acc_df_mrgd["timeofday"] == "afternoon"
acc_df_mrgd_fltrd = acc_df_mrgd[category_filter & time_filter]
acc_df_mrgd_fltrd

Unnamed: 0,id,cityid,categorytype,poi_category,timeofday,h3id,accessibility,population
11,1647997,3,Age and Sex,Schools and pre-schools,afternoon,8929a5659dbffff,2.107254,0.000000
26,2587058,3,Age and Sex,Vaccination centers,afternoon,8929a565a03ffff,2.783466,0.000000
41,1896243,3,Age and Sex,Cinemas and Theaters,afternoon,8929a565a03ffff,0.195717,0.000000
56,2108145,3,Age and Sex,Grocery stores and supermarkets,afternoon,8929a565a03ffff,34.469766,0.000000
71,2346308,3,Age and Sex,Clinics and Hospitals,afternoon,8929a565a03ffff,0.347431,0.000000
...,...,...,...,...,...,...,...,...
4615421,3404285,4,Age and Sex,Grocery stores and supermarkets,afternoon,892759359b7ffff,0.870483,318.250000
4615456,3404736,4,Age and Sex,Grocery stores and supermarkets,afternoon,89275936a9bffff,0.527612,117.888889
4615466,3404908,4,Age and Sex,Grocery stores and supermarkets,afternoon,89275936e43ffff,0.527612,119.500000
4615476,3404909,4,Age and Sex,Grocery stores and supermarkets,afternoon,89275936e47ffff,0.527612,119.500000


In [12]:
city_pop_df = acc_df_mrgd_fltrd.groupby("cityid").sum()["population"].reset_index()
city_pop_df.columns = ["cityid", "citypop"]

In [13]:
acc_df_mrgd_fltrd["pop x acc"] = acc_df_mrgd_fltrd["accessibility"] * acc_df_mrgd_fltrd["population"]
acc_df_mrgd_fltrd

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  acc_df_mrgd_fltrd["pop x acc"] = acc_df_mrgd_fltrd["accessibility"] * acc_df_mrgd_fltrd["population"]


Unnamed: 0,id,cityid,categorytype,poi_category,timeofday,h3id,accessibility,population,pop x acc
11,1647997,3,Age and Sex,Schools and pre-schools,afternoon,8929a5659dbffff,2.107254,0.000000,0.000000
26,2587058,3,Age and Sex,Vaccination centers,afternoon,8929a565a03ffff,2.783466,0.000000,0.000000
41,1896243,3,Age and Sex,Cinemas and Theaters,afternoon,8929a565a03ffff,0.195717,0.000000,0.000000
56,2108145,3,Age and Sex,Grocery stores and supermarkets,afternoon,8929a565a03ffff,34.469766,0.000000,0.000000
71,2346308,3,Age and Sex,Clinics and Hospitals,afternoon,8929a565a03ffff,0.347431,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
4615421,3404285,4,Age and Sex,Grocery stores and supermarkets,afternoon,892759359b7ffff,0.870483,318.250000,277.031085
4615456,3404736,4,Age and Sex,Grocery stores and supermarkets,afternoon,89275936a9bffff,0.527612,117.888889,62.199556
4615466,3404908,4,Age and Sex,Grocery stores and supermarkets,afternoon,89275936e43ffff,0.527612,119.500000,63.049597
4615476,3404909,4,Age and Sex,Grocery stores and supermarkets,afternoon,89275936e47ffff,0.527612,119.500000,63.049597


In [14]:
acc_df_mrgd_fltrd = pd.merge(acc_df_mrgd_fltrd, city_pop_df, how='inner', on = 'cityid')
acc_df_mrgd_fltrd

Unnamed: 0,id,cityid,categorytype,poi_category,timeofday,h3id,accessibility,population,pop x acc,citypop
0,1647997,3,Age and Sex,Schools and pre-schools,afternoon,8929a5659dbffff,2.107254,0.000000,0.00000,4.636897e+07
1,2587058,3,Age and Sex,Vaccination centers,afternoon,8929a565a03ffff,2.783466,0.000000,0.00000,4.636897e+07
2,1896243,3,Age and Sex,Cinemas and Theaters,afternoon,8929a565a03ffff,0.195717,0.000000,0.00000,4.636897e+07
3,2108145,3,Age and Sex,Grocery stores and supermarkets,afternoon,8929a565a03ffff,34.469766,0.000000,0.00000,4.636897e+07
4,2346308,3,Age and Sex,Clinics and Hospitals,afternoon,8929a565a03ffff,0.347431,0.000000,0.00000,4.636897e+07
...,...,...,...,...,...,...,...,...,...,...
305654,1416337,2,Age and Sex,Restaurants,afternoon,892a103b067ffff,0.085901,0.000000,0.00000,3.411227e+07
305655,1418023,2,Age and Sex,Restaurants,afternoon,892a106e4b7ffff,0.104970,0.000000,0.00000,3.411227e+07
305656,816793,2,Age and Sex,Schools and pre-schools,afternoon,892a100c963ffff,0.053793,77.666667,4.17791,3.411227e+07
305657,816794,2,Age and Sex,Schools and pre-schools,afternoon,892a100c96bffff,0.053793,77.666667,4.17791,3.411227e+07


In [15]:
acc_df_mrgd_fltrd["pop x acc / sum pop"] = acc_df_mrgd_fltrd["pop x acc"]/acc_df_mrgd_fltrd["citypop"]
acc_df_mrgd_fltrd

Unnamed: 0,id,cityid,categorytype,poi_category,timeofday,h3id,accessibility,population,pop x acc,citypop,pop x acc / sum pop
0,1647997,3,Age and Sex,Schools and pre-schools,afternoon,8929a5659dbffff,2.107254,0.000000,0.00000,4.636897e+07,0.000000e+00
1,2587058,3,Age and Sex,Vaccination centers,afternoon,8929a565a03ffff,2.783466,0.000000,0.00000,4.636897e+07,0.000000e+00
2,1896243,3,Age and Sex,Cinemas and Theaters,afternoon,8929a565a03ffff,0.195717,0.000000,0.00000,4.636897e+07,0.000000e+00
3,2108145,3,Age and Sex,Grocery stores and supermarkets,afternoon,8929a565a03ffff,34.469766,0.000000,0.00000,4.636897e+07,0.000000e+00
4,2346308,3,Age and Sex,Clinics and Hospitals,afternoon,8929a565a03ffff,0.347431,0.000000,0.00000,4.636897e+07,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...
305654,1416337,2,Age and Sex,Restaurants,afternoon,892a103b067ffff,0.085901,0.000000,0.00000,3.411227e+07,0.000000e+00
305655,1418023,2,Age and Sex,Restaurants,afternoon,892a106e4b7ffff,0.104970,0.000000,0.00000,3.411227e+07,0.000000e+00
305656,816793,2,Age and Sex,Schools and pre-schools,afternoon,892a100c963ffff,0.053793,77.666667,4.17791,3.411227e+07,1.224753e-07
305657,816794,2,Age and Sex,Schools and pre-schools,afternoon,892a100c96bffff,0.053793,77.666667,4.17791,3.411227e+07,1.224753e-07


In [16]:
city_level_acc = acc_df_mrgd_fltrd.drop("id", axis=1).groupby(["cityid","poi_category"]).mean()
city_level_acc

Unnamed: 0_level_0,Unnamed: 1_level_0,accessibility,population,pop x acc,citypop,pop x acc / sum pop
cityid,poi_category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Cinemas and Theaters,0.991684,174.128047,146.211786,6598785.0,2.2e-05
1,Clinics and Hospitals,0.871586,145.693476,106.83069,6598785.0,1.6e-05
1,Grocery stores and supermarkets,1.819306,134.051893,220.791168,6598785.0,3.3e-05
1,Restaurants,6.180736,131.200579,884.615385,6598785.0,0.000134
1,Schools and pre-schools,3.701012,122.857171,394.736842,6598785.0,6e-05
1,Vaccination centers,2.802883,129.314866,330.15051,6598785.0,5e-05
2,Cinemas and Theaters,0.49252,825.851157,404.345202,34112270.0,1.2e-05
2,Clinics and Hospitals,0.725608,740.681457,486.191024,34112270.0,1.4e-05
2,Grocery stores and supermarkets,6.918257,685.475355,2822.250789,34112270.0,8.3e-05
2,Restaurants,13.156869,668.355132,6863.378243,34112270.0,0.000201


In [17]:
city_mapper = {1:"Atlanta", 2:"New York", 3:"Los Angeles", 4:"Chicago", 5:"Dallas"}

city_level_acc = city_level_acc.reset_index()
city_level_acc["city"] = city_level_acc["cityid"].map(city_mapper)
city_level_acc

Unnamed: 0,cityid,poi_category,accessibility,population,pop x acc,citypop,pop x acc / sum pop,city
0,1,Cinemas and Theaters,0.991684,174.128047,146.211786,6598785.0,2.2e-05,Atlanta
1,1,Clinics and Hospitals,0.871586,145.693476,106.83069,6598785.0,1.6e-05,Atlanta
2,1,Grocery stores and supermarkets,1.819306,134.051893,220.791168,6598785.0,3.3e-05,Atlanta
3,1,Restaurants,6.180736,131.200579,884.615385,6598785.0,0.000134,Atlanta
4,1,Schools and pre-schools,3.701012,122.857171,394.736842,6598785.0,6e-05,Atlanta
5,1,Vaccination centers,2.802883,129.314866,330.15051,6598785.0,5e-05,Atlanta
6,2,Cinemas and Theaters,0.49252,825.851157,404.345202,34112270.0,1.2e-05,New York
7,2,Clinics and Hospitals,0.725608,740.681457,486.191024,34112270.0,1.4e-05,New York
8,2,Grocery stores and supermarkets,6.918257,685.475355,2822.250789,34112270.0,8.3e-05,New York
9,2,Restaurants,13.156869,668.355132,6863.378243,34112270.0,0.000201,New York


In [18]:
city_level_acc.to_csv("city_level_stats.csv", index=False)

## Machine Learning

In [19]:
race_filter = h3_demographic_df["categorytype"] == "Race"
city_filter = h3_demographic_df["cityid"] == 1
h3_demo_race = h3_demographic_df[race_filter & city_filter]
h3_demo_race

Unnamed: 0,cityid,categorytype,groupname,h3id,population,id
2566,1,Race,Two or more races,8944ccd829bffff,0.184466,1901211
2567,1,Race,Some other race,8944ccd829bffff,0.194175,1901212
2568,1,Race,Native Hawaiian and Other Pacific Islander,8944ccd829bffff,0.000000,1901213
2569,1,Race,Asian,8944ccd829bffff,0.572816,1901214
2570,1,Race,American Indian and Alaska Native,8944ccd829bffff,0.106796,1901215
...,...,...,...,...,...,...
3415218,1,Race,Native Hawaiian and Other Pacific Islander,8944c1383b7ffff,0.000000,1893888
3415219,1,Race,Asian,8944c1383b7ffff,24.863636,1893889
3415220,1,Race,American Indian and Alaska Native,8944c1383b7ffff,3.136364,1893890
3415221,1,Race,Black or African American,8944c1383b7ffff,81.772727,1893891


In [20]:
# finding the race with majority population in each H3 hexagon
idx = h3_demo_race.groupby(["h3id"])['population'].transform(max) == h3_demo_race['population']
machine_learning_df = h3_demo_race[idx][["h3id", "groupname"]]
machine_learning_df

Unnamed: 0,h3id,groupname
2572,8944ccd829bffff,White
2597,8944ccd91a3ffff,White
2622,8944c123527ffff,White
2647,8944c12318fffff,White
2672,8944c13219bffff,White
...,...,...
3415121,8944c139d93ffff,Black or African American
3415146,8944c1b8267ffff,Black or African American
3415171,8944c1b836bffff,Black or African American
3415196,8944c106a5bffff,Black or African American


In [21]:
machine_learning_df = pd.merge(machine_learning_df, acc_df, how='inner', on = 'h3id')
category_filter = machine_learning_df["categorytype"] == "Race"
poi_filter = machine_learning_df["poi_category"] == "Vaccination centers"
time_filter = machine_learning_df["timeofday"] == "afternoon"

machine_learning_df = machine_learning_df[category_filter & poi_filter & time_filter]
machine_learning_df

Unnamed: 0,h3id,groupname,id,cityid,categorytype,poi_category,timeofday,accessibility
20,8944c12318fffff,White,482405,1,Race,Vaccination centers,afternoon,8.478673
65,8944c13219bffff,White,483699,1,Race,Vaccination centers,afternoon,2.988326
150,8944ccd824bffff,White,491869,1,Race,Vaccination centers,afternoon,2.566048
165,8944c12318bffff,White,482404,1,Race,Vaccination centers,afternoon,8.478673
210,8944ccd952fffff,White,492052,1,Race,Vaccination centers,afternoon,4.794596
...,...,...,...,...,...,...,...,...
783530,8944c1b826bffff,Black or African American,490674,1,Race,Vaccination centers,afternoon,0.383625
783570,8944c139d93ffff,Black or African American,485302,1,Race,Vaccination centers,afternoon,1.440069
783585,8944c1b8267ffff,Black or African American,490673,1,Race,Vaccination centers,afternoon,0.383625
783625,8944c1b836bffff,Black or African American,490713,1,Race,Vaccination centers,afternoon,8.593345


In [4]:
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize

In [23]:
x_data = machine_learning_df[["accessibility"]]
y_data = machine_learning_df["groupname"]
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, random_state=614, train_size=0.8)

In [24]:
# building a Random Forest model to predict the race with majority population based on accessibility, for each H3 hexagon
rf_clf = RandomForestClassifier(random_state=614)
rf_clf.fit(x_train,y_train)
y_predict_train = rf_clf.predict(x_train)
y_predict_test = rf_clf.predict(x_test)



In [25]:
train_accuracy = accuracy_score(y_train,y_predict_train)
train_accuracy

0.9047852520101981

In [26]:
test_accuracy = accuracy_score(y_test,y_predict_test)
test_accuracy

0.8266666666666667

In [27]:
print(classification_report(y_train, y_predict_train))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                            precision    recall  f1-score   support

         American Indian and Alaska Native       0.19      0.24      0.21        67
                                     Asian       0.77      0.35      0.48       179
                 Black or African American       0.93      0.93      0.93      5275
Native Hawaiian and Other Pacific Islander       0.00      0.00      0.00        54
                           Some other race       0.63      0.35      0.45       125
                         Two or more races       0.38      0.25      0.30        73
                                     White       0.91      0.94      0.92      4425

                                  accuracy                           0.90     10198
                                 macro avg       0.54      0.44      0.47     10198
                              weighted avg       0.90      0.90      0.90     10198



  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
print(classification_report(y_test, y_predict_test))

                                            precision    recall  f1-score   support

         American Indian and Alaska Native       0.00      0.00      0.00        10
                                     Asian       0.35      0.16      0.22        37
                 Black or African American       0.86      0.85      0.86      1320
Native Hawaiian and Other Pacific Islander       0.00      0.00      0.00        11
                           Some other race       0.38      0.11      0.17        45
                         Two or more races       0.00      0.00      0.00        18
                                     White       0.83      0.87      0.85      1109

                                  accuracy                           0.83      2550
                                 macro avg       0.35      0.29      0.30      2550
                              weighted avg       0.82      0.83      0.82      2550



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
