<h1>Visualizing the Outputs</h1>

In [None]:
# For reproducibility: seed Python's and NumPy's global random number generators
# with the same fixed value before anything else runs.
import random
import numpy as np
r_state = 42
random.seed(r_state) 
np.random.seed(r_state)

# NOTE(review): os and re are imported but not referenced in the visible cells.
import os
import re
import pandas as pd
import seaborn as sns

# Needed on a Mac
# NOTE(review): the TkAgg backend is selected here and `%matplotlib inline` is issued
# on the very next line -- confirm which backend is actually meant to be active.
import matplotlib as mpl
mpl.use('TkAgg')
%matplotlib inline
import matplotlib.pyplot as plt

# NOTE(review): ListedColormap is not referenced in the visible cells.
from matplotlib.colors import ListedColormap

<h2>Create Grid Styles</h2>

In [None]:
# Global look-and-feel shared by every figure in the notebook.
sns.set_style('whitegrid')  # other seaborn styles: darkgrid, dark, white, ticks

# Font sizes (each call touches independent rc keys).
plt.rc('font', size=13)                      # default text size
plt.rc('axes', titlesize=18, labelsize=14)   # axes title / x and y labels
for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize=13)         # tick labels
plt.rc('legend', fontsize=13)                # legend entries

# Ten-step Spectral palette; the plotting cells pick individual entries by index.
colors = sns.color_palette("Spectral", 10)

<h2>Load the Data for Neighborhood Scores</h2>

In [None]:
# Census tract table whose GEOID10 column is renamed to 'geoid' so it can be
# joined against the score table below.
bos_tracts = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/Census_2010_Tracts.csv',
    index_col=0,
).rename(columns={'GEOID10': 'geoid'})

# Which prediction file to load: Predicted<to_use><alg>.csv
to_use = 'Untransformed'
alg = 'RF'

scores_all = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/07-Neighborhood Predictions/Predicted'+to_use+alg+'.csv',
    index_col=0,
)

# Keep only the scored tracts that also appear in the tract table.
scores_bos = scores_all.merge(bos_tracts, how='inner', on='geoid')

In [None]:
from scipy.stats import pearsonr

# Pearson correlation between observed and predicted 2010-2019 ascent,
# first for every tract and then for the tracts in scores_bos.
# pearsonr returns (coefficient, p-value); only the coefficient is kept.
actual_ascent_col = 'SES Ascent 2010-2019'
predicted_ascent_col = 'SES Ascent 2010-2019 (Predicted)'

corr_asc_all = pearsonr(scores_all[actual_ascent_col], scores_all[predicted_ascent_col])[0]
corr_asc_bos = pearsonr(scores_bos[actual_ascent_col], scores_bos[predicted_ascent_col])[0]

print('Correlation Ascent All:', corr_asc_all)
print('Correlation Ascent Boston:', corr_asc_bos)

In [None]:
# Pearson correlation between observed and predicted 2019 scores, for every
# tract and for the tracts in scores_bos. Relies on `pearsonr` having been
# imported by an earlier cell; only the coefficient (element 0) is kept.
actual_score_col = 'SES 2019'
predicted_score_col = 'SES 2019 (Predicted)'

corr_score_all = pearsonr(scores_all[actual_score_col], scores_all[predicted_score_col])[0]
corr_score_bos = pearsonr(scores_bos[actual_score_col], scores_bos[predicted_score_col])[0]

print('Correlation Scores All:', corr_score_all)
print('Correlation Scores Boston:', corr_score_bos)

<h2>Score Distributions in 2010, 2019, 2028</h2>

<h3>Density Line Graphs</h3>

In [None]:
# 3x2 grid of score densities: one row per year, one column per geography
# (all tracts on the left, the scores_bos subset on the right).
fig, ax = plt.subplots(3, 2, figsize=(15, 8), tight_layout=True)

# (year shown in the title, score column, line colour)
year_rows = [
    ('2010', 'SES 2010', colors[0]),
    ('2019', 'SES 2019', colors[7]),
    ('2028', 'SES 2028 (Predicted)', colors[9]),
]
# (region shown in the title, data frame)
region_cols = [
    ('Greater Boston', scores_all),
    ('Boston', scores_bos),
]

for row, (year, score_col, line_color) in enumerate(year_rows):
    for col, (region, frame) in enumerate(region_cols):
        panel = ax[row][col]
        # `shade=False` is intentionally not passed: the keyword is deprecated in
        # seaborn in favour of `fill`, and an unfilled curve is already the default.
        sns.kdeplot(frame[score_col], color=line_color, lw=2.5, ax=panel)
        panel.set_xlabel('Neighborhood Score')
        panel.set_title(f'Neighborhood Scores {year} ({region})', weight='bold', pad=15)

plt.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Score Distributions/dist_separatePlots.png",dpi=300)

In [None]:
# Two panels (all tracts / scores_bos subset), each overlaying the score
# densities for 2010, 2019 and the 2028 prediction.
fig, ax = plt.subplots(1, 2, figsize=(12, 5), tight_layout=True)

# (score column, legend label, line colour)
year_lines = [
    ('SES 2010', '2010', colors[0]),
    ('SES 2019', '2019', colors[7]),
    ('SES 2028 (Predicted)', '2028 (Predicted)', colors[9]),
]
# (region shown in the title, data frame, target axes)
region_panels = [
    ('Greater Boston', scores_all, ax[0]),
    ('Boston', scores_bos, ax[1]),
]

for region, frame, panel in region_panels:
    for score_col, year_label, line_color in year_lines:
        # `shade=False` is intentionally not passed: the keyword is deprecated in
        # seaborn in favour of `fill`, and an unfilled curve is already the default.
        sns.kdeplot(frame[score_col], color=line_color, lw=2.5, label=year_label, ax=panel)

    panel.legend(title='Year', title_fontsize='13', loc='upper right', fancybox=True, shadow=True)
    panel.set_xlabel('Neighborhood Score')
    panel.set_title(f'Neighborhood Scores Over Time ({region})', weight='bold', pad=15)
    # Same x-range on both panels so the two geographies are directly comparable.
    panel.set_xlim([-5, 10])

plt.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Score Distributions/dist_onePlot.png",dpi=300)

<h2>Neighborhood Percentile Ascent 2010-2019 and 2019-2028</h2>

In [None]:
# 2x2 grid of percentile-ascent densities: one row per period, one column per
# geography (all tracts on the left, the scores_bos subset on the right).
fig, ax = plt.subplots(2, 2, figsize=(15, 8), tight_layout=True)

# (period, line colour) -- the period string selects the data column AND is
# written into the title, so a panel can no longer be labelled with the wrong
# period (the top-right panel used to be titled 2019-2028 while showing 2010-2019).
period_rows = [
    ('2010-2019', colors[0]),
    ('2019-2028', colors[9]),
]
# (region shown in the title, data frame)
region_cols = [
    ('Greater Boston', scores_all),
    ('Boston', scores_bos),
]

for row, (period, line_color) in enumerate(period_rows):
    for col, (region, frame) in enumerate(region_cols):
        panel = ax[row][col]
        # `shade=False` is intentionally not passed: the keyword is deprecated in
        # seaborn in favour of `fill`, and an unfilled curve is already the default.
        sns.kdeplot(frame['SES Percentile Ascent ' + period], color=line_color, lw=2.5, ax=panel)
        panel.set_xlabel('Score Percentile Ascent')
        panel.set_title(f'Percentile Change in Score {period} ({region})', weight='bold', pad=15)

plt.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Score Distributions/pr_change_separatePlots.png",dpi=300)

In [None]:
# Two panels (all tracts / scores_bos subset), each overlaying the
# percentile-ascent densities of the two periods.
fig, ax = plt.subplots(1, 2, figsize=(15, 5), tight_layout=True)

# (period, line colour) -- the period doubles as the legend label.
period_lines = [
    ('2010-2019', colors[0]),
    ('2019-2028', colors[9]),
]
# (region shown in the title, data frame, target axes)
region_panels = [
    ('Greater Boston', scores_all, ax[0]),
    ('Boston', scores_bos, ax[1]),
]

for region, frame, panel in region_panels:
    for period, line_color in period_lines:
        # `shade=False` is intentionally not passed: the keyword is deprecated in
        # seaborn in favour of `fill`, and an unfilled curve is already the default.
        sns.kdeplot(frame['SES Percentile Ascent ' + period], color=line_color, lw=2.5, label=period, ax=panel)

    panel.legend(title='Year', title_fontsize='13', loc='upper right', fancybox=True, shadow=True)
    panel.set_xlabel('Score Percentile Ascent')
    panel.set_title(f'Neighborhood Score Percentile Ascent ({region})', weight='bold', pad=15)

plt.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Score Distributions/pr_change_onePlot.png",dpi=300)

<h2>Neighborhood Score Changes 2010-2019 Against 2019-2028</h2>

In [None]:
# Observed 2010-2019 ascent against predicted 2019-2028 ascent, with a fitted
# regression line: all tracts on the left, the scores_bos subset on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), tight_layout=True)

fig_gb, fig_bos = [
    sns.regplot(
        data=frame,
        x="SES Ascent 2010-2019",
        y="SES Ascent 2019-2028 (Predicted)",
        scatter_kws={"color": colors[9]},
        line_kws={"color": colors[0]},
        ax=panel,
    )
    for frame, panel in zip((scores_all, scores_bos), ax)
]

panel_titles = (
    (fig_gb, 'Changes in Neighborhood Ascent (Greater Boston)'),
    (fig_bos, 'Changes in Neighborhood Ascent (Boston)'),
)
for panel, title in panel_titles:
    panel.set_xlabel('Score Ascent 2010-2019')
    panel.set_ylabel('Score Ascent 2019-2028 (Predicted)')
    # Identical square limits on both panels.
    panel.set_xlim([-0.5, 3.5])
    panel.set_ylim([-0.5, 3.5])
    panel.set_title(title, weight='bold', pad=15)

fig.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Score Distributions/ascent_change.png", dpi=300)

<h2>Random Forest Prediction Accuracy</h2>

In [None]:
# Observed vs. predicted 2010-2019 ascent with a fitted regression line:
# all tracts on the left, the scores_bos subset on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), tight_layout=True)

fig_gb, fig_bos = [
    sns.regplot(
        data=frame,
        x="SES Ascent 2010-2019",
        y="SES Ascent 2010-2019 (Predicted)",
        scatter_kws={"color": colors[9]},
        line_kws={"color": colors[0]},
        ax=panel,
    )
    for frame, panel in zip((scores_all, scores_bos), ax)
]

panel_titles = (
    (fig_gb, 'Neighborhood Ascent Actual vs. Predicted (Greater Boston)'),
    (fig_bos, 'Neighborhood Ascent Actual vs. Predicted (Boston)'),
)
for panel, title in panel_titles:
    panel.set_xlabel('Score Ascent 2010-2019 (Actual)')
    panel.set_ylabel('Score Ascent 2010-2019 (Predicted)')
    # Identical square limits on both panels.
    panel.set_xlim([-0.5, 3.5])
    panel.set_ylim([-0.5, 3.5])
    panel.set_title(title, weight='bold', pad=15)

# The algorithm tag (`alg`) is part of the output file name.
fig.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Prediction Results/actual_vs_predicted_ascent"+alg+".png", dpi=300)

In [None]:
# Observed vs. predicted 2019 score with a fitted regression line:
# all tracts on the left, the scores_bos subset on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), tight_layout=True)

fig_gb, fig_bos = [
    sns.regplot(
        data=frame,
        x="SES 2019",
        y="SES 2019 (Predicted)",
        scatter_kws={"color": colors[9]},
        line_kws={"color": colors[0]},
        ax=panel,
    )
    for frame, panel in zip((scores_all, scores_bos), ax)
]

panel_titles = (
    (fig_gb, 'Neighborhood Score Actual vs. Predicted (Greater Boston)'),
    (fig_bos, 'Neighborhood Score Actual vs. Predicted (Boston)'),
)
for panel, title in panel_titles:
    panel.set_xlabel('Score 2019 (Actual)')
    panel.set_ylabel('Score 2019 (Predicted)')
    panel.set_title(title, weight='bold', pad=15)

# The algorithm tag (`alg`) is part of the output file name.
fig.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Prediction Results/actual_vs_predicted_score"+alg+".png", dpi=300)

<h2>Feature Importance</h2>

In [None]:
# Which importance file to load: FeatureImportance<to_use><alg>.csv
to_use = 'Untransformed'
alg = 'RF'

feature_imp = pd.read_csv('/Users/ritalaplante/Desktop/Thesis Data and Analytics/07-Neighborhood Predictions/FeatureImportance'+to_use+alg+'.csv', index_col=0)

# The description table names its key column 'Variable Name'; rename it so it
# can be joined to the importance table on 'feature'.
feature_desc = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/Variable Descriptions.csv'
).rename(columns={'Variable Name': 'feature'})

features = pd.merge(feature_imp, feature_desc, on='feature', how='inner')

# Display label for the house-price variable. The result is assigned back to
# the column: calling .replace(..., inplace=True) on the `features['feature']`
# selection is a chained in-place update, which raises a FutureWarning in
# pandas 2.x and leaves `features` untouched under Copy-on-Write.
# The label spelling was also corrected ("Occuped" -> "Occupied").
features['feature'] = features['feature'].replace({'House Prices': 'Value Owner Occupied Units'})

# Highest-importance rows, most important first.
top_4 = features.nlargest(4, 'importance')
top_5 = features.nlargest(5, 'importance')
top_10 = features.nlargest(10, 'importance')

top_5_feat = top_5['feature']
top_4_feat = top_4['feature']
top_10_feat = top_10['feature']

# Full-length columns of `features`, kept as module-level names.
importance = features['importance']
var_cat = features['Variable Category']

In [None]:
# Preview the first ten rows of the merged importance / description table.
features.head(n=10)

<h3>Top 5 Extremely Random Trees</h3>

In [None]:
# Importance of every variable by category, with the five most important
# variables highlighted and labelled.
# NOTE(review): this cell plots whatever `features` currently holds, while the
# title and file name say "Extremely Random Trees" -- presumably it is meant to
# be run after loading that model's importances; confirm before regenerating.
fig, ax = plt.subplots(figsize=(15, 6), tight_layout=True)

sns.scatterplot(data=features, x='Variable Category', y='importance', color='black', s=60, ax=ax)
sns.scatterplot(data=top_5, x='Variable Category', y='importance', color=colors[0], s=85, ax=ax)
# Set after both scatter calls so seaborn cannot overwrite the custom labels.
ax.set(xlabel='Variable Group', ylabel='Variable Importance')

# Label each highlighted point using the category / importance stored in its
# own row of `top_5`. (Indexing the full `features` columns by rank only hits
# the right row when `features` happens to be sorted by importance.)
# Labels sit just above the point, except rank 4 which is pushed below it.
label_offsets = {4: -0.004}
highlighted = zip(top_5['feature'], top_5['Variable Category'], top_5['importance'])
for rank, (feat, category, value) in enumerate(highlighted):
    offset = label_offsets.get(rank, 0.0027)
    ax.annotate(feat, xy=(category, value), xytext=(category, value + offset), ha='left')

ax.set_title('Variable Importance in Extremely Random Trees Regression', weight='bold', pad=15)
ax.set_ylim([0.0, 0.12])

plt.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Prediction Results/feat_importance_5_ETR.png",dpi=300)

<h3>Top 10 Features Extremely Random Trees</h3>

In [None]:
# Importance of every variable by category, with the ten most important
# variables highlighted and labelled.
# NOTE(review): this cell plots whatever `features` currently holds, while the
# title and file name say "Extremely Random Trees" -- presumably it is meant to
# be run after loading that model's importances; confirm before regenerating.
fig, ax = plt.subplots(figsize=(15, 6), tight_layout=True)

sns.scatterplot(data=features, x='Variable Category', y='importance', color='black', s=60, ax=ax)
sns.scatterplot(data=top_10, x='Variable Category', y='importance', color=colors[0], s=85, ax=ax)
# Set after both scatter calls so seaborn cannot overwrite the custom labels.
ax.set(xlabel='Variable Group', ylabel='Variable Importance')

# Label each highlighted point using the category / importance stored in its
# own row of `top_10`. (Indexing the full `features` columns by rank only hits
# the right row when `features` happens to be sorted by importance.)
# Labels sit just above the point, except ranks 4 and 8 which are pushed below.
label_offsets = {4: -0.004, 8: -0.002555}
highlighted = zip(top_10['feature'], top_10['Variable Category'], top_10['importance'])
for rank, (feat, category, value) in enumerate(highlighted):
    offset = label_offsets.get(rank, 0.0027)
    ax.annotate(feat, xy=(category, value), xytext=(category, value + offset), ha='left')

ax.set_title('Variable Importance in Extremely Random Trees Regression', weight='bold', pad=15)
ax.set_ylim([0.0, 0.12])

plt.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Prediction Results/feat_importance_10_ETR.png",dpi=300)

<h3>Top 5 Traditional Random Forests</h3>

In [None]:
# Importance of every variable by category, with the five most important
# variables highlighted and labelled (random forest figure).
fig, ax = plt.subplots(figsize=(15, 6), tight_layout=True)

sns.scatterplot(data=features, x='Variable Category', y='importance', color='black', s=60, ax=ax)
sns.scatterplot(data=top_5, x='Variable Category', y='importance', color=colors[0], s=85, ax=ax)
# Set after both scatter calls so seaborn cannot overwrite the custom labels.
ax.set(xlabel='Variable Group', ylabel='Variable Importance')

# Label each highlighted point using the category / importance stored in its
# own row of `top_5`. (Indexing the full `features` columns by rank only hits
# the right row when `features` happens to be sorted by importance.)
# Labels sit just above the point, except rank 4 which is pushed below it.
label_offsets = {4: -0.006}
highlighted = zip(top_5['feature'], top_5['Variable Category'], top_5['importance'])
for rank, (feat, category, value) in enumerate(highlighted):
    offset = label_offsets.get(rank, 0.0027)
    ax.annotate(feat, xy=(category, value), xytext=(category, value + offset), ha='left')

ax.set_title('Variable Importance in Random Forest Regression', weight='bold', pad=15)
ax.set_ylim([0.0, 0.16])

plt.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Prediction Results/feat_importance_5_RF.png",dpi=300)

<h3>Top 10 Traditional Random Forests</h3>

In [None]:
# Importance of every variable by category, with the ten most important
# variables highlighted and labelled (random forest figure).
fig, ax = plt.subplots(figsize=(15, 6), tight_layout=True)

sns.scatterplot(data=features, x='Variable Category', y='importance', color='black', s=60, ax=ax)
sns.scatterplot(data=top_10, x='Variable Category', y='importance', color=colors[0], s=85, ax=ax)
# Set after both scatter calls so seaborn cannot overwrite the custom labels.
ax.set(xlabel='Variable Group', ylabel='Variable Importance')

# Label each highlighted point using the category / importance stored in its
# own row of `top_10`. (Indexing the full `features` columns by rank only hits
# the right row when `features` happens to be sorted by importance.)
# Labels sit just above the point, except ranks 4 and 8 which are pushed below.
label_offsets = {4: -0.006, 8: -0.006}
highlighted = zip(top_10['feature'], top_10['Variable Category'], top_10['importance'])
for rank, (feat, category, value) in enumerate(highlighted):
    offset = label_offsets.get(rank, 0.0027)
    ax.annotate(feat, xy=(category, value), xytext=(category, value + offset), ha='left')

ax.set_title('Variable Importance in Random Forest Regression', weight='bold', pad=15)
ax.set_ylim([0.0, 0.16])

plt.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Prediction Results/feat_importance_10_RF.png",dpi=300)

<h2>Distributions of Predictor Data</h2>

In [None]:
# Nine hex colours passed as the `palette` of the box plots below.
color_list = [
    '#3682ba',
    '#fff1a8',
    '#97d5a4',
    '#ee6445',
    '#f4faad',
    '#5cb7aa',
    '#d0384e',
    '#d1ed9c',
    '#fa9b58',
]

In [None]:
# Cleaned predictor tables for the two years, indexed by their first column.
data_2010 = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/02-Cleaned Predictor Data/predictor2010.csv',
    index_col=0,
)
data_2019 = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/02-Cleaned Predictor Data/predictor2019.csv',
    index_col=0,
)

# Fresh copy of the variable descriptions with the original column names
# ('Variable Name', 'Variable Category') that the next cell filters on.
feature_desc = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/Variable Descriptions.csv'
)

In [None]:
def _category_subsets(category):
    """Return the 2010 and 2019 predictor columns belonging to one variable category.

    Looks up every 'Variable Name' in `feature_desc` whose 'Variable Category'
    equals `category`, then selects those columns from `data_2010` and
    `data_2019`. Names that are not columns of a table are silently skipped
    (that is how `DataFrame.filter(items=...)` behaves).

    Parameters
    ----------
    category : str
        A value of the 'Variable Category' column of `feature_desc`.

    Returns
    -------
    tuple of pandas.DataFrame
        (2010 subset, 2019 subset). `filter` returns new frames, so no explicit
        copy of the source tables is needed.
    """
    in_category = feature_desc['Variable Category'] == category
    variable_names = feature_desc.loc[in_category, 'Variable Name'].tolist()
    return data_2010.filter(items=variable_names), data_2019.filter(items=variable_names)


# One (2010, 2019) pair of column subsets per variable category.
race_data_2010, race_data_2019 = _category_subsets('Race')
age_data_2010, age_data_2019 = _category_subsets('Age')
ht_data_2010, ht_data_2019 = _category_subsets('Housing Tenure')
hw_data_2010, hw_data_2019 = _category_subsets('Hours Worked')
cit_data_2010, cit_data_2019 = _category_subsets('Citizenship')
ms_data_2010, ms_data_2019 = _category_subsets('Marital Status')
trans_data_2010, trans_data_2019 = _category_subsets('Transportation')
ba_data_2010, ba_data_2019 = _category_subsets('Building Age')
bs_data_2010, bs_data_2019 = _category_subsets('Building Structure')

<h3>Race Distribution</h3>

In [None]:
# Side-by-side box plots of the race columns: 2010 on the left, 2019 on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), tight_layout=True)

fig_2010, fig_2019 = [
    sns.boxplot(data=frame, palette=color_list, linewidth=2.5, ax=panel)
    for frame, panel in zip((race_data_2010, race_data_2019), ax)
]

panel_titles = (
    (fig_2010, 'Distribution of Race 2010'),
    (fig_2019, 'Distribution of Race 2019'),
)
for panel, title in panel_titles:
    panel.set(xlabel='Race', ylabel='Share of Population')
    panel.set_title(title, weight='bold', pad=15)

fig.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Predictor Variable Distributions/raceDist.png", dpi=300)

<h3>Age Distribution</h3>

In [None]:
# Side-by-side box plots of the age columns: 2010 on the left, 2019 on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), tight_layout=True)

fig_2010, fig_2019 = [
    sns.boxplot(data=frame, palette=color_list, linewidth=2.5, ax=panel)
    for frame, panel in zip((age_data_2010, age_data_2019), ax)
]

panel_titles = (
    (fig_2010, 'Distribution of Age 2010'),
    (fig_2019, 'Distribution of Age 2019'),
)
for panel, title in panel_titles:
    panel.set(xlabel='Age', ylabel='Share of Population')
    panel.set_title(title, weight='bold', pad=15)

# Only the 2019 panel gets an explicit upper y-limit.
fig_2019.set_ylim(top=1.045)

fig.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Predictor Variable Distributions/ageDist.png", dpi=300)

<h3>Housing Tenure Distribution</h3>

In [None]:
# Side-by-side box plots of the housing-tenure columns: 2010 left, 2019 right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), tight_layout=True)

fig_2010, fig_2019 = [
    sns.boxplot(data=frame, palette=color_list, linewidth=2.5, ax=panel)
    for frame, panel in zip((ht_data_2010, ht_data_2019), ax)
]

panel_titles = (
    (fig_2010, 'Distribution of Housing Tenure 2010'),
    (fig_2019, 'Distribution of Housing Tenure 2019'),
)
for panel, title in panel_titles:
    panel.set(xlabel='Housing Tenure', ylabel='Share of Population')
    panel.set_title(title, weight='bold', pad=15)

fig.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Predictor Variable Distributions/tenureDist.png", dpi=300)

<h3>Distribution of Hours Worked Data</h3>

In [None]:
# Side-by-side box plots of the hours-worked columns: 2010 left, 2019 right.
fig, ax = plt.subplots(1, 2, figsize=(15, 5), tight_layout=True)

fig_2010 = sns.boxplot(data=hw_data_2010, palette=color_list, linewidth=2.5, ax=ax[0])
fig_2010.set(xlabel='Average Hours Worked', ylabel='Hours Worked Per Week')

fig_2019 = sns.boxplot(data=hw_data_2019, palette=color_list, linewidth=2.5, ax=ax[1])
fig_2019.set(xlabel='Average Hours Worked', ylabel='Hours Worked Per Week')

# Both titles now read "Average Hours Worked"; the 2019 one used to say
# "Averaged", which did not match the 2010 title or the axis label.
ax[0].set_title('Distribution of Average Hours Worked 2010', weight='bold', pad=15)
ax[1].set_title('Distribution of Average Hours Worked 2019', weight='bold', pad=15)

# Only the 2019 panel gets an explicit upper y-limit.
ax[1].set_ylim(top=54.5)

plt.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Predictor Variable Distributions/hwDist.png",dpi=300)

<h3>Citizenship Distribution</h3>

In [None]:
# Side-by-side box plots of the citizenship columns: 2010 left, 2019 right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), tight_layout=True)

fig_2010, fig_2019 = [
    sns.boxplot(data=frame, palette=color_list, linewidth=2.5, ax=panel)
    for frame, panel in zip((cit_data_2010, cit_data_2019), ax)
]

panel_titles = (
    (fig_2010, 'Distribution of Citizenship Status 2010'),
    (fig_2019, 'Distribution of Citizenship Status 2019'),
)
for panel, title in panel_titles:
    panel.set(xlabel='Citizenship', ylabel='Share of Population')
    panel.set_title(title, weight='bold', pad=15)

fig.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Predictor Variable Distributions/citizenshipDist.png", dpi=300)

<h3>Marital Status Distribution</h3>

In [None]:
# Side-by-side box plots of the marital-status columns: 2010 left, 2019 right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), tight_layout=True)

fig_2010, fig_2019 = [
    sns.boxplot(data=frame, palette=color_list, linewidth=2.5, ax=panel)
    for frame, panel in zip((ms_data_2010, ms_data_2019), ax)
]

panel_titles = (
    (fig_2010, 'Distribution of Marital Status 2010'),
    (fig_2019, 'Distribution of Marital Status 2019'),
)
for panel, title in panel_titles:
    panel.set(xlabel='Marital Status', ylabel='Share of Population')
    panel.set_title(title, weight='bold', pad=15)

fig.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Predictor Variable Distributions/marriageDist.png", dpi=300)

<h3>Transportation Distribution</h3>

In [None]:
# Side-by-side box plots of the transportation columns: 2010 left, 2019 right.
# A wider canvas than the other predictor figures is used here.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(25, 8), tight_layout=True)

fig_2010, fig_2019 = [
    sns.boxplot(data=frame, palette=color_list, linewidth=2.5, ax=panel)
    for frame, panel in zip((trans_data_2010, trans_data_2019), ax)
]

panel_titles = (
    (fig_2010, 'Distribution of Mode of Transportation to Work 2010'),
    (fig_2019, 'Distribution of Mode of Transportation to Work 2019'),
)
for panel, title in panel_titles:
    panel.set(xlabel='Transportation Type', ylabel='Share of Population')
    panel.set_title(title, weight='bold', pad=15)

fig.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Predictor Variable Distributions/transportationDist.png", dpi=300)

<h3>Building Age Distribution</h3>

In [None]:
# Side-by-side box plots of the building-age columns: 2010 left, 2019 right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 8), tight_layout=True)

fig_2010, fig_2019 = [
    sns.boxplot(data=frame, palette=color_list, linewidth=2.5, ax=panel)
    for frame, panel in zip((ba_data_2010, ba_data_2019), ax)
]

panel_titles = (
    (fig_2010, 'Distribution of Untransformed Building Age 2010'),
    (fig_2019, 'Distribution of Untransformed Building Age 2019'),
)
for panel, title in panel_titles:
    panel.set(xlabel='Building Age', ylabel='Share of Buildings')
    panel.set_title(title, weight='bold', pad=15)

fig.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Predictor Variable Distributions/baDist.png", dpi=300)

<h3>Building Structure Distribution</h3>

In [None]:
# Side-by-side box plots of the building-structure columns: 2010 left, 2019 right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 8), tight_layout=True)

fig_2010, fig_2019 = [
    sns.boxplot(data=frame, palette=color_list, linewidth=2.5, ax=panel)
    for frame, panel in zip((bs_data_2010, bs_data_2019), ax)
]

panel_titles = (
    (fig_2010, 'Distribution of Untransformed Building Structure 2010'),
    (fig_2019, 'Distribution of Untransformed Building Structure 2019'),
)
for panel, title in panel_titles:
    panel.set(xlabel='Structure Type', ylabel='Share of Buildings')
    panel.set_title(title, weight='bold', pad=15)

fig.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Predictor Variable Distributions/bsDist.png", dpi=300)

<h2>Score Variable Distributions</h2>

In [None]:
# Cleaned scoring tables, one pair (2010, 2019) per score component.
# NOTE(review): every 2010 file is indexed by its first column and every 2019
# file by its second -- presumably the 2019 exports carry an extra leading
# column; confirm against the files.

# Contract rent
cr_2010 = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/03-Cleaned Scoring Data/cr2010.csv',
    index_col=0,
)
cr_2019 = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/03-Cleaned Scoring Data/cr2019.csv',
    index_col=1,
)

# Education
edu_2010 = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/03-Cleaned Scoring Data/education2010.csv',
    index_col=0,
)
edu_2019 = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/03-Cleaned Scoring Data/education2019.csv',
    index_col=1,
)

# Income
inc_2010 = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/03-Cleaned Scoring Data/income2010.csv',
    index_col=0,
)
inc_2019 = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/03-Cleaned Scoring Data/income2019.csv',
    index_col=1,
)

# Occupation
occ_2010 = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/03-Cleaned Scoring Data/occupation2010.csv',
    index_col=0,
)
occ_2019 = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/03-Cleaned Scoring Data/occupation2019.csv',
    index_col=1,
)

# Value of owner-occupied units
value_2010 = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/03-Cleaned Scoring Data/value2010.csv',
    index_col=0,
)
value_2019 = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/03-Cleaned Scoring Data/value2019.csv',
    index_col=1,
)

In [None]:
# Side-by-side box plots of median contract rent: 2010 left, 2019 right.
fig, ax = plt.subplots(1, 2, figsize=(15, 5), tight_layout=True)

fig_2010 = sns.boxplot(data=cr_2010, palette=color_list, linewidth=2.5, ax=ax[0])
fig_2010.set(xlabel='Contract Rent', ylabel='Median Contract Rent')

fig_2019 = sns.boxplot(data=cr_2019, palette=color_list, linewidth=2.5, ax=ax[1])
# The 2019 panel shows the same quantity as the 2010 panel, so it carries the
# same y-label (it used to read 'Share of Population').
fig_2019.set(xlabel='Contract Rent', ylabel='Median Contract Rent')

ax[0].set_title('Distribution of Median Contract Rent 2010', weight='bold', pad=15)
ax[1].set_title('Distribution of Median Contract Rent 2019', weight='bold', pad=15)

# Shared y-range so the two years are directly comparable.
ax[0].set_ylim([150, 3500])
ax[1].set_ylim([150, 3500])

plt.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Score Variable Distributions/crDist.png",dpi=300)

In [None]:
# Side-by-side box plots of median household income: 2010 left, 2019 right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), tight_layout=True)

fig_2010, fig_2019 = [
    sns.boxplot(data=frame, palette=color_list, linewidth=2.5, ax=panel)
    for frame, panel in zip((inc_2010, inc_2019), ax)
]

panel_titles = (
    (fig_2010, 'Distribution of Median Household Income 2010'),
    (fig_2019, 'Distribution of Median Household Income 2019'),
)
for panel, title in panel_titles:
    panel.set(xlabel='Household Income', ylabel='Median Household Income')
    panel.set_title(title, weight='bold', pad=15)
    # Shared y-range so the two years are directly comparable.
    panel.set_ylim([0, 255000])

fig.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Score Variable Distributions/hhiDist.png", dpi=300)

In [None]:
# Side-by-side box plots of median owner-occupied unit value: 2010 left, 2019 right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(17, 5), tight_layout=True)

fig_2010, fig_2019 = [
    sns.boxplot(data=frame, palette=color_list, linewidth=2.5, ax=panel)
    for frame, panel in zip((value_2010, value_2019), ax)
]

panel_titles = (
    (fig_2010, 'Distribution of Median Value of Owner Occupied Units 2010'),
    (fig_2019, 'Distribution of Median Value of Owner Occupied Units 2019'),
)
for panel, title in panel_titles:
    panel.set(xlabel='Value', ylabel='Median Value (In Millions of Dollars)')
    panel.set_title(title, weight='bold', pad=15)
    # Shared y-range so the two years are directly comparable.
    panel.set_ylim([10000, 1850000])

fig.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Score Variable Distributions/valueDist.png", dpi=300)

In [None]:
def process_edu_data(df):
    """Compute, per row, the share of people counted in the three highest degree columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'doctorate', 'professional', 'masters' and
        'total (education)'. Any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One column, 'he_pct': the row-wise sum of the three degree columns
        divided by 'total (education)', on the same index as `df`.
    """
    advanced_degrees = ['doctorate', 'professional', 'masters']

    degree_holders = df[advanced_degrees].sum(axis=1)
    everyone = df['total (education)']

    shares = pd.DataFrame()
    shares['he_pct'] = degree_holders / everyone
    return shares

# Share of the population counted as highly educated, per year.
edu10 = process_edu_data(edu_2010)
edu19 = process_edu_data(edu_2019)

# Side-by-side box plots: 2010 left, 2019 right.
fig, ax = plt.subplots(1, 2, figsize=(15, 5), tight_layout=True)

fig_2010 = sns.boxplot(data=edu10, palette=color_list, linewidth=2.5, ax=ax[0])
# Label spelling corrected ("Higly" -> "Highly") to match the 2019 panel.
fig_2010.set(xlabel='Highly Educated Population', ylabel='Share of Population')

fig_2019 = sns.boxplot(data=edu19, palette=color_list, linewidth=2.5, ax=ax[1])
fig_2019.set(xlabel='Highly Educated Population', ylabel='Share of Population')

ax[0].set_title('Distribution of Education 2010', weight='bold', pad=15)
# Title spelling corrected ("Eduation" -> "Education").
ax[1].set_title('Distribution of Education 2019', weight='bold', pad=15)

# Only the 2019 panel gets an explicit upper y-limit.
ax[1].set_ylim(top=0.8)

plt.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Score Variable Distributions/eduDist.png",dpi=300)

In [None]:
def process_occ_data(df):
    """Compute, per row, the share of workers counted in the 'top_occ' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'top_occ' and 'total (occupation)'. Any other
        columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One column, 'kw_pct': 'top_occ' divided by 'total (occupation)', on the
        same index as `df`.
    """
    top_occupation_workers = df['top_occ']
    all_workers = df['total (occupation)']

    shares = pd.DataFrame()
    shares['kw_pct'] = top_occupation_workers / all_workers
    return shares

# Share of workers in top occupations, per year.
occ10, occ19 = process_occ_data(occ_2010), process_occ_data(occ_2019)

# Side-by-side box plots: 2010 left, 2019 right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), tight_layout=True)

fig_2010, fig_2019 = [
    sns.boxplot(data=frame, palette=color_list, linewidth=2.5, ax=panel)
    for frame, panel in zip((occ10, occ19), ax)
]

panel_titles = (
    (fig_2010, 'Distribution of Occupation 2010'),
    (fig_2019, 'Distribution of Occupation 2019'),
)
for panel, title in panel_titles:
    panel.set(xlabel='Workers in Top Occupations', ylabel='Share of Population')
    panel.set_title(title, weight='bold', pad=15)

# Only the 2010 panel gets an explicit upper y-limit.
fig_2010.set_ylim(top=1)

fig.savefig("/Users/ritalaplante/Desktop/Thesis Data and Analytics/08-Plot Outputs/Score Variable Distributions/occDist.png", dpi=300)

In [None]:
# Scaled 2010 predictors plus the 2010 score, joined on 'geoid'.
df = pd.read_csv('/Users/ritalaplante/Desktop/Thesis Data and Analytics/05-Transformed and Scaled Data/TransformedAndScaled2010Untransformed.csv', index_col = 0)
scores = pd.read_csv('/Users/ritalaplante/Desktop/Thesis Data and Analytics/04-Neighborhood Scores/scoresUntransformed.csv', index_col = 0)
score = scores['SES_10']

df_full = pd.merge(score, df, on='geoid', how='inner')

plt.figure(figsize=(30,20), tight_layout=True)

# Compute the pairwise correlation matrix once and reuse it for both the mask
# and the heatmap (it used to be computed twice).
corr = df_full.corr()

# Hide the upper triangle, diagonal included, so each pair is drawn only once.
mask = np.triu(np.ones_like(corr, dtype=bool))
heatmap = sns.heatmap(corr, mask=mask, vmin=-1, vmax=1)

plt.show()

<h2>Creating Additional Data Tables for Mapping</h2>

In [None]:
# Re-read the cleaned predictor tables so this cell does not depend on the
# earlier distribution cells.
data_2010 = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/02-Cleaned Predictor Data/predictor2010.csv',
    index_col=0,
)
data_2019 = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/02-Cleaned Predictor Data/predictor2019.csv',
    index_col=0,
)

# --- Race: white / asian shares per year, plus non-white = 1 - white ---
race_pop_2010 = (
    data_2010
    .filter(items=['white', 'asian'])
    .assign(nonWhite2010=lambda shares: 1 - shares['white'])
    .rename(columns={"white": "white2010", "asian": "asian2010"})
)
race_pop_2019 = (
    data_2019
    .filter(items=['white', 'asian'])
    .assign(nonWhite2019=lambda shares: 1 - shares['white'])
    .rename(columns={"white": "white2019", "asian": "asian2019"})
)

# Tracts present in both years, with 2019-minus-2010 change columns.
race_full = (
    race_pop_2010
    .merge(race_pop_2019, how='inner', on='geoid')
    .assign(
        asianChange=lambda both: both['asian2019'] - both['asian2010'],
        nonwhiteChange=lambda both: both['nonWhite2019'] - both['nonWhite2010'],
        whiteChange=lambda both: both['white2019'] - both['white2010'],
    )
)

# --- Age: the 'children' column per year ---
age_pop_2010 = data_2010.filter(items=['children']).rename(columns={"children": "children2010"})
age_pop_2019 = data_2019.filter(items=['children']).rename(columns={"children": "children2019"})

age_full = (
    age_pop_2010
    .merge(age_pop_2019, how='inner', on='geoid')
    .assign(ageChange=lambda both: both['children2019'] - both['children2010'])
)

In [None]:
# Write both change tables to disk, keeping the index as the first CSV column.
export_targets = {
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/11-Additional Demographic Data/raceChanges.csv': race_full,
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/11-Additional Demographic Data/ageChanges.csv': age_full,
}
for csv_path, table in export_targets.items():
    table.to_csv(csv_path, index=True)