In [1]:
from __future__ import print_function

# Jupyter display
from IPython.display import display

# json
import json

# widgets
import ipywidgets as widgets
import bqplot as bq
import ipyleaflet as ll

# numerics
import pandas as pd
import numpy as np

# colormap
import matplotlib as mpl
import matplotlib.cm
import matplotlib.colors

def n_colors(n, colormap=mpl.cm.Blues):
    data = np.linspace(0.0,1.0,n)
    c = [mpl.colors.rgb2hex(d[0:3]) for d in colormap(data)]
    return c

def data_to_colors(data, colormap=mpl.cm.plasma):
    c = [mpl.colors.rgb2hex(d[0:3]) for d in colormap(mpl.colors.Normalize()(data))]
    return c

# Correlation Matrix for the Scores

In [2]:
data = pd.read_csv('./../analysis_data/other_race_corrected_merged_data.csv')
data = data.ix[:, 2:]

corr_data = data.corr()

In [3]:
y_sc, col_sc = bq.OrdinalScale(reverse=True), bq.ColorScale(scheme='Reds')
ax_c = bq.ColorAxis(scale=col_sc, label='Correlations')
grid_map = bq.GridHeatMap(row=corr_data.columns.tolist(), color=corr_data, scales={'row': y_sc, 'color': col_sc})
ax_y = bq.Axis(scale=y_sc, orientation='vertical', side='right')

fig = bq.Figure(marks=[grid_map], axes=[ax_y, ax_c], padding_y=0.0, 
             fig_margin={'bottom': 60, 'left': 5, 'right': 110, 'top': 60})
display(fig)

# Satisfaction Score by ZIP Code

In [4]:
survey_map = ll.Map(center=[37.7749, -122.34580993652344], zoom=12, layout=widgets.Layout(height='450px'))

survey_data = pd.read_csv('./../analysis_data/scores_by_zipcodes.csv', header=None)
survey_colors = data_to_colors(survey_data.ix[:, 1], colormap=mpl.cm.Greens)
zips = [str(int(i)) for i in survey_data.ix[:, 0]]
cols = {}

for i in range(len(zips)):
    cols[zips[i]] = survey_colors[i]

# Survey layer
with open('./../mapdata/sf_zipcodes.geojson') as f:
    data = json.load(f)

for feature in data['features']:
    feature['properties']['style'] = {
        'color': cols[feature['id']],
        'weight': 1,
        'fillColor': cols[feature['id']],
        'fillOpacity': 0.65,
    }

survey_layer = ll.GeoJSON(data=data, hover_style={'fillColor': 'red'})
survey_map += survey_layer

with open('./../mapdata/bayarea_nosf_zipcodes.geojson') as f:
    bay_data = json.load(f)
    
# Grey layer
grey_colors = n_colors(len(bay_data['features']), colormap=mpl.cm.Greys)

for feature, color in zip(bay_data['features'], grey_colors):
    feature['properties']['style'] = {
        'color': 'grey',
        'weight': 1,
        'fillColor': 'grey',
        'fillOpacity': 0.2
    }

grey_layer = ll.GeoJSON(data=bay_data, hover_style={'fillColor': 'red'})

survey_map.add_layer(grey_layer)

survey_map

# Neighborhood features

In [5]:
feature_map = ll.Map(center=[37.7749, -122.34580993652344], zoom=12, layout=widgets.Layout(height='450px'))
tract_data = pd.read_csv('./../data/CensusData/sfo data/tract_data_normalized.csv')
colors = data_to_colors(tract_data['pct_bachelors'], colormap=mpl.cm.Reds)
tracts = [str(int(i)) for i in tract_data.ix[:, 0].values]

cols = {}
for i in range(len(tracts)):
    cols[tracts[i][4:]] = colors[i]

with open('./../mapdata/sf_tracts.geojson') as f:
    data = json.load(f)
for feature in data['features']:
    try:
        style_col = cols[feature['properties']['tractce10']]
    except KeyError:
        style_col = 'Grey'
    feature['properties']['style'] = {
        'color': style_col,
        'weight': 1,
        'fillColor': style_col,
        'fillOpacity': 0.75,
    }

feature_layer = ll.GeoJSON(data=data)
feature_map += feature_layer

data_columns = [
      'pct_bachelors',
      'labor_part_rate',
      'pct_welfare',
      'low_pov_idx',
      'labor_idx',
      'env_health_idx',
      'hispanic',
      'white',
      'black',
      'american_indian',
      'asian',
      'pac_islander',
      'other_races',
      'two_races',
      'housing_cost',
      'transportation_cost',
      'pub_school_score',
      'pr_school_score',
      'rest_score',
      'rest_proximity']

dpdown = widgets.Dropdown(options=data_columns, value='pct_bachelors')

def update_dpdown(change):
    value = change['new']
    colors = data_to_colors(tract_data[value], colormap=mpl.cm.Reds)
    cols = {}
    for i in range(len(tracts)):
        cols[tracts[i][4:]] = colors[i]
    for feature in data['features']:
        try:
            style_col = cols[feature['properties']['tractce10']]
        except KeyError:
            style_col = 'grey'
        feature['properties']['style'] = {
            'color': style_col,
            'weight': 1,
            'fillColor': style_col,
            'fillOpacity': 0.75,
        }
        
    feature_layer = ll.GeoJSON(data=data)
    
    feature_map.layers = [feature_map.layers[0], feature_layer]
    
dpdown.observe(update_dpdown, names=['value'])

feature_map.layout.align_self = 'stretch'
display(widgets.VBox([feature_map, dpdown]))

# From the Features to the Satisfaction Index

In [6]:
ordinal_features_scale = bq.OrdinalScale()
values_features_scale = bq.LinearScale()
value_axis = bq.Axis(scale=values_features_scale, orientation='vertical')

bars = bq.Bars(x=data_columns, y=np.zeros(len(data_columns)), 
               scales={
        'x': ordinal_features_scale,
        'y': values_features_scale
    })

indication = bq.Label(x=0.9, y=0.5, text='Hover On Map', font_size='50px', color='gray')

ui_map = ll.Map(center=[37.7449, -122.42580993652344], zoom=11, layout=widgets.Layout(width='450px', height='450px'))
ui_map.add_layer(survey_layer)
ui_map.add_layer(grey_layer)

predictors = pd.read_csv('./../analysis_data/predictors.csv').set_index('zip code')

def scores(zip_code):
    if zip_code in predictors.index:
        dic = predictors.ix[zip_code].to_dict()
        return [dic[v] for v in data_columns]
    else:
        return np.zeros(len(data_columns))

def hover_handler(event=None, id=None, properties=None):
    figure.marks = [bars]
    bars.y = scores(int(id))

survey_layer.on_hover(hover_handler)

figure = bq.Figure(axes=[value_axis], marks=[bars, indication],
                   title='Predictors',
                   animation_duration=500, min_width=300, min_height=500)


widgets.HBox([ui_map, figure])

#### Bar Chart with reduced data

In [7]:
ordinal_features_scale = bq.OrdinalScale()
values_features_scale = bq.LinearScale()
ord_axis = bq.Axis(scale=ordinal_features_scale)
value_axis = bq.Axis(scale=values_features_scale, orientation='vertical')

predictors = pd.read_csv('./../analysis_data/predictors.csv').set_index('zip code')
imp_predictors = predictors.loc[:, ('env_health_idx', 'labor_idx', 'housing_cost', 'transportation_cost', 'crime_index',
                                       'pub_school_score', 'rest_proximity')]
imp_predictors.columns=['Environment', 'Labor', 'Housing', 'Transport', 'Crime', 'Schools', 'Resta']
columns_reduced = imp_predictors.columns.values
mean_scores = imp_predictors.mean().values

bars = bq.Bars(x=columns_reduced, y=np.ones(len(columns_reduced)), 
               scales={
        'x': ordinal_features_scale,
        'y': values_features_scale
    }, base=1.0)


indication = bq.Label(x=0.9, y=0.5, text='Hover On Map', font_size='50px', color='gray')

ui_map = ll.Map(center=[37.7449, -122.42580993652344], zoom=11, layout=widgets.Layout(width='450px', height='450px'))
ui_map.add_layer(survey_layer)
ui_map.add_layer(grey_layer)


def scores(zip_code):
    if zip_code in imp_predictors.index:
        dic = imp_predictors.ix[zip_code].to_dict()
        return [dic[v] for v in columns_reduced]
    else:
        return np.ones(len(columns_reduced))

def hover_handler(event=None, id=None, properties=None):
    figure.marks = [bars]
    bars.y = scores(int(id)) / mean_scores

survey_layer.on_hover(hover_handler)

figure = bq.Figure(axes=[value_axis, ord_axis], marks=[bars, indication],
                   title='Predictors',
                   animation_duration=500, min_width=300, min_height=500)

widgets.HBox([ui_map, figure])

#### Importances

In [8]:
scores_data = survey_data.set_index(0)
joint_df = pd.concat([imp_predictors, scores_data], axis=1).dropna()
preds = joint_df.iloc[:, :-1].values
response = joint_df.iloc[:, -1].values

In [9]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [10]:
rf = RandomForestRegressor(max_features=1, min_samples_leaf=3, n_estimators=100, oob_score=True,
                           n_jobs=-1)

In [11]:
num_iter = 10
imps = np.zeros(preds.shape[1])
in_sample_scores = 0.
out_sample_scores = 0.

for it in range(num_iter):
    rf.fit(preds, response)
    imps += rf.feature_importances_
    in_sample_scores += rf.score(preds, response)
    out_sample_scores += rf.oob_score_

In [12]:
from bqplot import pyplot as pl

In [13]:
pl.figure(title='Importances of various factors for predicting the Satisfaction of a Neighborhood')
pl.bar(imp_predictors.columns.values.astype(str), imps)
pl.show()

### Race Correlated Data

### Feature Importances:

<b>

* We select a list of features like, Job Prospects, Crime Index, Environment Score, Housing and Transportation prices etc., 
which can be affected by the government. 

* We aim to find the features which are most important for predicting the satisfaction scores across San Francisco. We do this
by using `Random Forests`.

* Random Forests fit a bunch of trees to bootstrapped versions of the sample data and a better fit is obtained by making 
successive trees independent of each other. This is done by randomly selecting a subset of features at each node of the 
tree.

* To compute the `Feature Importances`, the reduction of the error due to each of the features is calculated and the importance
of each feature is inferred.
</b>

In [55]:
merged_data = pd.read_csv('./../analysis_data/other_race_corrected_merged_data.csv')
merged_data = merged_data.iloc[:, 1:]
merged_data = merged_data.set_index('zipcode')

corr_predictors = merged_data.iloc[:, :-1]
corr_predictors = corr_predictors.loc[:, ('env_health_idx', 'labor_idx', 'housing_cost', 'transportation_cost', 'crime_index',
                                       'pub_school_score', 'rest_proximity')].values

corr_response = merged_data.iloc[:, -1].values

In [82]:
rf = RandomForestRegressor(max_features=1, min_samples_leaf=3, n_estimators=100, oob_score=True,
                           n_jobs=-1)

In [83]:
num_iter = 10
imps = np.zeros(corr_predictors.shape[1])
in_sample_scores = 0.
out_sample_scores = 0.

for it in range(num_iter):
    rf.fit(corr_predictors, corr_response)
    imps += rf.feature_importances_
    in_sample_scores += rf.score(corr_predictors, corr_response)
    out_sample_scores += rf.oob_score_

In [84]:
print(in_sample_scores)
print(out_sample_scores)

4.4744944966
-0.123672233435


In [85]:
pl.figure(title='Importances of various factors for predicting the Satisfaction of a Neighborhood')
pl.bar(imp_predictors.columns.values.astype(str), imps)
pl.show()

### If we had two more hours:
<b>
 * Add analysis of where the government can get maximum increase in the satisfaction for a given amount of money spent, by analyzing how the satisfaction changes for each zip code.
 
* Improve the estimation of satisfaction index by having a weighted average across the different categories to measure well being. 
* Extend the analysis to other areas by gathering a measure of satisfaction from the residents.
</b>

In [None]:
### Conclusions

* 