In [33]:
from __future__ import print_function

# Jupyter display
from IPython.display import display

# json
import json

# widgets
import ipywidgets as widgets
import bqplot as bq
import ipyleaflet as ll

# numerics
import pandas as pd
import numpy as np
from sklearn import linear_model

# colormap
import matplotlib as mpl
import matplotlib.cm
import matplotlib.colors

def n_colors(n, colormap=mpl.cm.Blues):
    """Return ``n`` hex color strings sampled evenly across ``colormap``.

    Parameters
    ----------
    n : int
        Number of colors to produce.
    colormap : callable
        Maps an array of positions in [0, 1] to rows of color components;
        defaults to matplotlib's ``Blues``.

    Returns
    -------
    list of str
        One hex string per sample, built from the first three components
        of each colormap row.
    """
    positions = np.linspace(0.0, 1.0, n)
    rgba_rows = colormap(positions)
    # Only the first three components of each row are passed to rgb2hex.
    return [mpl.colors.rgb2hex(rgba[:3]) for rgba in rgba_rows]

def data_to_colors(data, colormap=mpl.cm.plasma):
    """Map each value in ``data`` to a hex color string via ``colormap``.

    The values are first passed through a freshly constructed
    ``mpl.colors.Normalize()`` (no explicit limits given), then through
    ``colormap``.

    Parameters
    ----------
    data : array-like
        Values to colorize.
    colormap : callable
        Maps normalized values to rows of color components; defaults to
        matplotlib's ``plasma``.

    Returns
    -------
    list of str
        One hex string per input value.
    """
    scaled = mpl.colors.Normalize()(data)
    hex_codes = []
    for rgba in colormap(scaled):
        # Only the first three components of each row are passed to rgb2hex.
        hex_codes.append(mpl.colors.rgb2hex(rgba[:3]))
    return hex_codes

In [34]:
# Load the merged analysis table; the path is relative to the notebook's working directory.
data = pd.read_csv('./../analysis_data/merged_data.csv')

In [35]:
# The eight race/ethnicity columns that serve as regressors for the score model.
race_data = data[[
    'hispanic',
    'white',
    'black',
    'american_indian',
    'asian',
    'pac_islander',
    'other_races',
    'two_races',
]]

In [36]:
# Scale the race columns by 1/100 (presumably percentages -> fractions; confirm units).
# NOTE: this rebinds race_data from itself, so re-running the cell divides by 100 again.
race_data = race_data / 100.

In [37]:
# Regression target: the 'scores' column of the merged table.
result = data['scores']

In [38]:
# Linear regression without an intercept term.
# NOTE(review): presumably the race shares sum to ~1 per row, which would make an
# intercept collinear with the regressors -- confirm this is the reason.
clf = linear_model.LinearRegression(fit_intercept=False)

In [39]:
# Fit scores against the eight race columns; both arrays come from `data`, so rows line up.
clf.fit(race_data.values, result.values)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [40]:
# Fitted coefficients, one per race column, in race_data's column order.
clf.coef_

array([  8.38530429,   7.07632008,   6.57851644, -10.04788056,
        10.01388699, -11.73590829, -13.13261219,  23.85508512])

In [41]:
# Stays 0.0 because the model was built with fit_intercept=False.
clf.intercept_

0.0

In [42]:
# Residuals of the race-only model: observed score minus the score predicted from the race columns.
race_corrected_scores = result.values - clf.predict(race_data.values)

In [43]:
# Display the residuals (one value per row of `data`).
race_corrected_scores

array([-0.15206205, -0.69064013, -0.18027085, -0.01330223,  0.85258752,
       -0.39227153,  0.83194795,  0.11443674, -0.15922666,  0.09216956,
        0.06545359,  0.26315672,  0.28330485, -0.78189684, -0.41183177,
       -0.22440185, -0.11750368,  0.18533339, -0.17559896,  1.12529616,
       -0.73934585,  0.1966144 ,  0.77054475, -0.23835278, -0.50595404])

In [44]:
# Work on a deep copy so the original `data` frame keeps its raw scores.
race_corrected_data = data.copy(deep=True)

In [45]:
# Give the 'Unnamed: 0' column a meaningful name (modifies race_corrected_data in place).
# NOTE(review): presumably this column holds zipcodes that were saved as an unnamed index -- confirm.
race_corrected_data.rename(columns={'Unnamed: 0': 'zipcode'}, inplace=True)

In [46]:
# Replace the raw scores with the race-model residuals.
race_corrected_data['scores'] = race_corrected_scores

In [47]:
# Persist the corrected table. to_csv writes the row index by default, so the file
# gets an extra unnamed leading column in addition to 'zipcode'.
race_corrected_data.to_csv('./../analysis_data/race_corrected_merged_data.csv')

In [48]:
# Correlation of every column with the residual scores. 'scores' is the last column,
# so the final column of the correlation matrix is selected by position.
# .iloc replaces the removed .ix indexer; with string column labels .ix[:, -1] was
# already positional, so the result is unchanged.
race_corrected_data.corr().iloc[:, -1]

zipcode               -0.051254
pct_bachelors         -0.029507
labor_part_rate        0.183081
pct_welfare            0.309800
low_pov_idx           -0.079242
labor_idx             -0.048025
env_health_idx        -0.029866
hispanic               0.000236
white                  0.000383
black                  0.000134
american_indian        0.000118
asian                  0.000279
pac_islander           0.000200
other_races            0.000171
two_races              0.000394
housing_cost          -0.398101
transportation_cost   -0.074752
pub_school_score       0.095970
pr_school_score       -0.137316
rest_score            -0.019227
rest_proximity         0.125377
crime_index            0.017814
scores                 1.000000
Name: scores, dtype: float64

## TEST: white vs. all other races combined

In [49]:
# Every race column except 'white'; the white column is kept in its own frame.
other_race_data = data[[
    'hispanic',
    'black',
    'american_indian',
    'asian',
    'pac_islander',
    'other_races',
    'two_races',
]]
# Double brackets keep this a one-column DataFrame rather than a Series.
white_data = data[['white']]

In [50]:
# Scale both frames by 1/100 (presumably percentages -> fractions; confirm units).
# NOTE: each name is rebound from itself, so re-running the cell divides by 100 again.
other_race_data = other_race_data / 100.
white_data = white_data / 100.

In [51]:
# Row-wise total of the seven non-white race columns.
# NOTE: the variable shares its name with the 'other_races' column but holds the sum of all seven.
other_races = other_race_data.sum(axis=1)

In [52]:
clf2 = linear_model.LinearRegression(fit_intercept=False)

In [53]:
clf.fit(np.array([other_races.values, white_data.values.flatten()]).T, result.values)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [54]:
other_race_corrected_scores = result.values - \
            clf.predict(np.array([other_races.values, white_data.values.flatten()]).T)

In [55]:
# Display the residuals of the two-regressor model (one value per row of `data`).
other_race_corrected_scores

array([-0.06332415, -1.05569744, -0.19960916, -0.29655952,  1.04413036,
       -0.34040153,  0.68306015, -0.09680889, -0.20574277,  0.03293298,
       -0.04815607,  0.56006077,  0.15608298, -0.48197786, -0.01794342,
        0.29164753, -0.33325534, -0.57185198,  0.04038037,  1.2511415 ,
       -0.63963772,  0.30781934,  0.97989203, -0.20567224, -0.79203941])

In [56]:
# Fresh deep copy of the raw table, independent of race_corrected_data.
other_race_corrected_data = data.copy(deep=True)

In [57]:
# Give the 'Unnamed: 0' column a meaningful name (modifies the frame in place).
# NOTE(review): presumably this column holds zipcodes that were saved as an unnamed index -- confirm.
other_race_corrected_data.rename(columns={'Unnamed: 0': 'zipcode'}, inplace=True)

In [58]:
# Replace the raw scores with the two-regressor model's residuals.
other_race_corrected_data['scores'] = other_race_corrected_scores

In [59]:
# Correlations of every column with the eight-column model's residual scores,
# shown again in this section. 'scores' is the last column, so it is selected by position.
# .iloc replaces the removed .ix indexer; with string column labels .ix[:, -1] was
# already positional, so the result is unchanged.
race_corrected_data.corr().iloc[:, -1]

zipcode               -0.051254
pct_bachelors         -0.029507
labor_part_rate        0.183081
pct_welfare            0.309800
low_pov_idx           -0.079242
labor_idx             -0.048025
env_health_idx        -0.029866
hispanic               0.000236
white                  0.000383
black                  0.000134
american_indian        0.000118
asian                  0.000279
pac_islander           0.000200
other_races            0.000171
two_races              0.000394
housing_cost          -0.398101
transportation_cost   -0.074752
pub_school_score       0.095970
pr_school_score       -0.137316
rest_score            -0.019227
rest_proximity         0.125377
crime_index            0.017814
scores                 1.000000
Name: scores, dtype: float64

In [60]:
# Correlations of every column with the two-regressor model's residual scores.
# 'scores' is the last column, so it is selected by position.
# .iloc replaces the removed .ix indexer; with string column labels .ix[:, -1] was
# already positional, so the result is unchanged.
other_race_corrected_data.corr().iloc[:, -1]

zipcode               -0.021021
pct_bachelors         -0.011672
labor_part_rate        0.157599
pct_welfare            0.206008
low_pov_idx            0.021240
labor_idx              0.033953
env_health_idx         0.293346
hispanic              -0.220006
white                  0.000283
black                 -0.313187
american_indian       -0.081527
asian                  0.233603
pac_islander          -0.087019
other_races            0.009843
two_races              0.120511
housing_cost          -0.392875
transportation_cost   -0.030125
pub_school_score       0.042849
pr_school_score       -0.077970
rest_score            -0.380957
rest_proximity         0.095538
crime_index           -0.029469
scores                 1.000000
Name: scores, dtype: float64

In [61]:
# Persist the corrected table. to_csv writes the row index by default, so the file
# gets an extra unnamed leading column in addition to 'zipcode'.
other_race_corrected_data.to_csv('./../analysis_data/other_race_corrected_merged_data.csv')