<h1>Transform and Rescale</h1>
<p>This notebook transforms and rescales the predictor data so that skewed variables do not bias results in later analysis. It produces two datasets (one per year) that can be saved and reloaded for the Random Forests analysis; it only needs to be re-run if the transformation used for the scoring data changes.</p>
<p>Robust standardization using median and Interquartile Range</p>

In [None]:
import pandas as pd
# pd.options.mode.chained_assignment = None  # default='warn'
import os
import re
import glob
import numpy as np
from functools import reduce
import seaborn as sns

from sklearn import decomposition  
from sklearn.preprocessing import scale  
from sklearn import preprocessing 
from sklearn import linear_model
from sklearn import model_selection
#from sklearn import cross_validation

from scipy.stats import boxcox
from scipy.stats import spearmanr
from scipy.stats import pearsonr

# For reproducibility: seed both Python's `random` module and NumPy's
# global RNG with the same value. `r_state` is also passed as
# `random_state` to the DataFrame.sample() calls further down.
import random
import numpy as np
r_state = 42
random.seed(r_state) 
np.random.seed(r_state)

# Needed on a Mac
# NOTE(review): the Tk backend is selected here and the IPython magic
# below then requests inline rendering -- confirm both are still required.
import matplotlib as mpl
mpl.use('TkAgg')
%matplotlib inline
import matplotlib.pyplot as plt 

In [None]:
def load_status_scores(dtype, data_dir='/Users/ritalaplante/Desktop/Thesis Data and Analytics/04-Neighborhood Scores'):
    """Load the neighbourhood SES scores for one transformation.

    Reads ``scores<dtype>.csv`` from ``data_dir`` (first column used as the
    index), drops every row containing a missing value, removes the two rank
    columns and renames the remaining score columns to readable labels.

    Parameters
    ----------
    dtype : str
        Name of the transformation whose scores should be loaded; it is
        inserted verbatim into the file name (the notebook lists
        'Untransformed', 'Box-Cox' and 'Log' as options).
    data_dir : str, optional
        Directory holding the score files. Defaults to the location used
        throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        Scores with columns renamed to 'SES 2010', 'SES 2019',
        'SES Ascent 2010-2019', 'SES 2010 Percentile',
        'SES 2019 Percentile' and 'SES Percentile Ascent 2010-2019'.

    Raises
    ------
    FileNotFoundError
        If the score file does not exist.
    KeyError
        If 'RANK_10' or 'RANK_19' is missing from the file.
    """
    # Build the path from the *argument*; the previous version ignored
    # `dtype` and silently depended on the notebook-level `to_use`.
    path = os.path.join(data_dir, 'scores' + dtype + '.csv')

    return (
        pd.read_csv(path, index_col=0)  # SES scores
        .dropna()
        .drop(['RANK_10', 'RANK_19'], axis=1)
        .rename(columns={
            'SES_10': 'SES 2010',
            'SES_19': 'SES 2019',
            'SES_ASC': 'SES Ascent 2010-2019',
            'SES_PR_10': 'SES 2010 Percentile',  # 99 = High-status
            'SES_PR_19': 'SES 2019 Percentile',  # 99 = High-status
            'SES_PR_ASC': 'SES Percentile Ascent 2010-2019',
        })
    )

def load_predictors(dtype):
    """Placeholder for a predictor-data loader.

    NOTE(review): the body is unfinished. ``dtype`` is never used and the
    function returns ``status``, a name that is not assigned inside this
    function, so a call raises NameError unless a global called ``status``
    happens to exist. No call to this function appears in the visible cells.
    """
    
    return status

<h2>Choose Your Transformation</h2>
<p>It should be easy to load and reload data once the transformation is changed. Use the same transformation that was applied in the scoring notebook.</p>

In [None]:
# Which transformation's files to load. Options are: ['Untransformed','Box-Cox','Log']
to_use = 'Untransformed'

# SES scores (2010, 2019 and the change between them)
SES = load_status_scores(to_use)

# SES inputs for each year. The year suffix is stripped from the column
# names as part of the load so that both years end up with the same labels.
d10input = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/04-Neighborhood Scores/inputs2010' + to_use + '.csv',
    index_col=0,
).rename(columns=lambda name: re.sub(' 2010', '', name))

d19input = pd.read_csv(
    '/Users/ritalaplante/Desktop/Thesis Data and Analytics/04-Neighborhood Scores/inputs2019' + to_use + '.csv',
    index_col=0,
).rename(columns=lambda name: re.sub(' 2019', '', name))

In [None]:
# Read the cleaned predictor datasets and attach them to the SES inputs of
# the same year. The inner join on the index keeps only the rows that are
# present in both the inputs and the predictors.
d10 = d10input.merge(
    pd.read_csv('/Users/ritalaplante/Desktop/Thesis Data and Analytics/02-Cleaned Predictor Data/predictor2010.csv', index_col=0),  # Main dataset for 2010
    how='inner', left_index=True, right_index=True,
)
d19 = d19input.merge(
    pd.read_csv('/Users/ritalaplante/Desktop/Thesis Data and Analytics/02-Cleaned Predictor Data/predictor2019.csv', index_col=0),  # Main dataset for 2019
    how='inner', left_index=True, right_index=True,
)

In [None]:
# Report how many variables are available (column count plus one), then
# show a reproducible three-row sample of the 2010 data.
n_variables = len(d10.columns) + 1
print(f"Have {n_variables} variables to work with.")
d10.sample(3, random_state=r_state)

In [None]:
# Sanity check: compare the variable names of the two years in both
# directions. The scaler fitted on 2010 is applied to 2019 further down, so
# any name printed here points at a column present in only one year.
s10 = set(d10.columns)
s19 = set(d19.columns)
print("2010 vs 2019 variable check: " + str(s10.difference(s19)))
# Label fixed: this line reports columns in 2019 that are missing from 2010
# (the previous text read "2010 vs 2009").
print("2019 vs 2010 variable check: " + str(s19.difference(s10)))

In [None]:
# Summary statistics for every column of the SES score table.
SES.describe()

In [None]:
# One row per variable: the eight describe() statistics for 2010 followed by
# the same eight for 2019. The rows are collected in a list and the frame is
# built once; DataFrame.append (used previously) no longer exists in
# pandas 2.x and re-copied the whole frame on every iteration.
descriptives = pd.DataFrame(
    [
        pd.concat([d10[c].describe(), d19[c].describe()], axis=0, ignore_index=True)
        for c in d10.columns
    ],
    index=list(d10.columns),
)

descriptives.columns = ['2010 Count','2010 Mean','2010 StD','2010 Min','2010 LQ','2010 Median','2010 UQ','2010 Max',
                        '2019 Count','2019 Mean','2019 StD','2019 Min','2019 LQ','2019 Median','2019 UQ','2019 Max']

In [None]:
# Preview the first rows of the per-variable summary table.
descriptives.head()

In [None]:
# Variables to spot-check in the cells below: four drawn reproducibly from
# the summary table, plus a fixed list of named variables. Keeping the
# result in `dsample` lets the later checks re-use the same selection.
dsample = np.append(
    descriptives.sample(4, random_state=r_state).index.values,
    [
        'geoid',
        'House Prices',
        'Percentage with Bach Degree',
        'Percentage Professional Workers',
        'Household Income',
        'Contract Rent',
    ],
)

In [None]:
# Show the range and median of the spot-check variables for both years.
descriptives.loc[
    descriptives.index.isin(dsample),
    ['2010 Min','2019 Min','2010 Max','2019 Max','2010 Median','2019 Median'],
]

In [None]:
# Save the summary table of the unscaled data; the file name includes the
# chosen transformation (`to_use`) and the variable names are kept as the
# index column.
descriptives.to_csv('/Users/ritalaplante/Desktop/Thesis Data and Analytics/10-Summary Stats/Full Dataset ' + to_use + ' Descriptives.csv', index=True)

<h2>Rescaling Data</h2>
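<p>The code below rescales the 2010 and 2019 data with a common robust scale: the interquartile range of each variable, fitted on the 2010 data and applied to both years. Each dataset is then centered independently by removing its own median.</p>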

In [None]:
# Stage 1 -- common scale, no centring.
# The scaler is fitted on 2010 only (interquartile range, 25th-75th
# percentile) and then applied unchanged to both years, so 2010 and 2019
# share one scale. Centring is deliberately skipped here because the two
# years won't have the same centre.
rs1 = preprocessing.RobustScaler(with_centering=False, quantile_range=(25.0, 75.0))
rs1.fit(d10)

d10_trs1 = pd.DataFrame(rs1.transform(d10), index=d10.index, columns=d10.columns)
d19_trs1 = pd.DataFrame(rs1.transform(d19), index=d19.index, columns=d19.columns)

# Stage 2 -- centring only, no further scaling.
# The scaler is re-fitted for each year, so every year is centred on its
# own values independently of the other.
rs2 = preprocessing.RobustScaler(with_scaling=False)

d10_trs2 = pd.DataFrame(rs2.fit_transform(d10_trs1), index=d10.index, columns=d10.columns)
d19_trs2 = pd.DataFrame(rs2.fit_transform(d19_trs1), index=d19.index, columns=d19.columns)

# Inner-join with the SES table on the index so that only rows which also
# have SES scores survive, then remove the score columns the join added.
ses_columns = [
    'SES 2010',
    'SES 2019',
    'SES Ascent 2010-2019',
    'SES 2010 Percentile',
    'SES 2019 Percentile',
    'SES Percentile Ascent 2010-2019',
]
d10_trs2 = d10_trs2.merge(SES, how='inner', left_index=True, right_index=True).drop(ses_columns, axis=1)
d19_trs2 = d19_trs2.merge(SES, how='inner', left_index=True, right_index=True).drop(ses_columns, axis=1)

# Write the transformed data to csv (one file per year, named after the
# chosen transformation)
d10_trs2.to_csv('/Users/ritalaplante/Desktop/Thesis Data and Analytics/05-Transformed and Scaled Data/TransformedAndScaled2010' + to_use + '.csv', index=True)
d19_trs2.to_csv('/Users/ritalaplante/Desktop/Thesis Data and Analytics/05-Transformed and Scaled Data/TransformedAndScaled2019' + to_use + '.csv', index=True)

print("Done.")

<h2>Check that We Did Everything Correctly</h2>

In [None]:
# Summary statistics after the first transform (d10_trs1 / d19_trs1): one
# row per variable, the eight describe() statistics for 2010 followed by the
# same eight for 2019. The rows are collected in a list and the frame is
# built once; DataFrame.append (used previously) no longer exists in
# pandas 2.x and re-copied the whole frame on every iteration.
descriptives_trs1 = pd.DataFrame(
    [
        pd.concat([d10_trs1[c].describe(), d19_trs1[c].describe()], axis=0, ignore_index=True)
        for c in d10_trs1.columns
    ],
    index=list(d10_trs1.columns),
)

descriptives_trs1.columns = ['2010 Count','2010 Mean','2010 StD','2010 Min','2010 LQ','2010 Median','2010 UQ','2010 Max',
                             '2019 Count','2019 Mean','2019 StD','2019 Min','2019 LQ','2019 Median','2019 UQ','2019 Max']

# Useful, but time-consuming
#plot_checks(d01_trs1, dsample, 'First-transform')

# Spot-check the same sample of variables as before the transform.
descriptives_trs1[descriptives_trs1.index.isin(dsample)][
    ['2010 Min','2019 Min','2010 Max','2019 Max','2010 Median','2019 Median','2010 Mean','2019 Mean']
]

In [None]:
# Summary statistics after the second transform (d10_trs2 / d19_trs2): one
# row per variable, the eight describe() statistics for 2010 followed by the
# same eight for 2019. The rows are collected in a list and the frame is
# built once; DataFrame.append (used previously) no longer exists in
# pandas 2.x and re-copied the whole frame on every iteration.
descriptives_trs2 = pd.DataFrame(
    [
        pd.concat([d10_trs2[c].describe(), d19_trs2[c].describe()], axis=0, ignore_index=True)
        for c in d10_trs2.columns
    ],
    index=list(d10_trs2.columns),
)

descriptives_trs2.columns = ['2010 Count','2010 Mean','2010 StD','2010 Min','2010 LQ','2010 Median','2010 UQ','2010 Max',
                             '2019 Count','2019 Mean','2019 StD','2019 Min','2019 LQ','2019 Median','2019 UQ','2019 Max']

# Useful, but time-consuming
#plot_checks(d01_trs2, dsample, 'Second-transform')

# Spot-check the same sample of variables as before the transform.
descriptives_trs2[descriptives_trs2.index.isin(dsample)][
    ['2010 Min','2019 Min','2010 Max','2019 Max','2010 Median','2019 Median','2010 Mean','2019 Mean']
]

In [None]:
# Save the summary table of the second-transform data; the file name
# includes the chosen transformation (`to_use`) and the variable names are
# kept as the index column.
descriptives_trs2.to_csv('/Users/ritalaplante/Desktop/Thesis Data and Analytics/10-Summary Stats/Full Dataset Transformed ' + to_use + ' Descriptives.csv', index=True)