In [3]:
import pandas as pd
import numpy as np
import plotly.figure_factory as ff

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import (AdaBoostRegressor, 
                              GradientBoostingRegressor,
                              RandomForestRegressor,
                              VotingRegressor)
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

In [4]:
# Load the modeling dataset; the path is relative to the current working directory.
dataset_path = '../data/sam_dataset.csv'
df = pd.read_csv(dataset_path)

## Feature Selection

In [None]:
# Derived ratio features, one (new column, numerator column, denominator column)
# triple per feature. Values are plain ratios (not multiplied by 100) despite the
# `pct_` prefix. Commented-out triples are features that are currently disabled.
ratio_specs = [
    ('pct_nonfamily_house', 'nonfamily_households', 'households'),
    # ('pct_family_house', 'family_households', 'households'),
    ('pct_male_male_households', 'male_male_households', 'households'),
    ('pct_female_female_households', 'female_female_households', 'households'),
    ('pct_male', 'male_pop', 'total_pop'),
    # ('pct_female', 'female_pop', 'total_pop'),
    ('pct_white', 'white_pop', 'total_pop'),
    ('pct_black', 'black_pop', 'total_pop'),
    ('pct_asian', 'asian_pop', 'total_pop'),
    ('pct_hispanic', 'hispanic_pop', 'total_pop'),
    ('pct_amerindian', 'amerindian_pop', 'total_pop'),
    ('pct_other_race', 'other_race_pop', 'total_pop'),
    ('pct_two_or_more_races', 'two_or_more_races_pop', 'total_pop'),
    ('pct_commuters_by_public_transportation', 'commuters_by_public_transportation', 'total_pop'),
    ('pct_households', 'households', 'total_pop'),
    ('pop_per_housing_unit', 'total_pop', 'housing_units'),
    ('pct_armed_forces', 'armed_forces', 'total_pop'),
    ('pct_employed', 'employed_pop', 'total_pop'),
    ('pct_bachelors_degree_or_higher_25_64', 'bachelors_degree_or_higher_25_64', 'total_pop'),
    ('pct_children', 'children', 'total_pop'),
    ('pct_employed_education_health_social', 'employed_education_health_social', 'total_pop'),
    ('pct_not_us_citizen_pop', 'not_us_citizen_pop', 'total_pop'),
    # ('pct_not_hispanic', 'not_hispanic_pop', 'total_pop'),
]

# Columns are appended to `df` in list order.
for new_col, numerator, denominator in ratio_specs:
    df[new_col] = df[numerator] / df[denominator]

In [None]:
# Columns selected as model inputs; X is built as df[features], so order is kept.
features = [
    'poverty_rate',
    'pct_male',
    'pct_black',
    'pct_asian',
    'pct_hispanic',
    'pct_amerindian',
    'median_age',
    'pct_commuters_by_public_transportation',
    'pct_households',
    'pct_employed',
    'pct_bachelors_degree_or_higher_25_64',
    'pct_children',
    'pct_not_us_citizen_pop',
    'Staffed All Beds',
]

# Candidate columns currently left out of the model (kept here for quick re-enabling):
#   'gini_index', 'pct_nonfamily_house', 'rent_over_50_percent', 'total_pop',
#   'pop_per_housing_unit', 'median_rent', 'mobile_homes', 'pct_armed_forces',
#   'pct_employed_education_health_social'

In [None]:
def print_gs(gs, X_train, X_test, y_train, y_test):
    """Print a short report for a fitted grid search.

    Parameters
    ----------
    gs : fitted search object
        Must expose ``cv_results_``, ``best_index_``, ``best_score_``,
        ``best_params_`` and ``score(X, y)``.
    X_train, y_train
        Not used by the report; kept in the signature so existing calls
        keep working.
    X_test, y_test
        Held-out data passed to ``gs.score``.

    Returns
    -------
    None
        The report is written to stdout.
    """
    best = gs.best_index_
    cv_results = gs.cv_results_

    cv_mean = cv_results['mean_test_score'][best]
    # The printed "+/-" spread is two standard deviations of the fold scores.
    cv_range = 2 * cv_results['std_test_score'][best]

    print('Best Score: ', gs.best_score_)
    print(f'Cross Val Score {cv_mean} +/- {cv_range}')
    print('Test Score: ', gs.score(X_test, y_test))
    print('Best Params: ', gs.best_params_)

## Data Preprocessing & Modeling

In [None]:
# TODO: feature selection using Lasso (not implemented yet)
# def select_features(X, y, verbose=True):
    

In [None]:
# Target is the death rate; predictors are the hand-picked `features` columns.
# Alternative kept for reference: use every column except the target and the date.
# X = df.drop(['death_rate', 'do_date'], axis=1)
y = df['death_rate']
X = df[features]

# test_size is left at the library default; the seed fixes the partition.
split = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split

In [None]:
def _make_pipeline(step_name, estimator):
    """Return a scale -> PCA -> estimator pipeline whose last step is named `step_name`."""
    return Pipeline([
        ('ss', StandardScaler()),
        ('pca', PCA(.95)),  # fractional n_components; the grid searches tune it
        (step_name, estimator),
    ])


# All models share the same preprocessing; only the final estimator differs.
# The stochastic estimators are seeded (same seed as the train/test split) so
# that Restart & Run All reproduces the same scores.
pipe_lr = _make_pipeline('lr', LinearRegression())
pipe_rf = _make_pipeline('rf', RandomForestRegressor(random_state=42))
pipe_ada = _make_pipeline('ada', AdaBoostRegressor(random_state=42))
pipe_gb = _make_pipeline('gb', GradientBoostingRegressor(random_state=42))

In [None]:
# TODO: loop over the models instead of repeating one grid-search cell per model
# models = [()]
# for model in models

**Baseline**

In [None]:
# Baseline: always predict the mean death rate seen in training. The mean comes
# from y_train (not y_test) so the baseline never sees held-out labels; its test
# R^2 is therefore at or slightly below 0 rather than exactly 0 by construction.
preds_base = np.full(X_test.shape[0], np.mean(y_train))
r2_score(y_test, preds_base)

**Linear Regression**

In [None]:
# Only the PCA variance threshold is tuned for the linear model
# (the 'mle' option is currently disabled).
params_lr = dict(
    pca__n_components=[0.9, 0.95, 0.99],
    pca__svd_solver=['full'],
)

gs_lr = GridSearchCV(estimator=pipe_lr, param_grid=params_lr, n_jobs=-1)
gs_lr.fit(X_train, y_train)
print_gs(gs_lr, X_train, X_test, y_train, y_test)

**Random Forest Regressor**

In [None]:
# Tune tree depth together with the PCA variance threshold
# (the 'mle' option is currently disabled).
params_rf = dict(
    rf__max_depth=[5, 10, 15, 20],
    pca__n_components=[0.9, 0.95, 0.99],
    pca__svd_solver=['full'],
)

gs_rf = GridSearchCV(estimator=pipe_rf, param_grid=params_rf, n_jobs=-1)
gs_rf.fit(X_train, y_train)
print_gs(gs_rf, X_train, X_test, y_train, y_test)

In [None]:
# Best Score:  0.24892601047561308
# Cross Val Score [0.23869433 0.24892601 0.24315752 0.24273401] +/- [0.05422757 0.05574761 0.05725976 0.05731394]
# Test Score:  0.2869630413685961
# Best Params:  {'rf__max_depth': 10}

**AdaBoost Regressor**

In [None]:
# Base learner handed to AdaBoost: a small, shallow random forest.
ada_base_forest = RandomForestRegressor(
    n_estimators=75,
    max_depth=3,
    min_samples_split=10,
    min_samples_leaf=10,
)

# NOTE(review): scikit-learn 1.2 renamed AdaBoost's `base_estimator` to `estimator`;
# confirm the installed version still accepts the key below.
params_ada = {
    'pca__n_components': [0.95, 0.99],  # the 'mle' option is currently disabled
    'pca__svd_solver': ['full'],
    'ada__base_estimator': [ada_base_forest],
    'ada__learning_rate': [0.001],
}

gs_ada = GridSearchCV(estimator=pipe_ada, param_grid=params_ada, n_jobs=-1)
gs_ada.fit(X_train, y_train)
print_gs(gs_ada, X_train, X_test, y_train, y_test)

**Gradient Boosting Regressor**

In [None]:
# Tune tree depth together with the PCA variance threshold
# (the 'mle' option is currently disabled).
params_gb = dict(
    gb__max_depth=[5, 7, 10],
    pca__n_components=[0.9, 0.95, 0.99],
    pca__svd_solver=['full'],
)

gs_gb = GridSearchCV(estimator=pipe_gb, param_grid=params_gb, n_jobs=-1)
gs_gb.fit(X_train, y_train)
print_gs(gs_gb, X_train, X_test, y_train, y_test)

**Voting Regressor**

In [None]:
# Ensemble of the linear-regression and random-forest pipelines.
vote = VotingRegressor(estimators=[('lr', pipe_lr), ('rf', pipe_rf)])

# Nested parameter names read <voting member>__<pipeline step>__<parameter>.
# The 'mle' option for n_components is currently disabled.
params_vote = dict(
    lr__pca__n_components=[0.9, 0.95, 0.99],
    lr__pca__svd_solver=['full'],
    rf__rf__max_depth=[5, 10, 15, 20],
)

gs_vote = GridSearchCV(estimator=vote, param_grid=params_vote, n_jobs=-1)
gs_vote.fit(X_train, y_train)
print_gs(gs_vote, X_train, X_test, y_train, y_test)

## Model Scores

**Death Rates**

|model|test score|cross val mean|cross val std|
| :--- | :---: | :---: | :---: |
| Baseline | 0 | - | - |
| Linear Regression | 0.250 | 0.225 | 0.102 |
| Random Forest Regressor | 0.244 | 0.215 | 0.115 |
| AdaBoost Regressor | 0.180 | 0.168 | 0.055 |
| Gradient Boosting Regressor | 0.223 | 0.177 | 0.111 |

In [None]:
# Show the 'lr' step of the best pipeline found by the linear-regression grid search.
best_lr_pipeline = gs_lr.best_estimator_
best_lr_pipeline.named_steps['lr']