# PROVIDENT: Gradient Boosting Machine

In [None]:
# Load packages
import csv

import numpy as np
import pandas as pd
import scipy
import sklearn
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
# Show every row and every column when a DataFrame is displayed (no truncation).
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [None]:
# Read the five source tables in a fixed order. The overdose table `od` comes
# from sudors.csv. NOTE(review): "filepath" looks like a placeholder directory
# prefix -- confirm the real location before running.
ems, pdmp, acs, rigis, od = (
    pd.read_csv(csv_path)
    for csv_path in (
        "filepath\\ems.csv",
        "filepath\\pdmp.csv",
        "filepath\\acs.csv",
        "filepath\\rigis.csv",
        "filepath\\sudors.csv",
    )
)

In [None]:
# Combine the PDMP and EMS tables on their shared GEOID key.
time_varying = pdmp.merge(ems, on='GEOID')
time_varying.head()

In [None]:
# Combine the ACS and RIGIS tables on their shared GEOID key.
time_fixed = acs.merge(rigis, on='GEOID')
time_fixed.head()

In [None]:
# Report (rows, columns) of the two merged tables, one per line.
print(time_fixed.shape, time_varying.shape, sep="\n")

In [None]:
# Relabel the 'municipality' column of the overdose table as 'TOWN'.
od = od.rename({'municipality': 'TOWN'}, axis='columns')
od.head()

# Run these chunks to format the data

In [None]:
# Keep the observed outcome column of each half-year period as a "ground truth"
# series for MSE and R2; the variable suffix encodes the period (e.g. 20195 is
# the '2019_2' column).
(
    ground_truth_20200,
    ground_truth_20195,
    ground_truth_20190,
    ground_truth_20185,
    ground_truth_20180,
    ground_truth_20175,
) = (
    od[period]
    for period in ('2020_1', '2019_2', '2019_1', '2018_2', '2018_1', '2017_2')
)

In [None]:
## MODELING APPROACH
# Training:
# (1) Predict 2020.1 (t) given 2019.2 (t-1) and 2019.1 (t-2) AND
# (2) Predict 2019.2 (t) given 2019.1 (t-1) and 2018.2 (t-2) AND
# (3) Predict 2019.1 (t) given 2018.2 (t-1) and 2018.1 (t-2) AND
# (4) Predict 2018.2 (t) given 2018.1 (t-1) and 2017.2 (t-2) AND
# (5) Predict 2018.1 (t) given 2017.2 (t-1) and 2017.1 (t-2) AND
# (6) Predict 2017.2 (t) given 2017.1 (t-1) and 2016.2 (t-2) AND
# (7) Predict 2017.1 (t) given 2016.2 (t-1) and 2016.1 (t-2)
#
# Test: (1) Predict 2020.2 (FUTURE) (t) given 2020.1 (t-1) and 2019.2 (t-2)

# Constructing the moving windows with time-varying data

In [None]:
# Column labels of the combined PDMP+EMS table for each training window.
# x_train_varying_<w>1 holds the labels of window w's t-1 period and
# x_train_varying_<w>2 those of its t-2 period.
def _columns_for(period_suffix):
    """Return the time_varying column labels whose name contains period_suffix."""
    labels = time_varying.columns
    return labels[labels.str.contains(period_suffix)]


x_train_varying_11 = _columns_for('_2019_2')
x_train_varying_12 = _columns_for('_2019_1')
x_train_varying_21 = _columns_for('_2019_1')
x_train_varying_22 = _columns_for('_2018_2')
x_train_varying_31 = _columns_for('_2018_2')
x_train_varying_32 = _columns_for('_2018_1')
x_train_varying_41 = _columns_for('_2018_1')
x_train_varying_42 = _columns_for('_2017_2')
x_train_varying_51 = _columns_for('_2017_2')
x_train_varying_52 = _columns_for('_2017_1')
x_train_varying_61 = _columns_for('_2017_1')
x_train_varying_62 = _columns_for('_2016_2')
x_train_varying_71 = _columns_for('_2016_2')
x_train_varying_72 = _columns_for('_2016_1')

In [None]:
# Pull the PDMP+EMS columns selected for each window/lag out of time_varying,
# producing one frame per label set (x_train_<w><lag>).
(
    x_train_11,
    x_train_12,
    x_train_21,
    x_train_22,
    x_train_31,
    x_train_32,
    x_train_41,
    x_train_42,
    x_train_51,
    x_train_52,
    x_train_61,
    x_train_62,
    x_train_71,
    x_train_72,
) = (
    time_varying.loc[:, labels]
    for labels in (
        x_train_varying_11,
        x_train_varying_12,
        x_train_varying_21,
        x_train_varying_22,
        x_train_varying_31,
        x_train_varying_32,
        x_train_varying_41,
        x_train_varying_42,
        x_train_varying_51,
        x_train_varying_52,
        x_train_varying_61,
        x_train_varying_62,
        x_train_varying_71,
        x_train_varying_72,
    )
)

In [None]:
# Replace the calendar period embedded in each column label with its relative
# lag ('t-1' or 't-2') so every window ends up with the same column names.
def _relabel_period(frame, period, lag):
    """Rewrite frame's column labels in place, substituting lag for period."""
    frame.columns = frame.columns.str.replace(period, lag)


_relabel_period(x_train_11, '2019_2', 't-1')
_relabel_period(x_train_12, '2019_1', 't-2')
_relabel_period(x_train_21, '2019_1', 't-1')
_relabel_period(x_train_22, '2018_2', 't-2')
_relabel_period(x_train_31, '2018_2', 't-1')
_relabel_period(x_train_32, '2018_1', 't-2')
_relabel_period(x_train_41, '2018_1', 't-1')
_relabel_period(x_train_42, '2017_2', 't-2')
_relabel_period(x_train_51, '2017_2', 't-1')
_relabel_period(x_train_52, '2017_1', 't-2')
_relabel_period(x_train_61, '2017_1', 't-1')
_relabel_period(x_train_62, '2016_2', 't-2')
_relabel_period(x_train_71, '2016_2', 't-1')
_relabel_period(x_train_72, '2016_1', 't-2')

In [None]:
# Place each window's t-1 and t-2 columns side by side.
x_train_1 = pd.concat((x_train_11, x_train_12), axis="columns")
x_train_2 = pd.concat((x_train_21, x_train_22), axis="columns")
x_train_3 = pd.concat((x_train_31, x_train_32), axis="columns")
x_train_4 = pd.concat((x_train_41, x_train_42), axis="columns")
x_train_5 = pd.concat((x_train_51, x_train_52), axis="columns")
x_train_6 = pd.concat((x_train_61, x_train_62), axis="columns")
x_train_7 = pd.concat((x_train_71, x_train_72), axis="columns")

In [None]:
x_train_varying_a = pd.concat([x_train_1,x_train_2])
x_train_varying_b = pd.concat([x_train_varying_a,x_train_3])
x_train_varying_c = pd.concat([x_train_varying_b,x_train_4])
x_train_varying_d = pd.concat([x_train_varying_c,x_train_5])
x_train_varying_e = pd.concat([x_train_varying_d,x_train_6])
x_train_varying = pd.concat([x_train_varying_e,x_train_7])

In [None]:
x_train_varying.head()

In [None]:
x_train_varying.shape

# Constructing the moving windows with time-fixed data

In [None]:
# Slice a label-based column range out of the merged time-fixed table for the
# ACS block and for the RIGIS block.
# NOTE(review): both slices start and end at the same 'var_name' label, which
# looks like a template placeholder -- replace with the real first/last column
# names of each block before running (TODO confirm).
acs_fixed = time_fixed.loc[:,'var_name':'var_name']
rigis_fixed = time_fixed.loc[:,'var_name':'var_name']

In [None]:
# Overwrite time_fixed with only the selected ACS and RIGIS columns, side by side.
time_fixed = pd.concat((acs_fixed, rigis_fixed), axis="columns")
time_fixed.head()

In [None]:
x_train_fixed_a = pd.concat([time_fixed,time_fixed])
x_train_fixed_b = pd.concat([x_train_fixed_a,time_fixed])
x_train_fixed_c = pd.concat([x_train_fixed_b,time_fixed])
x_train_fixed_d = pd.concat([x_train_fixed_c,time_fixed])
x_train_fixed_e = pd.concat([x_train_fixed_d,time_fixed])
x_train_fixed = pd.concat([x_train_fixed_e,time_fixed])

In [None]:
x_train_fixed.head()

In [None]:
# For each training window take the two consecutive outcome columns of `od`
# that act as its t-1 and t-2 lags (newer period first).
_od_periods = [
    '2019_2', '2019_1', '2018_2', '2018_1',
    '2017_2', '2017_1', '2016_2', '2016_1',
]
(
    x_train_od1,
    x_train_od2,
    x_train_od3,
    x_train_od4,
    x_train_od5,
    x_train_od6,
    x_train_od7,
) = (
    od[[newer, older]]
    for newer, older in zip(_od_periods, _od_periods[1:])
)

In [None]:
# Rename each window's pair of outcome columns to the relative lags
# 't-1' (newer period) and 't-2' (older period).
def _label_od_lags(frame, newer, older):
    """Relabel frame's columns in place: newer -> 't-1', then older -> 't-2'."""
    frame.columns = (
        frame.columns
        .str.replace(newer, 't-1')
        .str.replace(older, 't-2')
    )


_label_od_lags(x_train_od1, '2019_2', '2019_1')
_label_od_lags(x_train_od2, '2019_1', '2018_2')
_label_od_lags(x_train_od3, '2018_2', '2018_1')
_label_od_lags(x_train_od4, '2018_1', '2017_2')
_label_od_lags(x_train_od5, '2017_2', '2017_1')
_label_od_lags(x_train_od6, '2017_1', '2016_2')
_label_od_lags(x_train_od7, '2016_2', '2016_1')

In [None]:
x_train_od_a = pd.concat([x_train_od1,x_train_od2])
x_train_od_b = pd.concat([x_train_od_a,x_train_od3])
x_train_od_c = pd.concat([x_train_od_b,x_train_od4])
x_train_od_d = pd.concat([x_train_od_c,x_train_od5])
x_train_od_e = pd.concat([x_train_od_d,x_train_od6])
x_train_od = pd.concat([x_train_od_e,x_train_od7])
x_train_od.head()

In [None]:
print(x_train_fixed.shape)
print(x_train_varying.shape)
print(x_train_od.shape)

In [None]:
x_train_a = pd.concat([x_train_fixed,x_train_varying],axis=1)
x_train = pd.concat([x_train_a,x_train_od],axis=1)

In [None]:
x_train.head()

In [None]:
x_train.shape

In [None]:
# Target of each training window: the outcome column for that window's period
# t, taken as a single-column frame labelled 't'.
def _target_column(period):
    """Return od's `period` column as a one-column frame named 't'."""
    target = od[[period]]
    target.columns = ['t']
    return target


y_train1 = _target_column('2020_1')
y_train2 = _target_column('2019_2')
y_train3 = _target_column('2019_1')
y_train4 = _target_column('2018_2')
y_train5 = _target_column('2018_1')
y_train6 = _target_column('2017_2')
y_train7 = _target_column('2017_1')

In [None]:
# Stack the seven window targets row-wise, in the same window order as the
# feature frames, keeping each intermediate frame.
y_train_a = pd.concat((y_train1, y_train2), axis=0)
y_train_b = pd.concat((y_train_a, y_train3), axis=0)
y_train_c = pd.concat((y_train_b, y_train4), axis=0)
y_train_d = pd.concat((y_train_c, y_train5), axis=0)
y_train_e = pd.concat((y_train_d, y_train6), axis=0)
y_train = pd.concat((y_train_e, y_train7), axis=0)
y_train.head(n=5)

# Test set data

In [None]:
# Test: (1) Predict 2020.2 (FUTURE) (t) given 2020.1 (t-1) and 2019.2 (t-2)

In [None]:
# Column labels of the PDMP+EMS table for the test window:
# 2020_1 supplies t-1 and 2019_2 supplies t-2.
_varying_labels = time_varying.columns
x_test_varying_1 = _varying_labels[_varying_labels.str.contains('_2020_1')]
x_test_varying_2 = _varying_labels[_varying_labels.str.contains('_2019_2')]

In [None]:
# Extract the two groups of test-window columns from time_varying.
x_test_varying1, x_test_varying2 = (
    time_varying.loc[:, labels]
    for labels in (x_test_varying_1, x_test_varying_2)
)

In [None]:
# Swap the calendar period in each test column label for its relative lag so
# the names match the training columns.
for _frame, _period, _lag in (
    (x_test_varying1, '2020_1', 't-1'),
    (x_test_varying2, '2019_2', 't-2'),
):
    _frame.columns = _frame.columns.str.replace(_period, _lag)

In [None]:
# Put the test window's t-1 and t-2 columns side by side.
x_test_varying = pd.concat((x_test_varying1, x_test_varying2), axis="columns")
x_test_varying.head()

In [None]:
x_test_od = od[['2020_1','2019_2']]

In [None]:
x_test_od.columns = x_test_od.columns.str.replace('2020_1', 't-1')
x_test_od.columns = x_test_od.columns.str.replace('2019_2', 't-2')

In [None]:
x_test_od.head()

In [None]:
# Shapes of the three pieces that make up the test feature matrix, one per line.
print(time_fixed.shape, x_test_varying.shape, x_test_od.shape, sep="\n")

In [None]:
x_test_a = pd.concat([time_fixed,x_test_varying],axis=1)
x_test = pd.concat([x_test_a,x_test_od],axis=1)

In [None]:
x_test.head()

In [None]:
# NO Y TEST BECAUSE WE ARE PREDICTING THE FUTURE

In [None]:
print(x_test.shape)

In [None]:
print(x_train.shape)
print(y_train.shape)

In [None]:
print(x_test.shape)

In [None]:
print(x_train.shape)
print(y_train.shape)

# Feature selection

In [None]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [None]:
x_train_scaled = pd.DataFrame(x_train_scaled)

In [None]:
x_train_scaled.columns = x_train.columns
x_train_scaled.head()

In [None]:
# Fit an ElasticNetCV-driven feature selector on the standardized training
# features and the flattened targets (fit returns the selector itself).
sel_ = SelectFromModel(
    estimator=ElasticNetCV(max_iter=10000, tol=0.008)
).fit(x_train_scaled, y_train.values.ravel())
sel_.get_support()  # boolean keep-mask; the result is discarded here
# Despite its name, this frame holds ALL unscaled training columns; it is
# built from x_train with x_train's own column labels.
x_train_selected = pd.DataFrame(x_train, columns=x_train.columns)

In [None]:
selected_feats = x_train_selected.columns[(sel_.get_support())]
print('Total features: {}'.format((x_train_scaled.shape[1])))
print('Selected features: {}'.format(len(selected_feats)))
print('Features with coef at 0: {}'.format(np.sum(sel_.estimator_.coef_==0)))

In [None]:
selected_feats

In [None]:
# Features dropped by the selector: the complement of its support mask.
# SelectFromModel keeps only features whose importance reaches its threshold,
# so features with a small non-zero coefficient can be dropped as well;
# filtering on coef_ == 0 would leave those out of this list.
removed_feats = x_train.columns[~sel_.get_support()]
removed_feats

In [None]:
# Keep only the selected columns. The selector was fit on the standardized
# frame but is applied to the unscaled x_train / x_test here, so the model
# downstream sees features in their original units.
x_train_selected = sel_.transform(x_train)
x_test_selected = sel_.transform(x_test)
print(x_test_selected.shape, x_train_selected.shape, sep="\n")

# Modeling

In [None]:
# scikit-learn gradient boosting regressor, tuned by exhaustive grid search
# with 5-fold cross-validation.
# max_features=None (use all features) replaces the legacy 'auto' string: for
# GradientBoostingRegressor 'auto' meant the same thing, but it was deprecated
# in scikit-learn 1.1 and removed in 1.3, where it fails parameter validation.
param_grid = {
    'max_depth': range(1, 9, 1),
    'min_samples_leaf': [1, 2, 5, 10, 20, 50, 100],
    'max_features': [0.33, None],
    'n_estimators': range(50, 300, 50),
}
gb = GradientBoostingRegressor(random_state=888)
gs = GridSearchCV(gb, param_grid=param_grid, cv=5)
# fit() returns the fitted search object, so `rs` and `gs` are the same object.
rs = gs.fit(x_train_selected, y_train.values.ravel())
print(rs.best_params_)
# Score of the refit best estimator on the training rows themselves
# (in-sample), not the cross-validated score.
print(rs.score(x_train_selected, y_train))

In [None]:
# Obtain predictions for the test window and attach them as a 'FINAL' column.
# The name `sudors` was never bound (the SUDORS table was loaded from
# sudors.csv as `od`), so define it as a copy of `od`; this also leaves `od`
# itself unmodified.
# NOTE(review): assumes the rows of x_test are in the same order as `od` -- verify.
sudors = od.copy()
sudors['FINAL'] = rs.predict(x_test_selected)
sudors.head()