In [1]:
import pandas as pd
import xgboost
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.externals import joblib

In [2]:
case_rainfall_features = pd.read_csv('/Users/Rohil/Documents/iGEM/yemen/full_feature_data.csv')
case_rainfall_features.drop('weekly_cases', axis = 1, inplace = True)

In [3]:
conflict_event_features = pd.read_csv('/Users/Rohil/Documents/iGEM/yemen/full_conflict_features.csv')

In [4]:
fatality_features = pd.read_csv('/Users/Rohil/Documents/iGEM/yemen/full_fatality_features.csv')

In [5]:
y_df = pd.read_csv('/Users/Rohil/Documents/iGEM/yemen/cholera_epi_data/y_normalized_df.csv')

In [6]:
y_df.head()

Unnamed: 0,gov_iso,date,next_week_cases,next_two_week_cases,next_four_week_cases,next_six_week_cases
0,YE-AB,2017-05-23,6.976066,14.570406,33.052834,84.310833
1,YE-AB,2017-05-24,7.085352,14.938739,32.211941,87.79583
2,YE-AB,2017-05-25,7.194638,15.600525,36.258546,91.78172
3,YE-AB,2017-05-26,7.303923,16.26231,37.284616,95.266718
4,YE-AB,2017-05-27,7.413209,16.924095,39.282115,97.846061


In [7]:
case_rainfall_features = case_rainfall_features[(case_rainfall_features['date'] > '2017-05-30')].sort_values('date')

In [8]:
full_features = case_rainfall_features.merge(conflict_event_features.drop('weekly_cases', axis=1), on = ['gov_iso', 'date'], how = 'left').merge(fatality_features.drop('fatalities', axis=1), on = ['gov_iso', 'date'], how = 'left').fillna(0)

In [10]:
full_features.date = pd.to_datetime(full_features.date, format = '%Y-%m-%d')

In [15]:
full_features = pd.concat([full_features, pd.get_dummies(full_features.gov_iso)], axis=1)

In [16]:
full_features.head()

Unnamed: 0,gov_iso,date,mean_past_week_cases,max_past_week_cases,kurtosis_past_week_cases,mean_past_2_week_cases,max_past_2_week_cases,kurtosis_past_2_week_cases,mean_past_month_cases,max_past_month_cases,...,YE-LA,YE-MA,YE-MR,YE-MW,YE-RA,YE-SA,YE-SD,YE-SH,YE-SN,YE-TA
0,YE-AB,2017-05-31,0.996581,1.123213,-1.916667,0.984709,1.123213,-1.733333,0.984709,1.123213,...,0,0,0,0,0,0,0,0,0,0
1,YE-SN,2017-05-31,1.3131,1.375641,-1.916667,1.320918,1.375641,-1.733333,1.320918,1.375641,...,0,0,0,0,0,0,0,0,1,0
2,YE-SH,2017-05-31,0.006578,0.015349,-1.916667,0.005756,0.015349,-1.733333,0.005756,0.015349,...,0,0,0,0,0,0,0,1,0,0
3,YE-BA,2017-05-31,2.44522,2.484196,-1.916667,2.450092,2.484196,-1.733333,2.450092,2.484196,...,0,0,0,0,0,0,0,0,0,0
4,YE-IB,2017-05-31,0.375929,0.408584,-1.916667,0.380011,0.408584,-1.733333,0.380011,0.408584,...,0,0,0,0,0,0,0,0,0,0


In [275]:
full_features.to_csv('/Users/Rohil/Documents/iGEM/yemen/full_features.csv', index = False)

In [17]:
full_features.iloc[:,164:185].shape

(5397, 21)

In [18]:
categorical_cols = (162, 183)

In [19]:
all_feature_list = full_features.drop('gov_iso', axis = 1).columns

In [20]:
full_features.date = pd.to_datetime(full_features.date, format = '%Y-%m-%d')
y_df.date = pd.to_datetime(y_df.date, format = '%Y-%m-%d')

In [21]:
full_features

Unnamed: 0,gov_iso,date,mean_past_week_cases,max_past_week_cases,kurtosis_past_week_cases,mean_past_2_week_cases,max_past_2_week_cases,kurtosis_past_2_week_cases,mean_past_month_cases,max_past_month_cases,...,YE-LA,YE-MA,YE-MR,YE-MW,YE-RA,YE-SA,YE-SD,YE-SH,YE-SN,YE-TA
0,YE-AB,2017-05-31,0.996581,1.123213,-1.916667,0.984709,1.123213,-1.733333,0.984709,1.123213,...,0,0,0,0,0,0,0,0,0,0
1,YE-SN,2017-05-31,1.313100,1.375641,-1.916667,1.320918,1.375641,-1.733333,1.320918,1.375641,...,0,0,0,0,0,0,0,0,1,0
2,YE-SH,2017-05-31,0.006578,0.015349,-1.916667,0.005756,0.015349,-1.733333,0.005756,0.015349,...,0,0,0,0,0,0,0,1,0,0
3,YE-BA,2017-05-31,2.445220,2.484196,-1.916667,2.450092,2.484196,-1.733333,2.450092,2.484196,...,0,0,0,0,0,0,0,0,0,0
4,YE-IB,2017-05-31,0.375929,0.408584,-1.916667,0.380011,0.408584,-1.733333,0.380011,0.408584,...,0,0,0,0,0,0,0,0,0,0
5,YE-SA,2017-05-31,4.643352,5.665804,-1.916667,4.771159,5.665804,-1.733333,4.771159,5.665804,...,0,0,0,0,0,1,0,0,0,0
6,YE-DA,2017-05-31,2.053912,2.500223,-1.916667,2.012071,2.500223,-1.733333,2.012071,2.500223,...,0,0,0,0,0,0,0,0,0,0
7,YE-SD,2017-05-31,0.028928,0.067499,-1.916667,0.025312,0.067499,-1.733333,0.025312,0.067499,...,0,0,0,0,0,0,1,0,0,0
8,YE-MW,2017-05-31,3.000866,3.025834,-1.916667,3.003987,3.025834,-1.733333,3.003987,3.025834,...,0,0,0,1,0,0,0,0,0,0
9,YE-TA,2017-05-31,0.717478,0.735545,-1.916667,0.715785,0.735545,-1.733333,0.715785,0.735545,...,0,0,0,0,0,0,0,0,0,1


In [22]:
full_data = full_features.merge(y_df, on = ['gov_iso', 'date'], how = 'left')

In [23]:
X = full_data[all_feature_list]

In [24]:
y1 = full_data[['date', 'next_week_cases']]
y2 = full_data[['date', 'next_two_week_cases']]
y4 = full_data[['date', 'next_four_week_cases']]
y6 = full_data[['date', 'next_six_week_cases']]

In [25]:
def scale_features(scaler, X, categorical_cols):
    
    X_numerical = X.iloc[:,:(categorical_cols[0])]
    X_categorical = X.iloc[:, categorical_cols[0]:categorical_cols[1]]
        
    X_numerical_scaled = pd.DataFrame(data=scaler.transform(X_numerical), columns = X_numerical.columns)
    
    X_scaled = pd.concat([X_numerical_scaled, X_categorical.reset_index(drop=True)], axis=1)
    
    return (X_scaled)

In [29]:
def train_base_selection_model(X, y, param_grid, split_date, categorical_cols):
    
    # some of the ending values are undefined purposely, as there isnt't enough data to get cholera cases for x weeks
    y = y.dropna()
    X = X.loc[y.index]
    
    print ('X-shape:')
    
    X_train, X_test = X[(X.date<split_date)].drop('date', axis=1), X[(X.date>=split_date)].drop('date', axis=1)
    y_train, y_test = y[(y.date<split_date)].drop('date', axis=1), y[(y.date>=split_date)].drop('date', axis=1)
    
    scaler = StandardScaler()
    scaler.fit(X_train.iloc[:,:categorical_cols[0]])
    
    X_train, X_test = scale_features(scaler, X_train, categorical_cols), scale_features(scaler, X_test, categorical_cols)
    
    xgb = xgboost.XGBRegressor()
    
    grid_search = GridSearchCV(xgb,
                        param_grid = param_grid,
                        cv = TimeSeriesSplit(n_splits=2).split(X_train),
                        n_jobs = -1,
                        scoring = 'explained_variance',
                        verbose=True)
    
    grid_search.fit(X_train, y_train)
    
    y_pred = grid_search.predict(X_test)
    score = explained_variance_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    print('w/ all features... explained variance: %s, mean abs error: %s' % (score, mae))

    
    return(grid_search.best_estimator_, grid_search.best_params_)

In [30]:
base_param_grid = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:linear'],
              'learning_rate': [0.01, 0.1], #so called `eta` value
              'max_depth': [15, 30, 45],
              'min_child_weight': [15, 30, 45],
              'silent': [1],
              'subsample': [0.5, 0.65, 0.8],
              'colsample_bytree': [0.5, 0.65, 0.8],
              'n_estimators': [150, 300, 450]}

In [31]:
week1_xgb, week1_params = train_base_selection_model(X, y1, base_param_grid, '2017-09-25', categorical_cols)

X-shape:
Fitting 2 folds for each of 486 candidates, totalling 972 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   29.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 14.7min
[Parallel(n_jobs=-1)]: Done 972 out of 972 | elapsed: 19.9min finished


w/ all features... explained variance: 0.8453091813500495, mean abs error: 1.333905877306957


In [32]:
week2_xgb, week2_params = train_base_selection_model(X, y2, base_param_grid, '2017-09-25', categorical_cols)

X-shape:
Fitting 2 folds for each of 486 candidates, totalling 972 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   32.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done 972 out of 972 | elapsed: 19.3min finished


w/ all features... explained variance: 0.8294911522297492, mean abs error: 2.9135911808886314


In [33]:
week4_xgb, week4_params = train_base_selection_model(X, y4, base_param_grid, '2017-09-25', categorical_cols)

X-shape:
Fitting 2 folds for each of 486 candidates, totalling 972 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 13.6min
[Parallel(n_jobs=-1)]: Done 972 out of 972 | elapsed: 18.6min finished


w/ all features... explained variance: 0.7364484919009746, mean abs error: 8.520425225922786


In [34]:
week6_xgb, week6_params = train_base_selection_model(X, y6, base_param_grid, '2017-09-25', categorical_cols)

X-shape:
Fitting 2 folds for each of 486 candidates, totalling 972 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   32.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 15.3min
[Parallel(n_jobs=-1)]: Done 972 out of 972 | elapsed: 20.2min finished


w/ all features... explained variance: 0.6967571869858236, mean abs error: 13.025669800446712


In [35]:
week1_params

{'colsample_bytree': 0.5,
 'learning_rate': 0.01,
 'max_depth': 15,
 'min_child_weight': 45,
 'n_estimators': 150,
 'nthread': 4,
 'objective': 'reg:linear',
 'silent': 1,
 'subsample': 0.5}

In [36]:
week2_params

{'colsample_bytree': 0.5,
 'learning_rate': 0.01,
 'max_depth': 15,
 'min_child_weight': 45,
 'n_estimators': 150,
 'nthread': 4,
 'objective': 'reg:linear',
 'silent': 1,
 'subsample': 0.5}

In [37]:
week4_params

{'colsample_bytree': 0.5,
 'learning_rate': 0.01,
 'max_depth': 15,
 'min_child_weight': 15,
 'n_estimators': 150,
 'nthread': 4,
 'objective': 'reg:linear',
 'silent': 1,
 'subsample': 0.8}

In [38]:
week6_params

{'colsample_bytree': 0.5,
 'learning_rate': 0.01,
 'max_depth': 15,
 'min_child_weight': 15,
 'n_estimators': 150,
 'nthread': 4,
 'objective': 'reg:linear',
 'silent': 1,
 'subsample': 0.65}

In [39]:
fig, ax = plt.subplots(1,1, figsize = (10, 30))
xgboost.plot_importance(week1_xgb, ax=ax)
fig.tight_layout()
fig.savefig('/Users/Rohil/Documents/iGEM/yemen/feature_engineering/xgb_1_week_feature_importance.png')
plt.clf()
plt.close('all')

In [40]:
fig, ax = plt.subplots(1,1, figsize = (10, 30))
xgboost.plot_importance(week2_xgb, ax=ax)
fig.tight_layout()
fig.savefig('/Users/Rohil/Documents/iGEM/yemen/feature_engineering/xgb_2_week_feature_importance.png')
plt.clf()
plt.close('all')

In [41]:
fig, ax = plt.subplots(1,1, figsize = (10, 30))
xgboost.plot_importance(week4_xgb, ax=ax)
fig.tight_layout()
fig.savefig('/Users/Rohil/Documents/iGEM/yemen/feature_engineering/xgb_4_week_feature_importance.png')
plt.clf()
plt.close('all')

In [42]:
fig, ax = plt.subplots(1,1, figsize = (10, 30))
xgboost.plot_importance(week6_xgb, ax=ax)
fig.tight_layout()
fig.savefig('/Users/Rohil/Documents/iGEM/yemen/feature_engineering/xgb_6_week_feature_importance.png')
plt.clf()
plt.close('all')

In [46]:
print ('change' + str(1 + 2))

change3


In [212]:
thresholds = sorted(xgb.feature_importances_, reverse = True)
five_thresh = thresholds[0:100][0::5]

In [235]:

nfeature_gridsearch_df = pd.DataFrame(columns=['threshold', 'n_features', 'best_params', 'explained_variance', 'mae'])

param_grid = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:linear'],
              'learning_rate': [0.01, 0.1], #so called `eta` value
              'max_depth': [13, 26, 40, 55, 70],
              'min_child_weight': [15, 30, 45],
              'silent': [1],
              'subsample': [0.5, 0.65, 0.8],
              'colsample_bytree': [0.5, 0.65, 0.8],
              'n_estimators': [150, 300, 450],
}

for thresh in ten_thresh:
    # select features using threshold
    selection = SelectFromModel(xgb, threshold=thresh, prefit=True)
    
    selection_model = xgboost.XGBRegressor()
    
    grid_search = GridSearchCV(selection_model,
                        param_grid = param_grid,
                        cv = TimeSeriesSplit(n_splits=2).split(X_train),
                        n_jobs = -1,
                        scoring = 'explained_variance',
                        verbose=True)

    select_X_train = selection.transform(X_train)
    
    grid_search.fit(select_X_train, y_train)
    
    # eval model
    select_X_test = selection.transform(X_test)
    y_pred = grid_search.predict(select_X_test)
    score = explained_variance_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    print('Thresh=%s, n=%s, explained variance: %s, mean abs error: %s' % (thresh, select_X_train.shape[1], score, mae))
    
    nfeature_gridsearch_df = nfeature_gridsearch_df.append(pd.Series({'threshold':thresh, 'n_features':select_X_train.shape[1], 'best_params': grid_search.best_params_, 'explained_variance':score, 'mae': mae}), ignore_index = True)
    

Fitting 2 folds for each of 162 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   31.1s
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:   51.9s finished


Thresh=0.052142724, n=1, explained variance: 0.7985755517568307, mean abs error: 1.682402752652966
Fitting 2 folds for each of 162 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   42.7s
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  1.3min finished


Thresh=0.019496046, n=11, explained variance: 0.8387389576458124, mean abs error: 1.3579267749560204
Fitting 2 folds for each of 162 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   52.9s
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  1.6min finished


Thresh=0.0144381095, n=21, explained variance: 0.8122780399947718, mean abs error: 1.8012315756071982
Fitting 2 folds for each of 162 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  1.9min finished


Thresh=0.011955122, n=32, explained variance: 0.8380658757657076, mean abs error: 1.3866316039521775
Fitting 2 folds for each of 162 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  2.1min finished


Thresh=0.009748023, n=41, explained variance: 0.8403762107586134, mean abs error: 1.3753261106950196
Fitting 2 folds for each of 162 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  2.3min finished


Thresh=0.008644473, n=51, explained variance: 0.8117298068041466, mean abs error: 1.842643472954376
Fitting 2 folds for each of 162 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  2.6min finished


Thresh=0.007081111, n=61, explained variance: 0.8089961981600511, mean abs error: 1.8536112718445714
Fitting 2 folds for each of 162 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  2.9min finished


Thresh=0.0036784993, n=71, explained variance: 0.8432242894275473, mean abs error: 1.343718461014256
Fitting 2 folds for each of 162 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  3.1min finished


Thresh=0.0011955122, n=82, explained variance: 0.8433937407437808, mean abs error: 1.363207598570855
Fitting 2 folds for each of 162 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  3.3min finished


Thresh=0.0009196248, n=92, explained variance: 0.8421797044983584, mean abs error: 1.3672899649490031


In [240]:
nfeature_gridsearch_df.to_csv('/Users/Rohil/Documents/iGEM/yemen/feature_engineering/nfeatures_gridsearch_df_broad.csv', index = False)

In [241]:
five_thresh = thresholds[0::5]

In [242]:
nfeature_gridsearch_df = pd.DataFrame(columns=['threshold', 'n_features', 'best_params', 'explained_variance', 'mae'])

param_grid = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:linear'],
              'learning_rate': [0.005, 0.01, 0.05], #so called `eta` value
              'max_depth': [13],
              'min_child_weight': [30, 45],
              'silent': [1],
              'subsample': [0.4, 0.5],
              'colsample_bytree': [0.4, 0.5],
              'n_estimators': [100, 200, 300],
}

for thresh in five_thresh:
    # select features using threshold
    selection = SelectFromModel(xgb, threshold=thresh, prefit=True)
    
    selection_model = xgboost.XGBRegressor()
    
    grid_search = GridSearchCV(selection_model,
                        param_grid = param_grid,
                        cv = TimeSeriesSplit(n_splits=2).split(X_train),
                        n_jobs = -1,
                        scoring = 'explained_variance',
                        verbose=True)

    select_X_train = selection.transform(X_train)
    
    grid_search.fit(select_X_train, y_train)
    
    # eval model
    select_X_test = selection.transform(X_test)
    y_pred = grid_search.predict(select_X_test)
    score = explained_variance_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    print('Thresh=%s, n=%s, explained variance: %s, mean abs error: %s' % (thresh, select_X_train.shape[1], score, mae))
    
    nfeature_gridsearch_df = nfeature_gridsearch_df.append(pd.Series({'threshold':thresh, 'n_features':select_X_train.shape[1], 'best_params': grid_search.best_params_, 'explained_variance':score, 'mae': mae}), ignore_index = True)
    

Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   15.2s finished


Thresh=0.052142724, n=1, explained variance: 0.8335351288788623, mean abs error: 1.3950294505975602
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   17.5s finished


Thresh=0.024278095, n=6, explained variance: 0.8236411588958937, mean abs error: 1.4731335042049563
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   18.3s finished


Thresh=0.019496046, n=11, explained variance: 0.8390105019454293, mean abs error: 1.418869342944347
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   20.2s finished


Thresh=0.01636932, n=16, explained variance: 0.8365654154636647, mean abs error: 1.4134812064287474
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   22.7s finished


Thresh=0.0144381095, n=21, explained variance: 0.8377335509552191, mean abs error: 1.4768799403491468
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   25.6s finished


Thresh=0.013334559, n=26, explained variance: 0.8379547532262961, mean abs error: 1.468241493785251
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   25.3s finished


Thresh=0.011955122, n=32, explained variance: 0.8361640253300077, mean abs error: 1.471165131893177
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   26.5s finished


Thresh=0.010391761, n=37, explained variance: 0.8375258229343377, mean abs error: 1.4069615371674862
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   28.2s finished


Thresh=0.009748023, n=41, explained variance: 0.8365221865578999, mean abs error: 1.4686583466526653
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   28.9s finished


Thresh=0.00928821, n=46, explained variance: 0.839158358420628, mean abs error: 1.423672517378895
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   29.9s finished


Thresh=0.008644473, n=51, explained variance: 0.8398968668968193, mean abs error: 1.409362030855516
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   31.7s finished


Thresh=0.00818466, n=56, explained variance: 0.8394088269466211, mean abs error: 1.445609280982899
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   32.3s finished


Thresh=0.007081111, n=61, explained variance: 0.8422343174073523, mean abs error: 1.4656867768224506
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   33.8s finished


Thresh=0.0060695237, n=66, explained variance: 0.8434019000971056, mean abs error: 1.3950770257338183
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   38.0s finished


Thresh=0.0036784993, n=71, explained variance: 0.8455003625181486, mean abs error: 1.40705940511885
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   36.4s finished


Thresh=0.0016553246, n=76, explained variance: 0.8419938141561748, mean abs error: 1.4660405984939666
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   38.0s finished


Thresh=0.0011955122, n=82, explained variance: 0.8423608243986089, mean abs error: 1.471780743722383
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   39.6s finished


Thresh=0.0010115873, n=87, explained variance: 0.8366921476155748, mean abs error: 1.469125715446875
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   39.5s finished


Thresh=0.0009196248, n=92, explained variance: 0.8393899105136164, mean abs error: 1.4425775422174039
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   42.0s finished


Thresh=0.00073569984, n=98, explained variance: 0.8438677916819434, mean abs error: 1.4192869745351675
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   44.0s finished


Thresh=0.0006437374, n=106, explained variance: 0.8401862769795485, mean abs error: 1.4279932845796572
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   43.2s finished


Thresh=0.0006437374, n=106, explained variance: 0.8401862769795485, mean abs error: 1.4279932845796572
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   44.1s finished


Thresh=0.00055177487, n=111, explained variance: 0.8441855859484749, mean abs error: 1.3977677414862457
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   45.6s finished


Thresh=0.0004598124, n=117, explained variance: 0.8426195616597422, mean abs error: 1.4442475539289055
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   46.5s finished


Thresh=0.00036784992, n=123, explained variance: 0.8452999644520067, mean abs error: 1.3772791376019076
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   50.0s finished


Thresh=0.00027588743, n=135, explained variance: 0.8403074497336995, mean abs error: 1.431986662439098
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   50.1s finished


Thresh=0.00027588743, n=135, explained variance: 0.8403074497336995, mean abs error: 1.431986662439098
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   51.4s finished


Thresh=0.00018392496, n=145, explained variance: 0.8389228753133358, mean abs error: 1.4149864377275878
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   51.4s finished


Thresh=0.00018392496, n=145, explained variance: 0.8389228753133358, mean abs error: 1.4149864377275878
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   54.0s finished


Thresh=9.196248e-05, n=159, explained variance: 0.8404637611621799, mean abs error: 1.4199106765663099
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   54.4s finished


Thresh=9.196248e-05, n=159, explained variance: 0.8404637611621799, mean abs error: 1.4199106765663099
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   54.3s finished


Thresh=9.196248e-05, n=159, explained variance: 0.8404637611621799, mean abs error: 1.4199106765663099
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.9s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:  1.0min finished


Thresh=0.0, n=183, explained variance: 0.8401241066283188, mean abs error: 1.431569199237605
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   59.1s finished


Thresh=0.0, n=183, explained variance: 0.8401241066283188, mean abs error: 1.431569199237605
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.9s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   59.4s finished


Thresh=0.0, n=183, explained variance: 0.8401241066283188, mean abs error: 1.431569199237605
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   59.5s finished


Thresh=0.0, n=183, explained variance: 0.8401241066283188, mean abs error: 1.431569199237605
Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.9s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   59.4s finished


Thresh=0.0, n=183, explained variance: 0.8401241066283188, mean abs error: 1.431569199237605


In [252]:
nfeature_gridsearch_df.to_csv('/Users/Rohil/Documents/iGEM/yemen/feature_engineering/nfeatures_gridsearch_df.csv', index = False)

In [158]:
preds = pd.Series(preds)

In [159]:
preds.index = y_test.index

In [160]:
prediction_df = pd.DataFrame(preds).merge(pd.DataFrame(full_data[['gov_iso', 'date']]), left_index = True, right_index = True)
prediction_df.columns = ['pred', 'gov_iso' ,'date']

pred_crosstab = prediction_df.pivot_table(index = 'date', columns = 'gov_iso', values = 'pred')

In [161]:
pred_crosstab.head()

gov_iso,YE-AB,YE-AD,YE-AM,YE-BA,YE-DA,YE-DH,YE-HD-AL,YE-HJ,YE-HU,YE-IB,...,YE-LA,YE-MA,YE-MR,YE-MW,YE-RA,YE-SA,YE-SD,YE-SH,YE-SN,YE-TA
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-14,20.710947,4.897898,29.631374,10.169798,30.773361,21.468771,1.763795,17.378012,8.233446,5.356342,...,14.901932,4.875543,2.443946,27.455353,11.457116,22.700748,4.003278,1.062638,3.486799,3.953214
2017-08-15,20.583555,4.934886,30.043766,9.911448,28.758907,21.501932,1.720262,18.62438,8.206692,5.742295,...,14.471645,4.805513,2.414942,27.462757,11.740501,25.895361,3.997659,1.190431,3.157129,3.378301
2017-08-16,20.732105,5.1381,29.280275,10.165749,29.152632,21.501354,1.601228,18.985281,8.106464,5.833121,...,12.429628,4.876575,2.514567,25.924494,12.04758,26.815323,3.698807,1.17716,3.233928,4.321141
2017-08-17,21.188223,5.064285,29.029499,15.176261,27.624121,21.160444,1.606588,19.460413,8.119987,5.795109,...,13.946028,4.937258,2.428747,25.438599,12.1623,26.941118,4.514264,1.128298,3.218765,4.857874
2017-08-18,21.926449,5.023776,28.968809,15.482224,27.531481,19.783064,1.544193,20.020969,8.407865,5.705154,...,14.573035,4.978606,2.463442,25.125048,12.514949,27.085239,4.581815,1.080929,3.924248,4.28511


In [162]:
y_test_df = y.drop('date', axis=1).merge(pd.DataFrame(full_data[['gov_iso' ,'date']]), left_index = True, right_index = True)

In [163]:
y_test_crosstab = y_test_df.pivot_table(index = 'date', columns = 'gov_iso', values = 'weekly_cases')

In [164]:
y_test_crosstab.head()

gov_iso,YE-AB,YE-AD,YE-AM,YE-BA,YE-DA,YE-DH,YE-HD-AL,YE-HJ,YE-HU,YE-IB,...,YE-LA,YE-MA,YE-MR,YE-MW,YE-RA,YE-SA,YE-SD,YE-SH,YE-SN,YE-TA
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-05-31,7.853387,13.167803,32.415759,11.652744,16.109842,9.466921,0.0,18.740863,9.986982,8.009573,...,2.625525,2.448999,0.0,23.163472,11.446652,39.580615,0.168074,0.012279,11.477125,5.678411
2017-06-01,8.405887,11.983601,33.929278,12.699393,21.715429,10.586107,0.0,19.486937,11.311516,9.337257,...,3.664075,2.632608,0.0,23.933585,14.303009,42.529852,0.180898,0.039907,12.512657,6.571019
2017-06-02,8.958386,10.7994,35.442796,13.746042,27.321017,11.705293,0.0,20.23301,12.636049,10.664942,...,4.702626,2.816217,0.0,24.703698,17.159365,45.479089,0.193723,0.067536,13.548189,7.463628
2017-06-03,9.510886,9.615199,36.956315,14.792691,32.926605,12.824479,0.0,20.979084,13.960583,11.992626,...,5.741176,2.999826,0.0,25.473811,20.015721,48.428327,0.206548,0.095164,14.583722,8.356236
2017-06-04,9.49419,6.325547,35.491447,13.803081,32.798828,12.447947,0.0,19.014812,13.478453,12.228111,...,6.698759,3.09097,0.0,23.915834,20.636611,46.302799,0.192373,0.092094,13.425218,8.417056


In [165]:
def set_style(color):
    plt.style.use(['seaborn-' + color, 'seaborn-paper'])

In [166]:
def plot_pred_against_actual(pred_crosstab, test_crosstab):
    
    set_style('white')
    
    fig, ax = plt.subplots(21,1,figsize = (6,15), sharex=True)
    
    cols = test_crosstab.columns
    
    for i in range(0,21):
        
        test_crosstab[cols[i]].plot(kind='line', ax = ax[i], label = 'true_val', legend = True, color = 'red')
                            
        pred_crosstab[cols[i]].plot(kind='line', ax = ax[i], label= 'xgb', legend = True, color = 'blue')   
          
        ax[i].legend().set_visible(False)
        ax[i].set_ylabel(cols[i])
        ax[i].yaxis.set_label_position('right')
        ax[i].spines['right'].set_visible(False)
        ax[i].spines['top'].set_visible(False)
        ax[i].spines['bottom'].set_visible(True)
        
    ax[10].legend().set_visible(True)
    ax[10].legend(fontsize=10, loc='center left', bbox_to_anchor=(1.05, 0.5))
 
    fig.subplots_adjust(hspace = .2)    
        
    fig.savefig('/Users/Rohil/Documents/iGEM/yemen/plot_xgb_all_features.png', dpi = 500, bbox_inches = 'tight')
    
    plt.close('all')

In [167]:
plot_pred_against_actual(pred_crosstab, y_test_crosstab)

In [185]:
feature_importance_df = pd.Series(xgb.feature_importances_, index = X_full.drop('date', axis=1).columns).sort_values(ascending = False)