Changes from 3.1

- predict the first difference (diff) of the target

For reference, check https://github.com/drivendata/benchmarks/blob/master/dengue-benchmark-statsmodels.ipynb

In [None]:
from matplotlib import pyplot as plt
import pandas as pd

In [None]:
from src.features.build_features import load_raw

# Load the raw data through the project helper; the result is looked up by
# string key in the cells below (e.g. 'features_train', 'labels_train').
df_all = load_raw()
# Show which keys are available.
df_all.keys()

In [None]:
# Peek at the first two rows of the training features.
df_all['features_train'].head(n=2)

In [None]:
# Peek at the first two rows of the training labels.
df_all['labels_train'].head(n=2)

## fillna

In [None]:
# Forward-fill missing values within each city so that a gap is never filled
# with a value belonging to the other city.
for k in ['features_train', 'features_test']:
    # group_keys=False keeps the original index: newer pandas versions would
    # otherwise prepend the group key to the result of a like-indexed apply.
    # `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
    df_all[k] = (df_all[k]
                 .groupby('city', group_keys=False)
                 .apply(lambda group: group.ffill())
                )
    # Use `not`, never `~`: bitwise inversion of a plain Python bool gives
    # -1 or -2, both truthy, so the check could silently never fail.
    assert not pd.isnull(df_all[k]).any().any()
    print(df_all[k].shape)

## append differenced features (without seasonality)

In [None]:
# Number of periods used for differencing; the first n_diff rows of every
# city have no difference and are dropped from both halves below.
n_diff = 1
for k in ['features_train', 'features_test']:
    # Differenced copy of every column, computed per city so the first row
    # of one city is never differenced against the last row of the other.
    temp_no = (df_all[k]
               .groupby('city', as_index=False)
               .apply(lambda group: group.diff(periods=n_diff).iloc[n_diff:])
               .reset_index(level=0, drop=True)
               .add_suffix('_diff')
              )
    # Use `not`, never `~`: bitwise inversion of a plain Python bool gives
    # -1 or -2, both truthy, so the check could silently never fail.
    assert not pd.isnull(temp_no).any().any()

    # Original columns, trimmed to the same rows as the differenced copy.
    temp_yes = (df_all[k]
               .groupby('city', as_index=False)
               .apply(lambda group: group.iloc[n_diff:])
               .reset_index(level=0, drop=True)
              )

    # Side by side: original columns followed by their '<name>_diff' twins.
    df_all[k] = pd.concat([temp_yes, temp_no], axis=1)
    print(df_all[k].shape)


In [None]:
# Re-check that the combined feature frames contain no missing values.
for k in ['features_train', 'features_test']:
    # Use `not`, never `~`: bitwise inversion of a plain Python bool gives
    # -1 or -2, both truthy, so the check could silently never fail.
    assert not pd.isnull(df_all[k]).any().any()

In [None]:
# The features lost their first n_diff rows per city when they were
# differenced; remove the same leading rows from the labels.
for k in ['labels_train']:
    trimmed = df_all[k].groupby('city', as_index=False).apply(
        lambda group: group.iloc[n_diff:]
    )
    # Level 0 is the helper level added by the apply; discard it.
    df_all[k] = trimmed.reset_index(level=0, drop=True)
    print(df_all[k].shape)


In [None]:
# Show the first five label rows of each city.
df_all['labels_train'].groupby('city').head(n=5)

## train/test split

In [None]:
# Alternative, smaller feature set taken from the DrivenData benchmark
# (https://github.com/drivendata/benchmarks/blob/master/dengue-benchmark-statsmodels.ipynb),
# kept here for reference:
#selected_features = ['reanalysis_specific_humidity_g_per_kg', 
#                 'reanalysis_dew_point_temp_k', 
#                 'station_avg_temp_c', 
#                 'station_min_temp_c']

# Currently every column of the training features is used.
selected_features = df_all['features_train'].columns

# Guard against typos when a hand-written list is used instead: each selected
# name has to exist as a column.
assert set(selected_features).issubset(set(df_all['features_train'].columns))


In [None]:
# Compare the shapes of the feature and label frames before splitting.
df_all['features_train'].shape, df_all['labels_train'].shape

In [None]:
# Split every city on its own so both cities are represented in both
# partitions (presumably rows are in time order within a city, which would
# make this a chronological split -- TODO confirm).
def _train_rows(group):
    """Return the leading three quarters of one city's rows."""
    return group.iloc[:group.shape[0] * 3 // 4]


def _holdout_rows(group):
    """Return every row after the training cut.

    Slicing from the cut (instead of taking the last ``n // 4`` rows) keeps
    the row that was otherwise lost whenever ``n`` is not a multiple of 4.
    """
    return group.iloc[group.shape[0] * 3 // 4:]


x_train = (df_all['features_train']
          .groupby(level='city', as_index=False)
          .apply(_train_rows)
          .reset_index(level=0, drop=True)
          [selected_features]
          )
x_test = (df_all['features_train']
          .groupby(level='city', as_index=False)
          .apply(_holdout_rows)
          .reset_index(level=0, drop=True)
          [selected_features]
         )
y_train = (df_all['labels_train']
          .groupby('city', as_index=False)
          .apply(_train_rows)
          .reset_index(level=0, drop=True)
          ['total_cases']
         )
y_test = (df_all['labels_train']
          .groupby('city', as_index=False)
          .apply(_holdout_rows)
          .reset_index(level=0, drop=True)
          ['total_cases']
         )

x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
# First two training rows of each city.
x_train.groupby('city').head(n=2)

In [None]:
# First two hold-out rows of each city.
x_test.groupby('city').head(n=2)

In [None]:
# Distinct cities present in the training target.
set(y_train.reset_index()['city'])

## features = 5-pt rolling avg(features)

In [None]:
# Plot the 'ndvi_ne' training feature, one line per city.
x_train['ndvi_ne'].groupby('city').plot()
plt.legend()
plt.show()

In [None]:
def my_rollingmean(df):
    """Return the centred 5-point rolling mean of ``df`` along the rows.

    The two positions at each end have no complete window; their NaN
    results are replaced by 0.

    Args:
        df: A pandas Series or DataFrame.

    Returns:
        An object of the same shape and index as ``df``.
    """
    # Rows are the default rolling axis; the explicit `axis` keyword is
    # deprecated in recent pandas releases, so it is not passed.
    return df.rolling(window=5, center=True).mean().fillna(value=0)

# Smooth every feature per city so a window never spans two cities.
# group_keys=False keeps the original index: newer pandas versions would
# otherwise prepend an extra level to the result of a like-indexed apply,
# and the later `x_train.loc['sj']` lookups need 'city' as the first level.
x_train = (x_train
          .groupby(level='city', group_keys=False)
          .apply(my_rollingmean)
          )
x_test = (x_test
          .groupby(level='city', group_keys=False)
          .apply(my_rollingmean)
         )

In [None]:
# Plot 'ndvi_ne' again, one line per city, to inspect the effect of the
# rolling mean applied above.
x_train['ndvi_ne'].groupby('city').plot()
plt.legend()
plt.show()

## target = diff(mavg(target))

In [None]:
# Plot the training target, one line per city, before any transformation.
y_train.groupby('city').plot()
plt.legend()
plt.show()

In [None]:
# Smooth the target per city, exactly like the features. One rolling window
# over the whole Series would average the last rows of one city with the
# first rows of the next. group_keys=False keeps the original index.
y_train = y_train.groupby(level='city', group_keys=False).apply(my_rollingmean)
y_test = y_test.groupby(level='city', group_keys=False).apply(my_rollingmean)

In [None]:
# Plot the training target per city after the rolling mean.
y_train.groupby('city').plot()
plt.legend()
plt.show()

In [None]:
# First difference of the target, taken per city. A plain `.diff()` over the
# whole Series would subtract the last value of one city from the first value
# of the next. The first row of each city has no predecessor and becomes 0.
y_train = y_train.groupby(level='city').diff().fillna(value=0)
y_test = y_test.groupby(level='city').diff().fillna(value=0)

In [None]:
# Plot the training target per city after differencing.
y_train.groupby('city').plot()
plt.legend()
plt.show()

## fit RF

In [None]:
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
def create_model(random_state=None):
    """Return an unfitted random forest with this notebook's hyper-parameters.

    Args:
        random_state: Optional seed forwarded to the estimator. Pass an int
            to make repeated fits reproducible; the default ``None`` keeps
            the previous unseeded behaviour.

    Returns:
        A new ``RandomForestRegressor`` instance.
    """
    return RandomForestRegressor(
        n_estimators=100,
        min_samples_split=5,
        min_samples_leaf=3,
        random_state=random_state,
    )


# Fit one independent model per city, each on that city's rows only.
mod1_sj = create_model()
mod1_sj.fit(X = x_train.loc['sj'], y = y_train.loc['sj'])
mod1_iq = create_model()
mod1_iq.fit(X = x_train.loc['iq'], y = y_train.loc['iq'])

## check feature importances

In [None]:
#feat_imp = mod1_sj.feature_importances_
#feat_imp.sort()
#feat_imp, mod1_sj.feature_importances_, mod1_sj.feature_importances_.argsort(), 
# Ten largest importance values of the 'sj' model, in ascending order.
xxx = mod1_sj.feature_importances_
# In-place ascending sort of the array fetched above.
xxx.sort()
xxx[-10:]

In [None]:
# Names of the ten most important features of the 'sj' model (argsort is
# ascending, so the most important one comes last).
selected_features[mod1_sj.feature_importances_.argsort()[-10:]]

In [None]:
# Ten largest importance values of the 'iq' model, in ascending order.
xxx = mod1_iq.feature_importances_
# In-place ascending sort of the array fetched above.
xxx.sort()
xxx[-10:]

In [None]:
# Names of the ten most important features of the 'iq' model (argsort is
# ascending, so the most important one comes last).
selected_features[mod1_iq.feature_importances_.argsort()[-10:]]

## predict on test set

In [None]:
# Start from an all-zero integer Series that shares y_test's index.
predictions = (y_test.copy()*0).astype('int')

# Round to the nearest integer before casting. `astype(int)` on its own
# truncates toward zero, which turns every prediction with magnitude below 1
# into 0 and biases the rest toward zero.
predictions.loc['sj'] = mod1_sj.predict(x_test.loc['sj']).round().astype(int)
predictions.loc['iq'] = mod1_iq.predict(x_test.loc['iq']).round().astype(int)

# NOTE(review): the models are fitted on a differenced target, so these values
# are per-step changes rather than case counts. The per-city cumulative sum
# sketched below would be one way to turn them back into levels -- confirm
# the intended post-processing.
#predictions = (pd.DataFrame(predictions)
#               .groupby('city', as_index=False)
#               .apply(lambda group: group.cumsum())
#               #.reset_index(level=0, drop=True)
#              )

predictions.loc['sj'].head()

In [None]:
# Hold-out score of each city's model (for scikit-learn regressors `score`
# is the coefficient of determination, R^2).
'sj', mod1_sj.score(x_test.loc['sj'], y_test.loc['sj']), 'iq', mod1_iq.score(x_test.loc['iq'], y_test.loc['iq'])

## plot

In [None]:
# One figure per city: hold-out target against the model's prediction.
for city in ('sj', 'iq'):
    curves = (
        ('actual', y_test.loc[city]),
        ('predicted', predictions.loc[city]),
    )
    for curve_label, curve in curves:
        plt.plot(curve, label=curve_label)
    plt.title(city)
    plt.legend()
    plt.show()

## re-fit on complete dataset

In [None]:
# Re-fit on every available training row, using the same preprocessing as
# the validated models above.
x_retrain = df_all['features_train'][selected_features]
y_retrain = df_all['labels_train']['total_cases']


def my_rollingmean(df):
    """Return the centred 5-point rolling mean of ``df`` along the rows.

    Positions without a complete window (two at each end) are set to 0.
    """
    # Rows are the default rolling axis; the explicit `axis` keyword is
    # deprecated in recent pandas releases, so it is not passed.
    return df.rolling(window=5, center=True).mean().fillna(value=0)


# Per-city smoothing of the features. group_keys=False keeps the original
# index so the `.loc['sj']` / `.loc['iq']` lookups below keep working on
# pandas versions that would otherwise prepend an extra index level.
x_retrain = (x_retrain
          .groupby(level='city', group_keys=False)
          .apply(my_rollingmean)
          )

# Target = diff(mavg(target)), built per city. Smoothing was previously
# skipped here although the validated models were trained on the smoothed
# target, and an ungrouped `.diff()` would subtract the last value of one
# city from the first value of the next.
y_retrain = y_retrain.groupby(level='city', group_keys=False).apply(my_rollingmean)
y_retrain = y_retrain.groupby(level='city').diff().fillna(value=0)


mod1_sj = create_model()
mod1_sj.fit(X = x_retrain.loc['sj'], y = y_retrain.loc['sj'])
mod1_iq = create_model()
mod1_iq.fit(X = x_retrain.loc['iq'], y = y_retrain.loc['iq'])

## set in submission

In [None]:
# Peek at the 'sj' rows of the submission template.
df_all['submission'].loc['sj'].head()

In [None]:
# All-zero integer frame shaped like the submission template, minus the first
# n_diff rows of each city (the test features lost those rows when they were
# differenced).
predictions = (df_all['submission'][['total_cases']]
               .groupby(level='city', as_index=False)
               .apply(lambda group: group.iloc[n_diff:])
               .reset_index(level=0, drop=True)
               .copy()
               *0
              ).astype('int')

# NOTE(review): the models predict a differenced target, so the values stored
# in 'total_cases' here are per-step changes, not case counts -- confirm
# whether they should be accumulated before submission.
for city, model in (('sj', mod1_sj), ('iq', mod1_iq)):
    # The models were fitted on rolling-mean features, so the test features
    # need the same smoothing. Selecting one city first keeps the window
    # inside that city.
    city_features = my_rollingmean(df_all['features_test'].loc[city, selected_features])
    # Round before casting: `astype(int)` alone truncates toward zero.
    predictions.loc[city, 'total_cases'] = model.predict(city_features).round().astype(int)

In [None]:
# First two predicted rows of each city.
predictions.groupby(level='city').head(n=2)

In [None]:
submit = df_all['submission'].copy()
# Swap the placeholder column for the predictions with an explicit join on
# the index, so rows are matched by label rather than by position.
del submit['total_cases']

submit = submit.merge(
    predictions,
    left_index=True,
    right_index=True,
    how='left'
)
# Template rows without a prediction (the first n_diff rows of each city)
# come out of the left join as NaN, which also turns the column into floats.
# Fill them with 0 and restore the integer dtype.
submit['total_cases'] = submit['total_cases'].fillna(value=0).astype(int)

In [None]:
# Shape of the frame that will be submitted.
submit.shape

In [None]:
# First two submission rows of each city.
submit.groupby('city').head(n=2)

## plot

In [None]:
# Draw both cities' submitted values on one axis; passing `.values` plots
# them against row position instead of the index.
for city in ('sj', 'iq'):
    city_cases = submit.loc[city, 'total_cases']
    plt.plot(city_cases.values, label=city)

plt.legend()
plt.show()

## generate submission file

In [None]:
from src.features.build_features import make_submission

In [None]:
# Hand the frame to the project helper, with the index levels turned back
# into ordinary columns.
make_submission(submit.reset_index())