Change from 3.1

- add a classification step (`is_epidemic`) ahead of the regression on `total_cases`

For reference, check https://github.com/drivendata/benchmarks/blob/master/dengue-benchmark-statsmodels.ipynb

In [None]:
from matplotlib import pyplot as plt
import pandas as pd

In [None]:
# Project-local loader. Its result is used below as a dict-like of DataFrames
# keyed by 'features_train', 'features_test', 'labels_train' and 'submission'.
from src.features.build_features import load_raw

df_all = load_raw()
# Show which tables were loaded.
df_all.keys()

In [None]:
# Peek at the first two rows of the training features.
df_all['features_train'].head(n=2)

In [None]:
# Peek at the first two rows of the training labels.
df_all['labels_train'].head(n=2)

## fillna

In [None]:
# Forward-fill missing feature values separately per city, so a gap in one
# city's series is never filled with a value from the other city.
for k in ['features_train', 'features_test']:
    # GroupBy.ffill keeps the original index and row order and replaces the
    # deprecated fillna(method='ffill') wrapped in groupby.apply.
    df_all[k] = df_all[k].groupby(level='city').ffill()
    # `not` instead of `~`: on a plain Python bool `~True` is -2 (truthy), so
    # the check could pass even when NaNs remained (e.g. leading NaNs that a
    # forward fill cannot reach).
    assert not pd.isnull(df_all[k]).any().any(), '{} still contains NaN after ffill'.format(k)
    print(df_all[k].shape)

## append without seasonality

## features = rolling mean(features)

In [None]:
def my_rollingmean(df, window=5):
    """Trailing rolling mean of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows are averaged in their existing order.
    window : int, default 5
        Number of consecutive rows in each average.

    Returns
    -------
    pandas.DataFrame
        Same shape and index as ``df``.  Positions where the rolling mean is
        undefined (e.g. the first ``window - 1`` rows, which have no complete
        window) are filled with 0 instead of NaN.
    """
    # Rolling over rows is the default; the explicit `axis=0` keyword is
    # deprecated in recent pandas, so it is omitted.
    return df.rolling(window=window, center=False).mean().fillna(value=0)


In [None]:
# Work on copies so the tables held in df_all stay untouched.
df_all2 = {name: df_all[name].copy() for name in df_all.keys()}

# Replace each feature table by its per-city rolling mean.
for name in ('features_train', 'features_test'):
    per_city = df_all2[name].groupby(level='city', as_index=False)
    # (a trailing .reset_index(level=0, drop=True) was tried here and left disabled)
    df_all2[name] = per_city.apply(my_rollingmean)

## convert target to categories

- no epidemic
- yes epidemic

In [None]:
# FIXME use correlation with a sample signal
# Provisional per-week flag: True once a week reaches 20 cases.
weekly_cases = df_all2['labels_train']['total_cases']
df_all2['labels_train']['is_epidemic'] = weekly_cases >= 20

def require_consecutive(series, window=5, min_hits=4):
    """Keep an epidemic flag only where it is sustained within its city.

    Parameters
    ----------
    series : pandas.Series
        Boolean (or 0/1) flags whose index has a level named 'city'.
    window : int, default 5
        Width, in rows, of the centred window examined around each row.
    min_hits : int, default 4
        Minimum number of flagged rows required inside the window.

    Returns
    -------
    pandas.Series
        Boolean series on the same index as ``series``.  Rows too close to the
        start or end of a city's series to have a complete window are False.
    """
    def _sustained(group):
        # Count hits instead of testing `mean > 0.8`: 4/5 equals 0.8 and was
        # therefore rejected, so the old test silently demanded 5/5 despite
        # the stated 4-out-of-5 rule.  Incomplete windows give NaN, and
        # NaN >= min_hits evaluates to False.
        return group.rolling(window=window, center=True).sum() >= min_hits

    return (
        series
        .astype('int')
        # group_keys=False keeps the original index rather than prepending an
        # extra 'city' level, so the result can be assigned straight back.
        .groupby(level='city', group_keys=False)
        .apply(_sustained)
    )

# Overwrite the per-week flag with the windowed version computed per city by
# require_consecutive.
df_all2['labels_train']['is_epidemic'] = require_consecutive(df_all2['labels_train']['is_epidemic'])
# df_all2['labels_train'].groupby(['city','is_epidemic']).head(n=2)

## what is the average non-epidemic count?

## target = diff(rolling mean(target))

## train/test split

In [None]:
# Alternative: the subset used by the DrivenData benchmark notebook
# https://github.com/drivendata/benchmarks/blob/master/dengue-benchmark-statsmodels.ipynb
#selected_features = ['reanalysis_specific_humidity_g_per_kg',
#                 'reanalysis_dew_point_temp_k',
#                 'station_avg_temp_c',
#                 'station_min_temp_c']

# Currently every available column is used.
available_columns = df_all2['features_train'].columns
selected_features = available_columns

# Guard against selecting a column that does not exist in the feature table.
unknown_features = set(selected_features).difference(available_columns)
assert len(unknown_features) == 0


In [None]:
# Shapes of the feature and label tables before splitting.
df_all2['features_train'].shape, df_all2['labels_train'].shape

In [None]:
# Hold-out split done per city, so both cities appear in both parts (avoids
# a split dominated by one city): the leading 3/4 of each city's rows are
# used for training and the remaining rows for testing.
TRAIN_NUMERATOR, TRAIN_DENOMINATOR = 3, 4


def _split_per_city(grouped):
    """Split every group of ``grouped`` at 3/4 of its length.

    ``grouped`` is a DataFrameGroupBy created with ``as_index=False``.  Returns
    ``(train, test)``; the outermost index level of each apply result is
    dropped again, exactly as the original per-variable pipelines did.

    Both parts are cut at the same row position, so they never overlap and
    never lose rows.  The earlier head(3n//4) / tail(n//4) pair dropped rows
    whenever a group's length was not a multiple of 4.
    """
    def _cut(group):
        return group.shape[0] * TRAIN_NUMERATOR // TRAIN_DENOMINATOR

    leading = grouped.apply(lambda group: group.iloc[:_cut(group)])
    trailing = grouped.apply(lambda group: group.iloc[_cut(group):])
    return (leading.reset_index(level=0, drop=True),
            trailing.reset_index(level=0, drop=True))


features_fit, features_holdout = _split_per_city(
    df_all2['features_train'].groupby(level='city', as_index=False))
labels_fit, labels_holdout = _split_per_city(
    df_all2['labels_train'].groupby('city', as_index=False))

x_train, x_test = features_fit[selected_features], features_holdout[selected_features]
y_train, y_test = labels_fit['total_cases'], labels_holdout['total_cases']

# auxiliary input
z_train, z_test = labels_fit['is_epidemic'], labels_holdout['is_epidemic']

x_train.shape, x_test.shape, y_train.shape, y_test.shape, z_train.shape, z_test.shape

In [None]:
# First two training rows of each city.
x_train.groupby('city').head(n=2)

In [None]:
# First two hold-out rows of each city.
x_test.groupby('city').head(n=2)

In [None]:
# Distinct city codes present in the training labels.
set(y_train.reset_index()['city'])

## fit RF1 on `is_epidemic`

In [None]:
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [None]:
def create_model_classifier(random_state=0):
    """Return a new, unfitted random-forest classifier.

    ``random_state`` seeds the forest so that a fresh Restart & Run All
    reproduces the same model; pass ``None`` for the previous unseeded
    behaviour.
    """
    return RandomForestClassifier(n_estimators=100, min_samples_split=5,
                                  min_samples_leaf=3, random_state=random_state)

def create_model_regressor(random_state=0):
    """Return a new, unfitted random-forest regressor.

    ``random_state`` seeds the forest so that a fresh Restart & Run All
    reproduces the same model; pass ``None`` for the previous unseeded
    behaviour.
    """
    return RandomForestRegressor(n_estimators=100, min_samples_split=5,
                                 min_samples_leaf=3, random_state=random_state)

# One classifier per city, each fitted on that city's rows only, with the
# is_epidemic flag as target.
mod1_sj, mod1_iq = create_model_classifier(), create_model_classifier()
mod1_sj.fit(X=x_train.loc['sj'], y=z_train.loc['sj'])
mod1_iq.fit(X=x_train.loc['iq'], y=z_train.loc['iq'])

## predict on test set

In [None]:
# Zero-filled integer series on the hold-out index, filled in city by city.
# (It inherits y_test's name although it holds is_epidemic predictions.)
predictions = y_test.mul(0).astype('int')

for city_code, classifier in (('sj', mod1_sj), ('iq', mod1_iq)):
    predictions.loc[city_code] = classifier.predict(x_test.loc[city_code]).astype(int)

#predictions = require_consecutive(predictions)

In [None]:
# First few predicted flags for 'iq'.
predictions.loc['iq'].head()

## plot

In [None]:
# One figure per city.  The actual flags are shifted up by 2 so the two
# series are drawn apart instead of on top of each other.
for city in ('sj', 'iq'):
    actual_shifted = z_test.loc[city] + 2
    predicted_flags = predictions.loc[city]
    plt.plot(actual_shifted, label='actual')
    plt.plot(predicted_flags, label='predicted')
    plt.title(city)
    plt.legend()
    plt.show()

## confusion matrix
Counts of actual vs. predicted `is_epidemic` labels on the hold-out rows, per city.

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the actual flags, columns the predicted ones.
for city_code in ('sj', 'iq'):
    city_matrix = confusion_matrix(z_test.loc[city_code], predictions.loc[city_code])
    print(city_code)
    print(city_matrix)

## check feature importances

In [None]:
# Importance scores of the 'sj' classifier in ascending order (values only;
# the matching feature names are listed in the next cell).
sj_clf_importances = mod1_sj.feature_importances_
sj_clf_importances.sort()
sj_clf_importances

In [None]:
# Feature names ordered by increasing importance in the 'sj' classifier.
selected_features[mod1_sj.feature_importances_.argsort()] # [-10:]

In [None]:
# Ten largest importance scores of the 'iq' classifier, ascending.
iq_clf_importances = mod1_iq.feature_importances_
iq_clf_importances.sort()
iq_clf_importances[-10:]

In [None]:
# Names of the ten most important features of the 'iq' classifier, ascending.
selected_features[mod1_iq.feature_importances_.argsort()[-10:]]

## Append `is_epidemic` to features

In [None]:
# Training rows get the is_epidemic flag computed from the known case counts.
x_train2 = pd.concat([x_train, z_train], axis=1)

# Hold-out rows get the classifier's *predicted* flag.  z_test is derived
# from the very total_cases values the regressor is asked to predict, so
# feeding it in leaks the target into the evaluation; it is also not
# available for the competition test set, where the predicted flag is used.
predicted_flag = predictions.astype(bool).rename('is_epidemic')
x_test2 = pd.concat([x_test, predicted_flag], axis=1)

# ignore is_epidemic
#x_train2 = x_train.copy()
#x_test2 = x_test.copy()

## train RF2 on `total_cases`

In [None]:
# One regressor per city on the augmented features, with total_cases as target.
mod2_sj, mod2_iq = create_model_regressor(), create_model_regressor()
mod2_sj.fit(X=x_train2.loc['sj'], y=y_train.loc['sj'])
mod2_iq.fit(X=x_train2.loc['iq'], y=y_train.loc['iq'])

In [None]:
# Importance scores of the 'sj' regressor in ascending order.
sj_reg_importances = mod2_sj.feature_importances_
sj_reg_importances.sort()
sj_reg_importances

In [None]:
# Column names ordered by increasing importance in the 'sj' regressor.
x_train2.columns[mod2_sj.feature_importances_.argsort()]

In [None]:
# Importance scores of the 'iq' regressor in ascending order.
iq_reg_importances = mod2_iq.feature_importances_
iq_reg_importances.sort()
iq_reg_importances

In [None]:
# Column names ordered by increasing importance in the 'iq' regressor.
x_train2.columns[mod2_iq.feature_importances_.argsort()]

## predict on test set

In [None]:
# The regressor returns fractional values, but case counts are whole numbers.
predictions2 = y_test.mul(0).astype('int')

for city_code, regressor in (('sj', mod2_sj), ('iq', mod2_iq)):
    # Round to the nearest integer before casting: a bare astype(int)
    # truncates towards zero and biases every prediction low.
    predictions2.loc[city_code] = (
        regressor.predict(x_test2.loc[city_code]).round().astype(int)
    )

#predictions2 = require_consecutive(predictions2)

predictions2.loc['sj'].head()

In [None]:
# Score each city's own regressor on that city's hold-out rows.
# (mod2_sj used to be scored on both cities, so the 'iq' figure described the
# wrong model.)
[(city, model.score(x_test2.loc[city], y_test.loc[city]))
 for city, model in [('sj', mod2_sj), ('iq', mod2_iq)]]

## plot

In [None]:
# One figure per city: hold-out case counts against the regressor's output.
for city in ('sj', 'iq'):
    actual_cases = y_test.loc[city]
    predicted_cases = predictions2.loc[city]
    plt.plot(actual_cases, label='actual')
    plt.plot(predicted_cases, label='predicted')
    plt.title(city)
    plt.legend()
    plt.show()

## re-fit on complete dataset

In [None]:
# Full training tables (no hold-out) for the final models.
x_retrain = df_all2['features_train'] # [selected_features]
labels_full = df_all2['labels_train']
y_retrain = labels_full['total_cases']
z_retrain = labels_full['is_epidemic']

# Stage 1: per-city classifiers for is_epidemic.
mod3_sj, mod3_iq = create_model_classifier(), create_model_classifier()
mod3_sj.fit(X=x_retrain.loc['sj'], y=z_retrain.loc['sj'])
mod3_iq.fit(X=x_retrain.loc['iq'], y=z_retrain.loc['iq'])

# Stage 2 inputs: features plus the calculated is_epidemic flag.
x_retrain2 = pd.concat([x_retrain, z_retrain], axis=1)

# Stage 2: per-city regressors for total_cases.
mod4_sj, mod4_iq = create_model_regressor(), create_model_regressor()
mod4_sj.fit(X=x_retrain2.loc['sj'], y=y_retrain.loc['sj'])
mod4_iq.fit(X=x_retrain2.loc['iq'], y=y_retrain.loc['iq'])


## set in submission

In [None]:
# Zero-filled integer frame on the submission index, to be filled per city.
# The former identity groupby/apply (apparently left over from a disabled
# `.iloc[n_diff:]` trimming step) did no work and made the resulting index
# depend on how the installed pandas handles group keys, so the frame is now
# built directly.
predictions3 = df_all2['submission'][['total_cases']].mul(0).astype('int')

# NOTE: this rebinds x_test from the hold-out split to the competition test
# features; re-run the train/test split cell before evaluating on the hold-out
# again.
x_test = df_all2['features_test']

#x_test.groupby('city').size()

In [None]:
# Read the competition test features directly, so this cell does not depend
# on which table the names x_test / x_test2 currently point at.
x_submit = df_all2['features_test']

# Start from an integer column: the 0/1 predictions written below would
# otherwise be assigned into a bool-initialised column and mix dtypes.
predictions3['is_epidemic'] = 0
predictions3.loc['sj', 'is_epidemic'] = mod3_sj.predict(x_submit.loc['sj', :]).astype(int) # selected_features
predictions3.loc['iq', 'is_epidemic'] = mod3_iq.predict(x_submit.loc['iq', :]).astype(int) # selected_features

# The regressors receive the *predicted* is_epidemic flag, since no true
# labels exist for these rows.
x_submit2 = pd.concat([x_submit, predictions3['is_epidemic']], axis=1)

# Round to the nearest whole case before casting; astype(int) alone truncates.
predictions3.loc['sj', 'total_cases'] = mod4_sj.predict(x_submit2.loc['sj', :]).round().astype(int) # selected_features
predictions3.loc['iq', 'total_cases'] = mod4_iq.predict(x_submit2.loc['iq', :]).round().astype(int) # selected_features

#predictions3['total_cases'] = predictions3['is_epidemic'].apply(lambda x: 50 if x==1 else 10)

In [None]:
# Swap the placeholder total_cases column for the model output.  The left
# join on the index keeps every submission row; rows without a matching
# prediction fall back to 0 cases.
# TODO Will this match indices properly?
submission_keys = df_all2['submission'].drop('total_cases', axis=1)
submit = submission_keys.merge(
    predictions3,
    how='left',
    left_index=True,
    right_index=True,
)
submit['total_cases'] = submit['total_cases'].fillna(value=0)

In [None]:
# Row/column count of the final submission table.
submit.shape

In [None]:
# First two submission rows of each city.
submit.groupby('city').head(n=2)

## plot

In [None]:
# Overlay both cities' submitted case counts, plotted against row position.
for city_code in ('sj', 'iq'):
    city_cases = submit.loc[city_code, 'total_cases'].values
    plt.plot(city_cases, label=city_code)

plt.legend()
plt.show()

## generate submission file

In [None]:
from src.features.build_features import make_submission

In [None]:
# Pass the table to the project's make_submission helper with the index
# levels turned back into ordinary columns.
# NOTE(review): the is_epidemic column merged in from predictions3 is still
# present here; confirm make_submission ignores or drops it.
make_submission(submit.reset_index())