Changes from notebook 2.0

- each city modeled separately
- refit on whole dataset

For reference, check https://github.com/drivendata/benchmarks/blob/master/dengue-benchmark-statsmodels.ipynb

In [None]:
from matplotlib import pyplot as plt
import pandas as pd

In [None]:
from src.features.build_features import load_raw

# Load the raw tables through the project helper. The result is indexed by
# table name below ('features_train', 'labels_train', 'features_test',
# 'submission').
df_all = load_raw()
df_all.keys()  # displayed: the names of the loaded tables

In [None]:
# Peek at the first two rows of the training features.
df_all['features_train'].head(n=2)

In [None]:
# Peek at the first two rows of the training labels.
df_all['labels_train'].head(n=2)

## fillna

In [None]:
# Forward-fill missing feature values separately within each city, so a gap is
# only ever filled from an earlier row of the same city.
for k in ['features_train', 'features_test']:
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    df_all[k] = df_all[k].groupby('city').apply(lambda group: group.ffill())
    # Use logical `not`, not bitwise `~`: on a plain Python bool `~True` is -2,
    # which is truthy, so the old check could pass even with NaNs present.
    # A NaN at the very start of a city (nothing to carry forward) still
    # survives the fill and is caught here.
    assert not pd.isnull(df_all[k]).any().any(), (
        '%s still contains missing values after forward fill' % k)

## train/test split

In [None]:
# Predictor columns, taken from the DrivenData benchmark notebook:
# https://github.com/drivendata/benchmarks/blob/master/dengue-benchmark-statsmodels.ipynb
selected_features = [
    'reanalysis_specific_humidity_g_per_kg',
    'reanalysis_dew_point_temp_k',
    'station_avg_temp_c',
    'station_min_temp_c',
]
# Sanity check: every selected predictor must be a column of the training features.
assert set(selected_features).issubset(df_all['features_train'].columns)

In [None]:
# (rows, columns) of the training features and of the training labels.
tuple(df_all[table].shape for table in ('features_train', 'labels_train'))

In [None]:
def _split_by_city(frame, **group_spec):
    """Split *frame* within each city into (first 3/4 of rows, remaining rows).

    The split is done per city so that both cities are represented in both
    parts. Rows are taken in stored order (assumed chronological — TODO
    confirm against the loader).

    The second part is sized as "everything not in the first part" rather
    than ``n // 4``, so no row is silently dropped when a city's row count
    is not divisible by 4.

    ``group_spec`` is forwarded to ``DataFrame.groupby`` (e.g. ``level='city'``
    or ``by='city'``). Returns a ``(head, tail)`` pair of DataFrames.
    """
    def _n_head(group):
        return group.shape[0] * 3 // 4

    def _take(select):
        # as_index=False makes apply() prepend a positional group level;
        # drop it again so the result keeps the frame's own index.
        return (frame
                .groupby(as_index=False, **group_spec)
                .apply(select)
                .reset_index(level=0, drop=True))

    head = _take(lambda group: group.head(n=_n_head(group)))
    tail = _take(lambda group: group.tail(n=group.shape[0] - _n_head(group)))
    return head, tail


# Features: keep only the selected predictor columns.
x_train, x_test = (part[selected_features]
                   for part in _split_by_city(df_all['features_train'], level='city'))
# Labels: keep only the target column.
y_train, y_test = (part['total_cases']
                   for part in _split_by_city(df_all['labels_train'], by='city'))

x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
# Show the leading training rows of each city.
x_train.groupby('city').head()

In [None]:
# Distinct cities present in the training labels.
set(y_train.index.get_level_values('city'))

## fit

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# City 'sj': negative-binomial GLM of total_cases on the selected features.
# NOTE(review): exog is passed as-is, without sm.add_constant, so no intercept
# term is estimated (a formula-based fit would add one) — confirm this is intended.
sj_exog, sj_endog = x_train.loc['sj'], y_train.loc['sj']
mod1_sj = sm.GLM(endog=sj_endog, exog=sj_exog,
                 family=sm.families.NegativeBinomial()).fit()
mod1_sj.summary()

In [None]:
# City 'iq': negative-binomial GLM of total_cases on the selected features.
# NOTE(review): exog is passed as-is, without sm.add_constant, so no intercept
# term is estimated (a formula-based fit would add one) — confirm this is intended.
iq_exog, iq_endog = x_train.loc['iq'], y_train.loc['iq']
mod1_iq = sm.GLM(endog=iq_endog, exog=iq_exog,
                 family=sm.families.NegativeBinomial()).fit()
mod1_iq.summary()

## predict on test set

In [None]:
from statsmodels.tools import eval_measures
import numpy as np

In [None]:
# (rows, columns) of the held-out feature matrix.
x_test.shape

In [None]:
# Integer Series of zeros with the same index as y_test; filled per city below.
predictions = (y_test * 0).astype('int')

# Each city is predicted by its own model. The label is an integer count, so
# the predicted mean is cast with astype(int), which drops the fractional part
# (truncation, not rounding).
for city, model in (('sj', mod1_sj), ('iq', mod1_iq)):
    predictions.loc[city] = model.predict(x_test.loc[city]).astype(int).values

predictions.loc['sj'].head()

In [None]:
# Mean absolute error on the held-out rows, reported per city.
mae_sj = eval_measures.meanabs(predictions.loc['sj'], y_test.loc['sj'])
mae_iq = eval_measures.meanabs(predictions.loc['iq'], y_test.loc['iq'])
'sj', mae_sj, 'iq', mae_iq

## plot actual vs. predicted on the held-out split

In [None]:
# One figure per city: held-out actual cases against the model's predictions.
for city in ('sj', 'iq'):
    curves = (('actual', y_test.loc[city]),
              ('predicted', predictions.loc[city]))
    for curve_label, series in curves:
        plt.plot(series, label=curve_label)
    plt.title(city)
    plt.legend()
    plt.show()

## fill in the submission template

In [None]:
# Peek at the leading rows of the submission template for city 'sj'.
df_all['submission'].loc['sj'].head()

In [None]:
# Integer Series of zeros indexed like the submission template's 'total_cases'
# column; filled per city below.
predictions = (df_all['submission']['total_cases'] * 0).astype('int')

# NOTE(review): the notebook header says the models are refit on the whole
# dataset, but mod1_sj / mod1_iq are the objects fitted on the training split
# only — confirm whether a refit on all labelled rows was intended here.
# astype(int) drops the fractional part of the predicted mean (truncation).
for city, model in (('sj', mod1_sj), ('iq', mod1_iq)):
    city_features = df_all['features_test'].loc[city, selected_features]
    predictions.loc[city] = model.predict(city_features).astype(int).values

predictions.loc['sj'].head()

In [None]:
# Show the leading predictions of each city.
predictions.groupby(level='city').head()

In [None]:
# Copy of the submission template with 'total_cases' replaced by the
# predictions (aligned on the index); the template itself is left untouched.
submit = df_all['submission'].assign(total_cases=predictions)

In [None]:
# (rows, columns) of the filled-in submission.
submit.shape

In [None]:
# Peek at the leading rows of the filled-in submission.
submit.head()

## plot submission predictions

In [None]:
# notebook 2.0

In [None]:
# Both cities' predicted cases on one figure, plotted against row position.
for city in ('sj', 'iq'):
    city_cases = submit.loc[city, 'total_cases'].values
    plt.plot(city_cases, label=city)

plt.legend()
plt.show()

## Compare to result of notebook 1-...ipynb

In [None]:
# Earlier submission, read from disk.
previous = pd.read_csv('data/interim/1-submission_20180530_092740-score_29.csv')
# Current submission with its index levels turned back into columns so it can
# be joined on them.
current = submit.reset_index()

# Left join keeps every row of the earlier submission; the two 'total_cases'
# columns are told apart by the '_prev' / '_curr' suffixes.
df_prev = previous.merge(current, how='left', on=['city', 'year', 'weekofyear'], suffixes=['_prev', '_curr'])
df_prev = df_prev.set_index(['city', 'week_start_date'])
df_prev.head()

In [None]:
# notebook 2.0

In [None]:
# One figure per city: earlier submission against the current one.
for city in ('sj', 'iq'):
    columns = (('total_cases_prev', 'prev'),
               ('total_cases_curr', 'curr'))
    for column, curve_label in columns:
        plt.plot(df_prev.loc[city, column].values, label=curve_label)
    plt.title(city)
    plt.legend()
    plt.show()

## generate submission file

In [None]:
from src.features.build_features import make_submission

In [None]:
# Pass the filled-in submission, with its index levels restored to ordinary
# columns, to the project helper (presumably writes the submission file —
# see src.features.build_features.make_submission).
make_submission(submit.reset_index())