Change from 3.0

- append diff of features for seasonality

For reference, check https://github.com/drivendata/benchmarks/blob/master/dengue-benchmark-statsmodels.ipynb

In [None]:
from matplotlib import pyplot as plt
import pandas as pd

In [None]:
from src.features.build_features import load_raw

# Load the raw tables through the project helper; the result is indexed by
# table name below ('labels_train', 'features_train', ...).
df_all = load_raw()

# replace with 0.2 output
# The training labels are swapped for the pickled frame read here.
# NOTE(review): read_pickle executes arbitrary code on load; only use it on
# files produced by this project.
df_all['labels_train'] = pd.read_pickle('data/processed/is_epidemic.pkl')

# Show which tables are available.
df_all.keys()

In [None]:
# Peek at the first two rows of the training features.
df_all['features_train'].head(2)

In [None]:
# Peek at the first two rows of the training labels.
df_all['labels_train'].head(2)

## fillna

In [None]:
# Forward-fill missing values separately within each city so that values
# never leak from one city's series into the other's.
for k in ['features_train', 'features_test']:
    # group_keys=False keeps the original index instead of prepending the
    # group key as an extra level.
    df_all[k] = (
        df_all[k]
        .groupby('city', group_keys=False)
        .apply(lambda group: group.ffill())
    )
    # `not` instead of `~`: bitwise inversion of a plain Python bool yields
    # a truthy integer, which would make this check pass unconditionally.
    assert not pd.isnull(df_all[k]).any().any()
    print(df_all[k].shape)

## replace with seasonality decomposition

In [None]:
# http://www.statsmodels.org/stable/generated/statsmodels.tsa.seasonal.seasonal_decompose.html#statsmodels.tsa.seasonal.seasonal_decompose
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

def _seasonal_parts(series, period):
    """Run an additive one-sided seasonal decomposition on ``series``."""
    try:
        # Newer statsmodels releases name the cycle-length argument `period`.
        return seasonal_decompose(series, period=period, two_sided=False)
    except TypeError:
        # Older releases only accept the original `freq` keyword.
        return seasonal_decompose(series, freq=period, two_sided=False)


def deseason(kkk, cities=('sj', 'iq'), period=52, plot=True):
    """Split every feature of ``df_all[kkk]`` into trend/seasonal/resid parts.

    For each city and each column other than ``year`` and ``weekofyear`` the
    series is mean-centred and decomposed. The output holds four columns per
    feature: ``<feature>_original`` (the un-centred input), ``<feature>_trend``,
    ``<feature>_seasonal`` and ``<feature>_resid``.

    Args:
        kkk: key into ``df_all``, e.g. ``'features_train'``.
        cities: city labels to process, looked up on the first index level.
        period: number of observations per seasonal cycle.
        plot: when true, show one decomposition plot per city/feature pair.

    Returns:
        A DataFrame indexed by ``['city', 'week_start_date']`` in which every
        row that contains a null in any column has been dropped.
    """
    raw = df_all[kkk]
    # Preserve the frame's own column order; a set difference would make the
    # output column order vary from run to run.
    selected_features = [
        col for col in raw.columns if col not in ('year', 'weekofyear')
    ]

    per_city = []
    for city in cities:
        per_feature = []
        for feature in selected_features:
            original = raw.loc[city, feature]
            centred = original - original.mean(axis=0)
            decomposition = _seasonal_parts(centred, period)
            parts = pd.DataFrame({
                'trend': decomposition.trend,
                'seasonal': decomposition.seasonal,
                'resid': decomposition.resid,
            })

            if plot:
                parts.plot()
                plt.title("%s: %s" % (city, feature))
                plt.show()

            # Added after plotting so the figure shows only the components.
            parts['original'] = original
            parts = parts.rename(columns={
                'original': "%s_original" % feature,
                'trend': "%s_trend" % feature,
                'seasonal': "%s_seasonal" % feature,
                'resid': "%s_resid" % feature,
            })

            parts['city'] = city
            parts = parts.reset_index().set_index(['city', 'week_start_date'])
            per_feature.append(parts)

        per_city.append(pd.concat(per_feature, axis=1))

    combined = pd.concat(per_city, axis=0, sort=True)
    # .copy() so callers can add columns without a SettingWithCopyWarning.
    return combined[pd.notnull(combined).all(axis=1)].copy()

# Decompose the training features first, then the test features.
df_train, df_test = map(deseason, ['features_train', 'features_test'])
df_train.groupby('city').head(5)

In [None]:
# deseason() leaves out 'weekofyear'; copy it back from the matching raw
# table (assignment aligns on the index).
for _frame, _key in ((df_train, 'features_train'), (df_test, 'features_test')):
    _frame['weekofyear'] = df_all[_key]['weekofyear']

In [None]:
# Compare the deseasoned frame's shape with the raw one's.
(df_train.shape, df_all['features_train'].shape)

In [None]:
# `not` instead of `~`: bitwise inversion of a plain Python bool yields a
# truthy integer, which would make this check pass unconditionally.
assert not pd.isnull(df_train).any().any()

In [None]:
## save

In [None]:
import os

fn1 = 'data/processed/deseasoned_%s.pkl'

# Refuse to overwrite: both targets are checked before anything is written.
for kkk in ['train', 'test']:
    fn2 = fn1 % kkk
    if os.path.exists(fn2):
        raise ValueError('file exists: %s' % fn2)

df_train.to_pickle(fn1 % 'train')
df_test.to_pickle(fn1 % 'test')

## train/test split

In [None]:
# Feature sets tried so far (kept for reference):
#   * the four benchmark features from
#     https://github.com/drivendata/benchmarks/blob/master/dengue-benchmark-statsmodels.ipynb
#       ['reanalysis_specific_humidity_g_per_kg',
#        'reanalysis_dew_point_temp_k',
#        'station_avg_temp_c',
#        'station_min_temp_c']
#   * every raw column:        df_all['features_train'].columns
#   * every deseasoned column: df_train.columns
#
# Current choice: each feature's trend component plus the week of year.
import numpy as np

selected_features = np.array([
    column
    for column in df_train.columns
    if column == 'weekofyear' or column.endswith('_trend')
])
selected_features

In [None]:
# split per city
def _split_per_city(frame, part):
    """Return each city's leading 3/4 (``'head'``) or trailing 1/4 (``'tail'``) rows.

    Both sizes are floored independently, so when a city's row count is not a
    multiple of four, one row between the two slices lands in neither split.
    """
    if part not in ('head', 'tail'):
        raise ValueError("part must be 'head' or 'tail', got %r" % (part,))

    def _take(group):
        n_rows = group.shape[0]
        if part == 'head':
            return group.head(n=n_rows * 3 // 4)
        return group.tail(n=n_rows * 1 // 4)

    return (
        frame
        .groupby(level='city', as_index=False)
        .apply(_take)
        # as_index=False prepends a positional group level; drop it again.
        .reset_index(level=0, drop=True)
    )


# Labels restricted to the rows that survived deseasoning.
_labels_aligned = df_all['labels_train'].loc[df_train.index]

x_train = _split_per_city(df_train, 'head')[selected_features]
x_test = _split_per_city(df_train, 'tail')[selected_features]
y_train = _split_per_city(_labels_aligned, 'head')['is_epidemic'].astype('int')
y_test = _split_per_city(_labels_aligned, 'tail')['is_epidemic'].astype('int')

x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
# First two training rows of each city.
x_train.groupby('city').head(2)

In [None]:
# First two held-out rows of each city.
x_test.groupby('city').head(2)

In [None]:
# Distinct city labels present in the training target.
set(y_train.index.get_level_values('city'))

## fit RF

In [None]:
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [None]:
def create_model():
    """Return a new, unfitted random-forest classifier with the shared settings."""
    # A RandomForestRegressor with these same settings was the earlier choice.
    settings = dict(n_estimators=100, min_samples_split=5, min_samples_leaf=3)
    return RandomForestClassifier(**settings)


# One independent model per city.
mod1_sj = create_model()
mod1_iq = create_model()

mod1_sj.fit(X=x_train.loc['sj'], y=y_train.loc['sj'])
mod1_iq.fit(X=x_train.loc['iq'], y=y_train.loc['iq'])

## check feature importances

In [None]:
# San Juan model: importance scores in ascending order.
xxx = np.sort(mod1_sj.feature_importances_)
xxx

In [None]:
# San Juan model: feature names ordered from least to most important.
selected_features[np.argsort(mod1_sj.feature_importances_)]

In [None]:
# Iquitos model: the 15 largest importance scores, ascending.
xxx = np.sort(mod1_iq.feature_importances_)
xxx[-15:]

In [None]:
# Iquitos model: names of the 15 most important features, ascending.
selected_features[np.argsort(mod1_iq.feature_importances_)[-15:]]

## common important features

In [None]:
# Features ranked in the top 15 by both city models. `selected_features` is a
# numpy array of strings, for which `&` is undefined, so use a real set
# intersection (returned sorted and de-duplicated).
np.intersect1d(
    selected_features[mod1_sj.feature_importances_.argsort()[-15:]],
    selected_features[mod1_iq.feature_importances_.argsort()[-15:]],
)

## predict on test set

In [None]:
# Start from an all-zero integer series shaped like y_test, then fill in each
# city's slice with its own model's output (labels are integers).
predictions = (y_test * 0).astype('int')

predictions.loc['sj'] = mod1_sj.predict(x_test.loc['sj']).astype('int')
predictions.loc['iq'] = mod1_iq.predict(x_test.loc['iq']).astype('int')

predictions.loc['sj'].head(5)

In [None]:
# Held-out score of each city's model.
(
    'sj', mod1_sj.score(x_test.loc['sj'], y_test.loc['sj']),
    'iq', mod1_iq.score(x_test.loc['iq'], y_test.loc['iq']),
)

## plot

In [None]:
# One figure per city: held-out labels against the model's predictions.
for city in ('sj', 'iq'):
    plt.plot(y_test.loc[city], label='actual')
    plt.plot(predictions.loc[city], label='predicted')
    plt.legend()
    plt.title(city)
    plt.show()

## re-fit on complete dataset

In [None]:
# note avoiding class bias
# The selected columns ('*_trend' and 'weekofyear') exist only in the
# deseasoned frame, so features must come from df_train, and the labels must
# be limited to the rows that survived deseasoning.
# NOTE(review): the held-out evaluation above fits on 'is_epidemic' while this
# re-fit targets 'total_cases' -- confirm which target is intended.
x_retrain = df_train[selected_features]
y_retrain = df_all['labels_train'].loc[df_train.index]['total_cases']

mod1_sj = create_model()
mod1_sj.fit(X = x_retrain.loc['sj'], y = y_retrain.loc['sj'])
mod1_iq = create_model()
mod1_iq.fit(X = x_retrain.loc['iq'], y = y_retrain.loc['iq'])

## set in submission

In [None]:
# First rows of the San Juan part of the submission template.
df_all['submission'].loc['sj'].head(5)

In [None]:
# cast to int since we know the label is integer
# Build the predictions city by city from the deseasoned test features, which
# are the only frame holding the selected '*_trend' columns.
_city_models = {'sj': mod1_sj, 'iq': mod1_iq}
_pieces = []
for city in ['sj', 'iq']:
    _x_city = df_test.loc[city, selected_features]
    _template = df_all['submission'].loc[[city], ['total_cases']]
    # deseason() drops rows containing nulls, so fewer rows remain than the
    # template has; derive the offset instead of relying on a global.
    # Assumes the dropped rows are the leading ones of each city and that the
    # template has one row per raw test row -- TODO confirm.
    _n_dropped = len(_template) - len(_x_city)
    _piece = (_template.iloc[_n_dropped:] * 0).astype('int')
    _piece['total_cases'] = _city_models[city].predict(_x_city).astype(int)
    _pieces.append(_piece)

predictions = pd.concat(_pieces)

In [None]:
# First two predicted rows of each city.
predictions.groupby(level='city').head(2)

In [None]:
# Take the submission template without its placeholder column, then attach
# the predictions through an index join so rows are matched by label rather
# than by position.
submit = df_all['submission'].drop(columns=['total_cases'])
submit = submit.merge(
    predictions,
    how='left',
    left_index=True,
    right_index=True,
)

# Template rows that have no prediction are set to zero.
submit['total_cases'] = submit['total_cases'].fillna(value=0)

In [None]:
# Show the shape of the merged submission frame.
submit.shape

In [None]:
# First two submission rows of each city.
submit.groupby('city').head(2)

## plot

In [None]:
# Both cities' submitted values on one figure, plotted against row position.
for city in ('sj', 'iq'):
    city_cases = submit.loc[city, 'total_cases']
    plt.plot(city_cases.values, label=city)

plt.legend()
plt.show()

## generate submission file

In [None]:
from src.features.build_features import make_submission