Change from 3

- LSTM instead of RF
- normalizing data to [-1,+1]

For reference, check https://github.com/drivendata/benchmarks/blob/master/dengue-benchmark-statsmodels.ipynb

In [None]:
from matplotlib import pyplot as plt
import pandas as pd
import time

In [None]:
# use 0.2 data (pickled frame that carries the `is_epidemic` column used below)
# NOTE: read_pickle can execute arbitrary code while loading; only load files
# produced by this project.
df_is_epidemic = pd.read_pickle('data/processed/0.2A-is_epidemic.pkl')

In [None]:
# load 5.1 data: targets, features, and metadata (df_meta['submit'] is used
# below as the boolean mask separating submission rows from training rows)
# NOTE: read_pickle can execute arbitrary code while loading; only load files
# produced by this project.
df_targ = pd.read_pickle('data/processed/5.1B-df_targ.pkl')
df_feat_2 = pd.read_pickle('data/processed/5.1B-df_feat_2.pkl')
df_meta = pd.read_pickle('data/processed/5.1B-df_meta.pkl')

In [None]:
# show the shapes of the target frame and the epidemic-flag frame side by side
df_targ.shape, df_is_epidemic.shape

In [None]:
# note: column assignment aligns on the index automatically, even though the
# two frames do not have the same dimensions; rows of df_targ whose index is
# absent from df_is_epidemic receive NaN
df_targ['is_epidemic'] = df_is_epidemic['is_epidemic']

In [None]:
# show the last two rows of df_targ (now including the is_epidemic column)
df_targ.tail(n=2)

In [None]:
# show the first two rows of the epidemic-flag frame
df_is_epidemic.head(n=2)

## selected features

In [None]:
# all that make sense: every '*_trend' column that is not weekofyear-derived,
# plus the 'weekofyear_original' column
selected_features = [
    col for col in df_feat_2.columns
    if col == 'weekofyear_original'
    or (col.endswith('_trend') and not col.startswith('weekofyear'))
]

selected_features

## LSTM params

In [None]:
# number of consecutive rows (time steps) in each LSTM input window
lahead = 10 # 60 yields no classification results
# fixed batch size; it is also passed to the LSTM layer, so every array fed to
# the model must have a row count that is a multiple of it
batch_size = 16 # smaller batches lead to less loss of data when truncating non-multiples of batch_size

## create rolling windows for LSTM

In [None]:
import numpy as np

In [None]:
def stride_group(group, n_back):
    """Turn a 2-D frame into overlapping windows of `n_back` consecutive rows.

    Parameters
    ----------
    group : DataFrame of shape (rows, features)
    n_back : number of rows in each window

    Returns
    -------
    ndarray of shape (rows - n_back + 1, n_back, features). Window ``k`` holds
    original rows ``k .. k + n_back - 1``: position 0 along axis 1 is the
    oldest row and position ``n_back - 1`` is the latest.
    """
    # Stack the lagged copies oldest-first so no flip is needed afterwards.
    lagged_oldest_first = [
        group.shift(lag).values for lag in range(n_back - 1, -1, -1)
    ]
    windows = np.stack(lagged_oldest_first, axis=1)
    # The first n_back-1 windows contain shift-induced NaNs; discard them.
    return windows[n_back - 1:]

def stride_group_2(x):
    """Window `x` with `stride_group`, using the notebook-wide `lahead` length."""
    return stride_group(x, lahead)

## drop the first rows of each city so that its row count is a multiple of batch_size


In [None]:
def my_truncate(df):
    """Trim each city's leading rows so its length is a multiple of `batch_size`.

    `df` must have an index level named 'city'. For every city the last
    ``n - n % batch_size`` rows are kept (``n`` being that city's row count).
    Index level 0 of the grouped result is dropped before returning.
    """
    def _keep_whole_batches(group):
        n_rows = group.shape[0]
        return group.tail(n_rows - (n_rows % batch_size))

    trimmed = df.groupby(level='city', as_index=False).apply(_keep_whole_batches)
    return trimmed.reset_index(level=0, drop=True)

## prepare data to fit

In [None]:
# https://keras.io/layers/recurrent/#lstm
from keras.models import Sequential
from keras.layers import Dense, LSTM, Lambda, Dropout


In [None]:
def create_model():
    """Build and compile the binary `is_epidemic` classifier.

    Architecture: LSTM(100, linear activation) -> Dense(50, relu) ->
    Dropout(0.2) -> Dense(1, sigmoid), compiled with binary cross-entropy
    and the Adam optimizer.

    Reads the notebook globals `lahead` (window length), `selected_features`
    (number of input features) and `batch_size` (fixed batch size baked into
    the input layer).

    Returns
    -------
    A compiled, untrained keras `Sequential` model.
    """
    model = Sequential()
    model.add(LSTM(
        100,
        input_shape=(lahead, len(selected_features)),
        batch_size=batch_size,
        activation='linear',
    ))
    model.add(Dense(50, activation='relu'))
    # Dropout must come before the output layer. Placed after the sigmoid it
    # randomly zeroes the predicted probability and rescales the rest by
    # 1/(1-rate) during training, so the loss sees values outside [0, 1].
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

In [None]:
# training rows: every feature row that is not flagged for submission
df_train = df_feat_2.loc[~df_meta['submit']]

In [None]:
# note avoiding class bias
# NOTE(review): nothing in this cell rebalances the classes — confirm what the
# note above refers to.
x_retrain = df_train[selected_features]
# binary target for the non-submission rows; the double brackets keep it a
# one-column DataFrame. astype('int') raises if any of these rows is NaN.
y_retrain = df_targ.loc[~df_meta['submit']][['is_epidemic']].astype('int') # [['total_cases']]
# display feature/target shapes and the number of target rows per city
x_retrain.shape, y_retrain.shape, y_retrain.groupby('city').size()

In [None]:
# plot the 0/1 target of both cities on one axis; 'iq' is shifted up by 1.2 so
# the two step curves do not overlap
y_retrain['is_epidemic'].loc['sj'].plot(label='sj')
(y_retrain['is_epidemic']+1.2).loc['iq'].plot(label='iq+1.2')
plt.legend()
plt.show()

In [None]:
# per city: 3-D array of rolling windows (windows, lahead, features)
xretrain_roll = x_retrain.groupby(level='city').apply(stride_group_2)
# targets: drop the first lahead-1 rows of each city so that row k lines up
# with window k (stride_group discards the same number of leading windows)
yretrain_roll = (y_retrain
                 .groupby(level='city', as_index=False)
                 .apply(lambda group: group.iloc[(lahead-1):])
                 .reset_index(level=0, drop=True)
                )

# drop the leading windows of each city so the window count is a multiple of batch_size
for city in ['sj','iq']:
    to_drop = xretrain_roll.loc[city].shape[0]%batch_size
    print('drop non-multiple', city, to_drop)
    xretrain_roll.loc[city] = xretrain_roll.loc[city][(to_drop):]
    
# apply the same leading-row trim to the targets so features and targets stay aligned
yretrain_roll = my_truncate(yretrain_roll)


In [None]:
# show the window-array shape of each city and the shape of the trimmed target frame
xretrain_roll.loc['sj'].shape, xretrain_roll.loc['iq'].shape, yretrain_roll.shape

## fit model

In [None]:
# initialize: container for one model per city, keyed by city code
mod1 = {}

In [None]:
# build a fresh, untrained model for each city and print its layer summary
for city in ['sj', 'iq']:
    #if city=='sj': continue
    print(city)
    #if city=='sj': continue # FIXME fitting sj model
    mod1[city] = create_model()
    #if city=='iq': continue # FIXME skipping iq model
    mod1[city].summary()
    

In [None]:
# actual fit: train each city's model on that city's windows, then plot the
# training loss per epoch
for city in ['sj', 'iq']:
    #if city=='sj': continue
    print(city)
    print(time.ctime(),'fit start')
    # shuffle=False keeps the windows in their original order; no validation
    # data is passed, so only the training loss is recorded
    history = mod1[city].fit(
             xretrain_roll.loc[city],
             yretrain_roll.loc[city],
             batch_size=batch_size,
             epochs=250, #500, # 1000,
             verbose=2,
             #validation_data=None,
             shuffle=False
        )
    print(time.ctime(),'fit end')
    
    plt.plot(history.history['loss'], label='loss')
    #plt.plot(history.history['val_loss'], label='val_loss')
    plt.legend()
    plt.title(city)
    plt.show()

## plot trained result

In [None]:
# inspect the index of the windowed features (the groupby result is addressed by city)
xretrain_roll.index

In [None]:
# first rows of each city's targets after windowing and truncation
yretrain_roll.groupby('city').head()

In [None]:
def my_predict(city, np_in, index):
    """Run the fitted model of `city` on `np_in` and return a labelled frame.

    Parameters
    ----------
    city : key into the global `mod1` dict of fitted models
    np_in : model input array, predicted with the global `batch_size`
    index : values to store in the 'week_of_year' index level, one per
        prediction (callers pass the index of the matching target frame)

    Returns
    -------
    DataFrame with a single 'is_epidemic' column holding the squeezed model
    output, indexed by ('city', 'week_of_year').
    """
    scores = mod1[city].predict(np_in, batch_size=batch_size).squeeze()
    frame = pd.DataFrame({
        'is_epidemic': scores,
        'city': city,
        'week_of_year': index,
    })
    return frame.set_index(['city', 'week_of_year'])

# predict on the training windows of each city and stack the two per-city
# frames; the index comes from the aligned target frame
yretrain_pred = pd.concat([my_predict(city, xretrain_roll.loc[city], yretrain_roll.loc[city].index) for city in ['sj','iq']], axis=0)

# reverse log10 transform
# y_pred['total_cases'] = ((10**((y_pred['total_cases']).clip(upper=3)))-1).astype(int)

In [None]:
# show the first rows of the training-set predictions
yretrain_pred.head()

In [None]:
# per city: overlay the predicted probability and the actual 0/1 label on the
# training windows
for city in ['sj','iq']:
    # no offset is applied to the prediction, so the label must not claim one
    yretrain_pred.loc[city]['is_epidemic'].plot(label='predicted')
    yretrain_roll.loc[city]['is_epidemic'].astype('int').plot(label='actual', figsize=(20,3))
    plt.title(city)
    # without a legend the two curves cannot be told apart
    plt.legend()
    plt.show()

## predict `is_epidemic` on submission

In [None]:
# feature rows flagged for submission, restricted to the selected features
x_submit = df_feat_2.loc[ df_meta['submit'], selected_features]

# per city: rolling windows built from the submission rows only
xsubmit_roll = x_submit.groupby(level='city').apply(stride_group_2)

# drop non-batch_size multiple
for city in ['sj','iq']:
    xsubmit_roll.loc[city] = xsubmit_roll.loc[city][(xsubmit_roll.loc[city].shape[0]%batch_size):]
    
# choose any field from x_submit just to get the index
# (values are zeroed with *0; only the index is used below)
ysubmit_roll = (x_submit[['weekofyear_original']]
                 .groupby(level='city', as_index=False)
                 .apply(lambda group: group.iloc[(lahead-1):])
                 .reset_index(level=0, drop=True)
                *0
                )    
ysubmit_roll = my_truncate(ysubmit_roll)

# NOTE(review): the first lahead-1 submission rows of each city, plus the rows
# removed to reach a multiple of batch_size, receive no prediction here.
ysubmit_pred = pd.concat([my_predict(city, xsubmit_roll.loc[city], ysubmit_roll.loc[city].index) for city in ['sj','iq']], axis=0)

# reverse log10 transform
# y_pred['total_cases'] = ((10**((y_pred['total_cases']).clip(upper=3)))-1).astype(int)

In [None]:
# draw the predicted is_epidemic curve of both cities on one shared figure
for city in ['sj','iq']:
    (ysubmit_pred.loc[city]['is_epidemic']).plot(figsize=(20,3), label=city)

plt.title('submission')
plt.legend()
plt.show()

## save