Changes from notebook 3:

- LSTM instead of RF
- normalizing data to [-1,+1]

For reference, check https://github.com/drivendata/benchmarks/blob/master/dengue-benchmark-statsmodels.ipynb

In [None]:
from matplotlib import pyplot as plt
import pandas as pd
import time

In [None]:
from src.features.build_features import load_raw

df_all = load_raw()
df_all.keys()

## fillna

In [None]:
# Forward-fill missing feature values within each city, so a gap is never
# filled with a value taken from the other city's rows.
for k in ['features_train', 'features_test']:
    df_all[k] = df_all[k].groupby('city').ffill()
    # `not` instead of `~`: on a plain Python bool `~` gives -1 or -2, both
    # truthy, so the assertion could never fail.
    assert not pd.isnull(df_all[k]).any().any()

## append without seasonality

Copied from notebook 3.1

In [None]:
# Lag used for differencing. The first `n_diff` rows of every city are
# dropped below, because their difference is undefined (NaN).
n_diff = 1
for k in ['features_train', 'features_test']:
    # Per-city differences, without each city's first n_diff rows.
    # reset_index(level=0) removes the outermost index level of the apply
    # result (presumably the group counter added by as_index=False — verify
    # against the installed pandas version).
    temp_no = (df_all[k]
               .groupby('city', as_index=False)
               .apply(lambda group: group.diff(periods=n_diff).iloc[n_diff:])
               .reset_index(level=0, drop=True)
              )
    temp_no.columns = ["%s_diff" % x for x in temp_no.columns]
    # `not` instead of `~`: `~` on a plain Python bool is always truthy, so
    # the original check could silently pass with NaNs present.
    assert not pd.isnull(temp_no).any().any()

    # The undifferenced features, trimmed the same way so both frames align.
    temp_yes = (df_all[k]
               .groupby('city', as_index=False)
               .apply(lambda group: group.iloc[n_diff:])
               .reset_index(level=0, drop=True)
              )

    # Original columns followed by their "<name>_diff" counterparts.
    df_all[k] = pd.concat([temp_yes, temp_no], axis=1)
    print(df_all[k].shape)


In [None]:
# Sanity check: no feature frame may contain missing values at this point.
for k in ['features_train', 'features_test']:
    # `not` instead of `~`: `~True` is -2 and `~False` is -1 for a plain
    # Python bool, so the `~` form could never trip the assertion.
    assert not pd.isnull(df_all[k]).any().any()

In [None]:
# The features lost each city's first n_diff rows; trim the labels the same
# way so both stay row-aligned.
def _skip_leading_rows(group):
    return group.iloc[n_diff:]

for k in ('labels_train',):
    temp_yes = df_all[k].groupby('city', as_index=False).apply(_skip_leading_rows)
    temp_yes = temp_yes.reset_index(level=0, drop=True)

    df_all[k] = temp_yes
    print(df_all[k].shape)


## selected features

In [None]:
# features selected from
# https://github.com/drivendata/benchmarks/blob/master/dengue-benchmark-statsmodels.ipynb
#selected_features = ['reanalysis_specific_humidity_g_per_kg', 
#                 'reanalysis_dew_point_temp_k', 
#                 'station_avg_temp_c', 
#                 'station_min_temp_c']

# all features
# selected_features = df_all['features_train'].columns

# from RF feature importances
# selected_features = ['station_max_temp_c', 'reanalysis_dew_point_temp_k',
#        'reanalysis_specific_humidity_g_per_kg', 'year', 'weekofyear',
#        'ndvi_sw', 'ndvi_se']

# Feature set taken from the RF run that included the diff columns.
selected_features = [
    'reanalysis_avg_temp_k_diff',
    'station_avg_temp_c',
    'ndvi_se_diff',
    'station_max_temp_c',
    'reanalysis_dew_point_temp_k',
    'reanalysis_specific_humidity_g_per_kg',
    'year',
    'weekofyear',
    'ndvi_sw',
    'ndvi_se',
]

# Every selected feature must be a column of the training features.
assert set(selected_features).issubset(df_all['features_train'].columns)

In [None]:
df_all['features_train'].shape, df_all['labels_train'].shape

## train/test split

In [None]:
# Hold-out split done per city, so both cities are represented in the
# training part and in the test part (original note: avoiding class bias).
# Each city's leading 3/4 of rows go to train, the remaining rows to test.
def _split_per_city(df, columns, take_rows):
    """Apply `take_rows` to each city's rows of `df` and keep only `columns`.

    `df` must carry a 'city' index level. `take_rows` receives one city's
    sub-frame and returns the rows to keep.
    """
    return (df
            .groupby(level='city', as_index=False)
            .apply(take_rows)
            .reset_index(level=0, drop=True)
            [columns]
           )


def _train_rows(group):
    """Leading three quarters (rounded down) of one city's rows."""
    return group.iloc[:group.shape[0]*3//4]


def _test_rows(group):
    """All rows after the training part.

    Taking "everything after the cut" instead of `tail(N//4)` means no row
    is lost when the city's row count is not a multiple of 4.
    """
    return group.iloc[group.shape[0]*3//4:]


x_train = _split_per_city(df_all['features_train'], selected_features, _train_rows)
x_test = _split_per_city(df_all['features_train'], selected_features, _test_rows)
y_train = _split_per_city(df_all['labels_train'], ['total_cases'], _train_rows)
y_test = _split_per_city(df_all['labels_train'], ['total_cases'], _test_rows)

In [None]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
x_train.groupby('city').head(n=2)

In [None]:
x_test.groupby('city').head(n=2)

In [None]:
set(y_train.reset_index()['city'])

## normalize data to [-1,1]

In [None]:
from sklearn.preprocessing import MinMaxScaler
def my_scale(df1, scaler=None, feature_range=(-1, 1)):
    """Min-max scale every column of `df1`, keeping its index and columns.

    Parameters
    ----------
    df1 : DataFrame
        Data to scale.
    scaler : fitted scaler, optional
        When given, it is only applied (not re-fitted), so that data is
        scaled exactly like the data the scaler was fitted on.
    feature_range : tuple
        Target range for a newly fitted scaler. Defaults to (-1, 1), the
        range this notebook states it normalizes to.

    Returns
    -------
    (DataFrame, scaler)
        The scaled frame and the scaler that produced it.
    """
    if scaler is None:
        scaler = MinMaxScaler(feature_range=feature_range)
        scaler.fit(df1)
    df2 = pd.DataFrame(scaler.transform(df1), columns=df1.columns, index=df1.index)
    return df2, scaler

xtrain_scaled, scaler_xtrain = my_scale(x_train)
# Re-use the scaler fitted on the training features: a second scaler fitted
# on the hold-out rows would map the same raw value to a different scaled
# value than the one the model was trained on.
xtest_scaled, scaler_xtest = my_scale(x_test, scaler=scaler_xtrain)
# The target is deliberately left unscaled for now.
#ytrain_scaled, scaler_ytrain = my_scale(y_train)
#ytest_scaled, scaler_ytest = my_scale(y_test)
ytrain_scaled = y_train
scaler_ytrain = None
ytest_scaled = y_test
scaler_ytest = None

xtrain_scaled.shape, xtest_scaled.shape, ytrain_scaled.shape, ytest_scaled.shape

## LSTM params

In [None]:
lahead = 10
batch_size = 16 # smaller batches lead to less loss of data when truncating non-multiples of batch_size

## create rolling windows for LSTM

In [None]:
import numpy as np

In [None]:
def stride_group(group, n_steps=None):
    """Turn one group's feature table into overlapping look-back windows.

    Parameters
    ----------
    group : DataFrame
        One row per time step, one column per feature.
    n_steps : int, optional
        Window length. Defaults to the module-level `lahead`.

    Returns
    -------
    ndarray of shape (max(len(group) - n_steps, 0), n_steps, n_features)
        Window `k` ends at row `k + n_steps` of `group`. Along axis 1,
        index 0 is the oldest row and index `n_steps - 1` the latest.

    Raises
    ------
    ValueError
        If `n_steps` is smaller than 1.
    """
    if n_steps is None:
        n_steps = lahead
    if n_steps < 1:
        raise ValueError("n_steps must be a positive integer")

    # shift(i) aligns every row with the row i steps earlier. Listing the
    # largest shift first makes axis 1 run from oldest to latest.
    shifted = [group.shift(i).values for i in range(n_steps - 1, -1, -1)]

    # Drop the first n_steps rows. Only the first n_steps - 1 contain NaN
    # from shifting, but n_steps are dropped so that targets trimmed with
    # `iloc[n_steps:]` stay aligned with the windows.
    return np.stack(shifted, axis=1)[n_steps:]

    
# Build the rolling windows separately for each 'city' index group, so that
# no window mixes rows of the two cities. Later cells read the result per
# city via `.loc[city]`.
xtrain_roll = xtrain_scaled.groupby(level='city').apply(stride_group)
xtest_roll  = xtest_scaled.groupby(level='city').apply(stride_group)

In [None]:
# Targets get no rolling window: only each city's first `lahead` rows are
# discarded, matching the rows removed when the feature windows were built.
def _skip_first_lahead(group):
    return group.iloc[lahead:]

ytrain_roll = ytrain_scaled.groupby(level='city', as_index=False).apply(_skip_first_lahead)
ytrain_roll = ytrain_roll.reset_index(level=0, drop=True)

ytest_roll = ytest_scaled.groupby(level='city', as_index=False).apply(_skip_first_lahead)
ytest_roll = ytest_roll.reset_index(level=0, drop=True)

In [None]:
[(city, [df.loc[city].shape for df in (xtrain_roll, xtest_roll, ytrain_roll, ytest_roll)]) for city in ['sj','iq']]

In [None]:
ytrain_roll.groupby('city').size(), ytest_roll.groupby('city').size()

## WIP: drop the leading rows of each city so that the row count is a multiple of batch_size


In [None]:
# Discard the oldest windows of each city so that the number of windows left
# is a whole multiple of batch_size.
for city in ['sj','iq']:
    for rolled in (xtrain_roll, xtest_roll):
        windows = rolled.loc[city]
        rolled.loc[city] = windows[(windows.shape[0] % batch_size):]
    
def my_truncate(df):
    """Keep, per city, only the trailing rows that fill whole batches.

    For a city with n rows, the last `n - n % batch_size` rows are kept.
    `df` must carry a 'city' index level; `batch_size` is read from module
    scope.
    """
    def _whole_batches(group):
        n_rows = group.shape[0]
        return group.tail(n_rows - (n_rows % batch_size))

    trimmed = df.groupby(level='city', as_index=False).apply(_whole_batches)
    return trimmed.reset_index(level=0, drop=True)

ytrain_roll = my_truncate(ytrain_roll)
ytest_roll = my_truncate(ytest_roll)
    

In [None]:
ytrain_roll.groupby('city').size(), ytest_roll.groupby('city').size()

In [None]:
[(city, [df.loc[city].shape for df in (xtrain_roll, xtest_roll, ytrain_roll, ytest_roll)]) for city in ['sj','iq']]

## verify data consistency between raw / scaled

In [None]:
x_train.loc['sj'].head(n=3)

In [None]:
xtrain_scaled.loc['sj'].head(n=3)

In [None]:
ytrain_scaled.loc['sj'].head(n=3)

In [None]:
y_train.loc['sj'].head(n=3)

## verify data consistency between scaled / roll

In [None]:
xtrain_scaled.loc['sj'].head(n=24+10).tail(n=10)

In [None]:
xtrain_roll.loc['sj'][:3,:,0]

In [None]:
ytrain_roll.loc['sj'].head(n=3)

## fit LSTM

In [None]:
# https://keras.io/layers/recurrent/#lstm
from keras.models import Sequential
from keras.layers import Dense, LSTM, Lambda, Dropout


In [None]:
def create_model():
    """Build and compile a fresh LSTM regressor.

    Architecture: LSTM(100) -> Dense(50, relu) -> Dropout(0.2) -> Dense(1)
    -> fixed x10 output scaling. The input shape is
    (lahead, len(selected_features)) and the batch size is fixed to the
    module-level `batch_size`. Compiled with MSE loss and the Adam optimizer.

    Returns
    -------
    A compiled, untrained keras `Sequential` model.
    """
    model = Sequential()
    model.add(LSTM(100,
              input_shape=(lahead, len(selected_features)),
              batch_size=batch_size,
              activation='linear'))
    model.add(Dense(50, activation='relu'))
    # Dropout regularizes the hidden layer. Placed after the single-unit
    # output it would zero the prediction itself for a fraction of the
    # training samples, so it has to come before the output layer.
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='linear'))
    model.add(Lambda(lambda x: x*10)) # TODO x*30 caused the re-fit on complete dataset to blow up
    model.compile(loss='mse', optimizer='adam')
    return model

In [None]:
# Fit one model per city on the training windows and track the loss on the
# hold-out windows. shuffle=False keeps the samples in their original order.
mod1 = {}
for city in ['sj', 'iq']:
    print(city)
    mod1[city] = create_model()
    mod1[city].summary()
    print(time.ctime(),'fit start')
    history = mod1[city].fit(xtrain_roll.loc[city],
             ytrain_roll.loc[city],
             batch_size=batch_size,
             epochs=1000,
             verbose=0,
             validation_data=(xtest_roll.loc[city], ytest_roll.loc[city]),
             shuffle=False)
    print(time.ctime(),'fit end')

    # Training vs. validation loss per epoch.
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.legend()
    # Title each figure with its city, so the two loss plots can be told apart.
    plt.title(city)
    plt.show()

## predict on test set

In [None]:
# Zero-filled frame with the index and columns of the hold-out targets.
# It is float-typed on purpose: the model outputs are floats, and writing
# them into an integer frame relies on an implicit dtype upcast.
predictions = (ytest_roll.copy()*0).astype('float')

for city in ['sj', 'iq']:
    predictions.loc[city] = mod1[city].predict(xtest_roll.loc[city], batch_size=batch_size)

# FIXME cannot really apply scaler_ytest on the predictions
# predictions.loc[:] = scaler_ytest.inverse_transform(predictions).astype(int)

predictions.loc['sj'].head()

In [None]:
[(city, mod1[city].evaluate(xtest_roll.loc[city], ytest_roll.loc[city], batch_size=batch_size)) for city in ['sj','iq']]

## plot

In [None]:
# Actual vs. predicted hold-out cases, one figure per city. The merge does
# not depend on the city, so it is built once instead of once per iteration.
df_plot = y_test.merge(predictions, left_index=True, right_index=True, suffixes=['_actual','_predicted'])
for city in ['sj', 'iq']:
    df_plot.loc[city].plot(figsize=(20,5))
    plt.title(city)
    plt.legend()
    plt.show()

## re-fit on complete dataset

In [None]:
# Use all labelled data (no hold-out) for the final fit.
x_retrain = df_all['features_train'].loc[:, selected_features]
y_retrain = df_all['labels_train'].loc[:, ['total_cases']]

# Features are min-max scaled. The target stays unscaled, so there is no
# target scaler (a my_scale(y_retrain) call is intentionally not made).
xretrain_scaled, scaler_xretrain = my_scale(x_retrain)
yretrain_scaled, scaler_yretrain = y_retrain, None

# Rolling feature windows per city, trimmed from the front to a multiple of
# batch_size.
xretrain_roll = xretrain_scaled.groupby(level='city').apply(stride_group)
for city in ['sj','iq']:
    windows = xretrain_roll.loc[city]
    xretrain_roll.loc[city] = windows[(windows.shape[0] % batch_size):]

# Targets: drop each city's first `lahead` rows, then trim the same way.
yretrain_roll = my_truncate(
    yretrain_scaled
    .groupby(level='city', as_index=False)
    .apply(lambda group: group.iloc[lahead:])
    .reset_index(level=0, drop=True)
)

# Train fresh per-city models on the complete data set. This replaces the
# hold-out models previously stored in mod1; there is no validation data.
mod1 = {}
for city in ('sj', 'iq'):
    print(city)
    model = create_model()
    mod1[city] = model
    model.summary()

    fit_options = dict(batch_size=batch_size, epochs=1000, verbose=0, shuffle=False)
    print(time.ctime(),'fit start')
    history = model.fit(xretrain_roll.loc[city], yretrain_roll.loc[city], **fit_options)
    print(time.ctime(),'fit end')

    # Only the training loss exists here.
    loss_curve = history.history['loss']
    plt.plot(loss_curve, label='loss')
    plt.legend()
    plt.title(city)
    plt.show()

## set in submission

In [None]:
df_all['submission'].loc['sj'].head()

In [None]:
# NOTE(review): features_test already lost each city's first n_diff rows when
# the diff columns were appended; this drops n_diff more. The row offset
# `lahead + n_diff + 1` used when building the submission predictions appears
# to account for it, so the two have to be changed together.
x_submit = (df_all['features_test']
          .groupby(level='city', as_index=False)
          .apply(lambda group: group.iloc[n_diff:])
          .reset_index(level=0, drop=True)
          [selected_features]
          )
# Scale with the scaler fitted on the re-training features. Fitting a new
# scaler on the submission rows would give the same raw value a different
# scaled value than the one the model was trained on.
xsubmit_scaled = pd.DataFrame(scaler_xretrain.transform(x_submit),
                              columns=x_submit.columns,
                              index=x_submit.index)
scaler_xsubmit = scaler_xretrain  # name kept so later references still resolve
xsubmit_roll = xsubmit_scaled.groupby(level='city').apply(stride_group)


# Trim each city's windows from the front to a multiple of batch_size.
for city in ['sj','iq']:
    xsubmit_roll.loc[city] = xsubmit_roll.loc[city][(xsubmit_roll.loc[city].shape[0]%batch_size):]

In [None]:
# Zero-filled frame indexed like the submission template, minus each city's
# first (lahead + n_diff + 1) rows. It is float-typed because the model
# outputs are floats; the conversion to int happens once, at the end.
predictions = (df_all['submission'][['total_cases']]
               .groupby(level='city', as_index=False)
               .apply(lambda group: group.iloc[(lahead+n_diff+1):])
               .reset_index(level=0, drop=True)
               .copy()
               *0
              ).astype('float')

def my_predict(city):
    """Predict total cases for `city`, padded to the length of `predictions`.

    The feature windows were trimmed from the front to a multiple of
    `batch_size`, so there can be fewer windows than rows in
    `predictions.loc[city]`. The missing leading rows are filled with zeros.

    Returns
    -------
    ndarray of shape (len(predictions.loc[city]), 1)

    Raises
    ------
    ValueError
        If there are more feature windows than prediction rows.
    """
    np_pred = mod1[city].predict(xsubmit_roll.loc[city], batch_size=batch_size)
    d1 = predictions.loc[city].shape[0]
    d2 = xsubmit_roll.loc[city].shape[0]
    if d2 > d1:
        raise ValueError("more feature windows (%d) than prediction rows (%d) for city %r"
                         % (d2, d1, city))
    return np.concatenate([np.zeros((d1-d2,1)), np_pred], axis=0)

predictions.loc['sj', 'total_cases'] = my_predict('sj')
predictions.loc['iq', 'total_cases'] = my_predict('iq')

# FIXME cannot really apply scaler_ytest on the predictions
#predictions.loc[:] = scaler_ytest.inverse_transform(predictions).astype(int)
# Case counts cannot be negative, but the linear output layer can produce
# negative values: clip at 0, then round to the nearest integer rather than
# truncating toward zero.
predictions['total_cases'] = np.rint(predictions['total_cases'].clip(lower=0)).astype(int)

In [None]:
# Start from the submission template and overwrite total_cases with the
# predictions (aligned on the index). Template rows without a prediction
# become NaN and are set to 0 before the integer cast.
# TODO if this matches indices properly, review the complicated merge in 3.1
submit = (df_all['submission']
          .assign(total_cases=predictions)
          .fillna(value=0)
          .astype({'total_cases': 'int'})
         )

## plot

In [None]:
# Both cities' predicted case counts on one figure, plotted by row position.
for city in ('sj', 'iq'):
    city_cases = submit.loc[city, 'total_cases']
    plt.plot(city_cases.values, label=city)

plt.legend()
plt.show()

## generate submission file

In [None]:
from src.features.build_features import make_submission

In [None]:
make_submission(submit.reset_index())