Changes from notebook 3

- LSTM instead of RF
- normalizing data to [-1,+1]

For reference, check https://github.com/drivendata/benchmarks/blob/master/dengue-benchmark-statsmodels.ipynb

In [None]:
from matplotlib import pyplot as plt
import pandas as pd

In [None]:
from src.features.build_features import load_raw

# Load the project data through the `load_raw` helper; the result is indexed by dataset name below
df_all = load_raw()
# Show which dataset names are available
df_all.keys()

## fillna

In [None]:
# Forward-fill missing feature values separately for every city, so a gap is
# filled from the previous row of the same city.
for k in ['features_train', 'features_test']:
    df_all[k] = df_all[k].groupby('city').apply(lambda group: group.ffill())
    # `not` instead of `~`: on a plain Python bool, `~True` is -2 (truthy), so the
    # bitwise form could never fail. Forward-fill leaves leading NaNs of a group
    # untouched, which is what this check guards against.
    assert not pd.isnull(df_all[k]).any().any()

## append without seasonality

Copied from notebook 3.1

In [None]:
# Lag (in rows) used for differencing; also the number of leading rows per city
# that have no difference and are therefore dropped.
n_diff = 1
for k in ['features_train', 'features_test']:
    # Per-city differences of every column, without the first n_diff rows
    # (those have no predecessor to subtract).
    temp_no = (df_all[k]
               .groupby('city', as_index=False)
               .apply(lambda group: group.diff(periods=n_diff).iloc[n_diff:])
               .reset_index(level=0, drop=True)
              )
    temp_no.columns = ["%s_diff" % x for x in temp_no.columns]
    # `not` instead of `~`: `~` on a plain Python bool is always truthy,
    # which would turn this check into a no-op.
    assert not pd.isnull(temp_no).any().any()

    # Original columns, trimmed the same way so both frames share one index.
    temp_yes = (df_all[k]
               .groupby('city', as_index=False)
               .apply(lambda group: group.iloc[n_diff:])
               .reset_index(level=0, drop=True)
              )

    # Side-by-side: original columns followed by their *_diff counterparts.
    df_all[k] = pd.concat([temp_yes, temp_no], axis=1)
    print(df_all[k].shape)


In [None]:
# Re-check that no feature frame contains missing values.
for k in ['features_train', 'features_test']:
    # `not` instead of `~`: `~` on a plain Python bool is always truthy,
    # so the bitwise form could never raise.
    assert not pd.isnull(df_all[k]).any().any()

In [None]:
# The labels lose the same first n_diff rows per city as the features
for k in ['labels_train']:
    grouped = df_all[k].groupby('city', as_index=False)
    temp_yes = grouped.apply(lambda group: group.iloc[n_diff:])
    temp_yes = temp_yes.reset_index(level=0, drop=True)

    df_all[k] = temp_yes
    print(df_all[k].shape)


## selected features

In [None]:
# features selected from
# https://github.com/drivendata/benchmarks/blob/master/dengue-benchmark-statsmodels.ipynb
#selected_features = ['reanalysis_specific_humidity_g_per_kg', 
#                 'reanalysis_dew_point_temp_k', 
#                 'station_avg_temp_c', 
#                 'station_min_temp_c']

# all features
# selected_features = df_all['features_train'].columns

# from RF feature importances
# selected_features = ['station_max_temp_c', 'reanalysis_dew_point_temp_k',
#        'reanalysis_specific_humidity_g_per_kg', 'year', 'weekofyear',
#        'ndvi_sw', 'ndvi_se']

# from RF with diff
selected_features = [
    'reanalysis_avg_temp_k_diff',
    'station_avg_temp_c',
    'ndvi_se_diff',
    'station_max_temp_c',
    'reanalysis_dew_point_temp_k',
    'reanalysis_specific_humidity_g_per_kg',
    'year',
    'weekofyear',
    'ndvi_sw',
    'ndvi_se',
]

# Every selected feature has to be an existing training column
missing_features = set(selected_features).difference(df_all['features_train'].columns)
assert not missing_features

In [None]:
# Shapes of the training features and the training labels
tuple(df_all[name].shape for name in ('features_train', 'labels_train'))

## train/test split

In [None]:
# note avoiding class bias
def _split_per_city(grouped, columns):
    """Return (first 3/4, last 1/4) of the rows of every group, restricted to `columns`.

    `grouped` is a per-city groupby object; the group key level added by
    `apply` is dropped again so the result keeps the original index.

    NOTE(review): when a group's size is not a multiple of 4, head and tail
    together cover one row less than the group — confirm this is acceptable.
    """
    def leading(group):
        return group.head(n=group.shape[0]*3//4)

    def trailing(group):
        return group.tail(n=group.shape[0]*1//4)

    first_part = grouped.apply(leading).reset_index(level=0, drop=True)[columns]
    last_part = grouped.apply(trailing).reset_index(level=0, drop=True)[columns]
    return first_part, last_part


x_train, x_test = _split_per_city(
    df_all['features_train'].groupby(level='city', as_index=False),
    selected_features,
)
y_train, y_test = _split_per_city(
    df_all['labels_train'].groupby('city', as_index=False),
    ['total_cases'],
)

In [None]:
# Shapes of the four splits, in the order x_train, x_test, y_train, y_test
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

In [None]:
# First two training rows of every city
x_train.groupby('city').head(n=2)

In [None]:
# First two held-out rows of every city
x_test.groupby('city').head(n=2)

In [None]:
# Distinct city values present in the training labels
set(y_train.reset_index()['city'])

## normalize data to [-1,1]

In [None]:
from sklearn.preprocessing import MinMaxScaler
def my_scale(df1, feature_range=(-1, 1)):
    """Min-max scale every column of `df1` and return it as a DataFrame.

    The notebook's stated goal is data normalized to [-1, +1]; a bare
    `MinMaxScaler()` scales to (0, 1), so the target range is passed explicitly.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame to scale; its columns and index are kept on the result.
    feature_range : tuple of (min, max), default (-1, 1)
        Range each column is mapped onto.

    Returns
    -------
    (pandas.DataFrame, MinMaxScaler)
        The scaled frame and the fitted scaler, which can be reused to
        transform other data or to invert the scaling.
    """
    scaler = MinMaxScaler(feature_range=feature_range)
    scaled = scaler.fit_transform(df1)
    # fit_transform's result is re-wrapped here with the original labels
    df2 = pd.DataFrame(scaled, columns=df1.columns, index=df1.index)
    return df2, scaler

# Fit the scalers on the training split only.
xtrain_scaled, scaler_xtrain = my_scale(x_train)
ytrain_scaled, scaler_ytrain = my_scale(y_train)

# The held-out split must be mapped with the *training* scalers: a scaler fitted
# on the test rows would put them on a different scale than the data the model
# learns from, and could not be used to invert the model's predictions.
# The `*_test` scaler names are kept as aliases for the cells that use them.
scaler_xtest = scaler_xtrain
scaler_ytest = scaler_ytrain
xtest_scaled = pd.DataFrame(scaler_xtest.transform(x_test),
                            columns=x_test.columns, index=x_test.index)
ytest_scaled = pd.DataFrame(scaler_ytest.transform(y_test),
                            columns=y_test.columns, index=y_test.index)

xtrain_scaled.shape, xtest_scaled.shape, ytrain_scaled.shape, ytest_scaled.shape

## LSTM params

In [None]:
# Number of shifted copies stacked per window and first dimension of the LSTM input shape
lahead = 5
# Batch size handed to the LSTM layer and to the fit / predict / evaluate calls
batch_size = 32

## create rolling windows for LSTM

In [None]:
import numpy as np

In [None]:
def stride_group(group, window=None):
    """Turn one city's frame into overlapping windows of lagged rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group, shape (n_rows, n_columns).
    window : int, optional
        Number of lags per window. Defaults to the notebook-level `lahead`.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_rows - window, window, n_columns). Entry
        [t, lag, :] is the row `lag` steps before row `t + window` of
        `group`; lag 0 is that row itself.
    """
    if window is None:
        window = lahead

    # One shifted copy per lag: shift(lag) puts row t-lag at position t
    lagged = [group.shift(lag).values for lag in range(window)]

    # (n_rows, n_columns, window); the first `window` rows are dropped, which
    # removes every row holding shift-introduced NaNs. This used to be a
    # hard-coded 5 that only matched lahead == 5.
    out = np.stack(lagged, axis=2)[window:, :, :]

    # Reorder the axes to (row, lag, column)
    return np.swapaxes(out, 1, 2)

    
# Build the windowed inputs separately for every city.
# NOTE(review): stride_group returns an ndarray, so each result presumably holds
# one array per city rather than a flat frame — confirm with the shapes below.
xtrain_roll = xtrain_scaled.groupby(level='city').apply(stride_group)
xtest_roll  = xtest_scaled.groupby(level='city').apply(stride_group)
xtrain_roll.shape, xtest_roll.shape

In [None]:
# Shapes of the windowed train/test inputs for 'sj', then for 'iq'
xtrain_roll.loc['sj'].shape, xtest_roll.loc['sj'].shape, xtrain_roll.loc['iq'].shape, xtest_roll.loc['iq'].shape

In [None]:
# Corner of the 'sj' training windows: first 3 entries along each of the three axes
xtrain_roll.loc['sj'][:3,:3,:3]

In [None]:
# Drop the first `lahead` targets of every city so the targets line up with the
# windowed inputs, which lose their leading rows in stride_group.
# `lahead` replaces a hard-coded 5 that only matched the current setting.
ytrain_roll = (ytrain_scaled
               .groupby(level='city', as_index=False)
               .apply(lambda group: group.iloc[lahead:])
               .reset_index(level=0, drop=True))
ytest_roll = (ytest_scaled
              .groupby(level='city', as_index=False)
              .apply(lambda group: group.iloc[lahead:])
              .reset_index(level=0, drop=True))
ytrain_roll.shape, ytest_roll.shape

## WIP: drop the first rows of each city so the number of rows is a multiple of batch_size


In [None]:
# Shape of the 'sj' training windows, the batch size, and how many windows are
# left over after filling whole batches. The remainder is taken modulo
# batch_size (not 32*batch_size) to match the trimming done in the next cell.
xtrain_roll.loc['sj'].shape, batch_size, xtrain_roll.loc['sj'].shape[0] % batch_size

In [None]:
# For each city, cut off the leading remainder (row count modulo batch_size) of
# the train/test inputs and targets, so what is left is a whole number of batches.
# NOTE(review): each line writes a shorter object back through `.loc[city]`;
# whether pandas stores it as intended or aligns/broadcasts it cannot be told
# from here — verify with the shapes printed in the following cells.
for city in ['sj','iq']:
    xtrain_roll.loc[city] = xtrain_roll.loc[city][(xtrain_roll.loc[city].shape[0]%batch_size):]
    ytrain_roll.loc[city] = ytrain_roll.loc[city][(ytrain_roll.loc[city].shape[0]%batch_size):]
    xtest_roll.loc[city] = xtest_roll.loc[city][(xtest_roll.loc[city].shape[0]%batch_size):]
    ytest_roll.loc[city] = ytest_roll.loc[city][(ytest_roll.loc[city].shape[0]%batch_size):]

In [None]:
# (train, test) input shapes per city after trimming
[
    (xtrain_roll.loc[name].shape, xtest_roll.loc[name].shape)
    for name in ['sj', 'iq']
]

In [None]:
# (train, test) target shapes per city after trimming
[
    (ytrain_roll.loc[name].shape, ytest_roll.loc[name].shape)
    for name in ['sj', 'iq']
]

In [None]:
# Manual check of the trimming arithmetic.
# NOTE(review): 696 and 32 are hard-coded — presumably a previously observed
# 'sj' row count and the batch size; confirm they still match the data.
696%32, ytrain_roll.loc['sj'].shape[0], batch_size

## fit LSTM

In [None]:
# https://keras.io/layers/recurrent/#lstm
from keras.models import Sequential
from keras.layers import Dense, LSTM


In [None]:
def create_model():
    """Build and compile a fresh LSTM regression network.

    The network is a 100-unit LSTM over inputs of shape
    (lahead, number of selected features) with a fixed batch size, followed
    by a single linear output unit. It is compiled with mean-squared-error
    loss and the Adam optimizer.
    """
    n_features = len(selected_features)
    recurrent = LSTM(100,
                     input_shape=(lahead, n_features),
                     batch_size=batch_size)
    readout = Dense(1, activation='linear')

    model = Sequential()
    model.add(recurrent)
    model.add(readout)
    model.compile(loss='mse', optimizer='adam')
    return model

In [None]:
# One independently trained model per city, keyed by city code
mod1 = {}
for city in ['sj', 'iq']:
    print(city)
    model = create_model()
    mod1[city] = model
    print(model.summary())

    # The held-out split of the same city is used as validation data
    validation = (xtest_roll.loc[city], ytest_roll.loc[city])
    # shuffle=False keeps the samples in their existing order during training
    model.fit(
        xtrain_roll.loc[city],
        ytrain_roll.loc[city],
        batch_size=batch_size,
        epochs=10,
        verbose=1,
        validation_data=validation,
        shuffle=False,
    )

## predict on test set

In [None]:
# Zero-filled float placeholder with the same index/columns as the test targets.
# It is kept as float because the raw model outputs are scaled values, not
# integer case counts; the integer cast happens after un-scaling below.
predictions = ytest_roll.copy()*0

predictions.loc['sj'] = mod1['sj'].predict(xtest_roll.loc['sj'], batch_size=batch_size)
predictions.loc['iq'] = mod1['iq'].predict(xtest_roll.loc['iq'], batch_size=batch_size)

# The models were trained on targets scaled by `scaler_ytrain`, so that is the
# scaler that maps predictions back to case counts (a scaler fitted on the test
# targets describes a different range). Cast to int since the label is integer.
predictions.loc[:] = scaler_ytrain.inverse_transform(predictions).astype(int)

predictions.loc['sj'].head()

In [None]:
# Loss of each city's model on that city's held-out windows
[
    (name, mod1[name].evaluate(xtest_roll.loc[name], ytest_roll.loc[name], batch_size=batch_size))
    for name in ['sj','iq']
]

## plot

In [None]:
# One figure per city: actual held-out cases against the model's predictions
for city in ['sj', 'iq']:
    for frame, tag in ((y_test, 'actual'), (predictions, 'predicted')):
        plt.plot(frame.loc[city], label=tag)
    plt.title(city)
    plt.legend()
    plt.show()

## set in submission

In [None]:
# First rows of the submission template for 'sj'
df_all['submission'].loc['sj'].head()

In [None]:
# Zero-filled float placeholder for the submission rows that receive a prediction.
# Per city, the windowed inputs are shorter than the submission template by
# `lahead` rows (dropped in stride_group) plus 2*n_diff rows: n_diff when the
# diff columns were appended to 'features_test' and n_diff again in x_submit
# below. (The former `lahead+n_diff+1` was only equal to this for n_diff == 1.)
predictions = (df_all['submission'][['total_cases']]
               .groupby(level='city', as_index=False)
               .apply(lambda group: group.iloc[(lahead+2*n_diff):])
               .reset_index(level=0, drop=True)
               .copy()
               *0
              ).astype('float')

x_submit = (df_all['features_test']
          .groupby(level='city', as_index=False)
          .apply(lambda group: group.iloc[n_diff:])
          .reset_index(level=0, drop=True)
          [selected_features]
          )
# Scale with the scaler fitted on the training features: the models only know
# that scale, so fitting a new scaler on the submission rows would shift inputs.
scaler_xsubmit = scaler_xtrain
xsubmit_scaled = pd.DataFrame(scaler_xsubmit.transform(x_submit),
                              columns=x_submit.columns, index=x_submit.index)
xsubmit_roll = xsubmit_scaled.groupby(level='city').apply(stride_group)

# Placeholder length and prediction shape for 'sj' should agree
print(
    predictions.loc['sj', 'total_cases'].shape,
    mod1['sj'].predict(xsubmit_roll.loc['sj'], batch_size=batch_size).shape
)

predictions.loc['sj', 'total_cases'] = mod1['sj'].predict(xsubmit_roll.loc['sj'], batch_size=batch_size)
predictions.loc['iq', 'total_cases'] = mod1['iq'].predict(xsubmit_roll.loc['iq'], batch_size=batch_size)

# Undo the target scaling with the scaler the models were trained against,
# then cast to int since the label is integer.
predictions.loc[:] = scaler_ytrain.inverse_transform(predictions).astype(int)

print(predictions.shape)

predictions.loc['sj'].head()

In [None]:
# First ten predictions of every city
predictions.groupby(level='city').head(n=10)

In [None]:
# Work on a copy of the submission template and overwrite its target column
submit = df_all['submission'].copy()
# TODO if this matches indices properly, review the complicated merge in 3.1
submit['total_cases'] = predictions
# Any value still missing after the assignment becomes 0
submit = submit.fillna(0)

In [None]:
# First ten submission rows of every city
submit.groupby(level='city').head(n=10)

## plot

In [None]:
# Both cities' submitted case counts on one figure, plotted against row position
for city in ['sj','iq']:
    city_cases = submit.loc[city, 'total_cases']
    plt.plot(city_cases.values, label=city)

plt.legend()
plt.show()

## Compare to the result of a previous notebook (1-...ipynb or 3.0)

In [None]:
# Earlier submission file to compare against (alternative kept for reference)
#fn_prev = '1-submission_20180530_092740-score_29.csv'
fn_prev = '3.0-submission_20180530_141052.csv'

previous = pd.read_csv('data/interim/%s'%fn_prev)
current = submit.reset_index()
# Left merge keeps every row of the earlier file; overlapping column names get
# the _prev / _curr suffixes
df_prev = previous.merge(current,
                         how='left',
                         on=['city', 'year', 'weekofyear'],
                         suffixes=['_prev', '_curr'])
df_prev = df_prev.set_index(['city', 'week_start_date'])
df_prev.head()

In [None]:
# One figure per city: earlier submission against the current one
for city in ['sj', 'iq']:
    for column, tag in (('total_cases_prev', 'prev'), ('total_cases_curr', 'curr')):
        plt.plot(df_prev.loc[city, column].values, label=tag)
    plt.title(city)
    plt.legend()
    plt.show()

## generate submission file

In [None]:
from src.features.build_features import make_submission