In [None]:
from matplotlib import pyplot as plt
import pandas as pd
import time

# https://keras.io/layers/recurrent/#lstm
from keras.models import Sequential
from keras.layers import Dense, LSTM, Lambda, Dropout, Embedding, Flatten

# https://keras.io/layers/recurrent/#lstm
from keras.models import Model
from keras.layers import Input, RepeatVector, TimeDistributed, Concatenate

import numpy as np

In [None]:
# use 0.2 data
df_is_epidemic = pd.read_pickle('data/processed/0.2A-is_epidemic.pkl')

In [None]:
# load 5.1 data (target, feature and meta frames)
df_targ = pd.read_pickle('data/processed/5.1B-df_targ.pkl')
df_feat_2 = pd.read_pickle('data/processed/5.1B-df_feat_2.pkl')
df_meta = pd.read_pickle('data/processed/5.1B-df_meta.pkl')

# match indices: select the rows of df_meta by df_targ's index, in df_targ's order
df_meta = df_meta.loc[df_targ.index]

In [None]:
df_targ.shape, df_is_epidemic.shape

In [None]:
# note: pandas aligns the assigned Series on the index automatically, even though
# the two frames do not have the same dimensions (unmatched rows of df_targ become NaN)
df_targ['is_epidemic'] = df_is_epidemic['is_epidemic']

In [None]:
df_targ.tail(n=2)

In [None]:
df_is_epidemic.head(n=2)

## selected features

In [None]:
# all that make sense: every '*_trend' column that does not start with 'weekofyear'
# alternatives that were tried:
#   (x.endswith('_trend') and not x.startswith('weekofyear')) or x=='weekofyear_original'
#   x.endswith('_original') and not x.startswith('weekofyear')
selected_features = [
    col_name
    for col_name in df_feat_2.columns
    if col_name.endswith('_trend') and not col_name.startswith('weekofyear')
]

selected_features

## LSTM params

In [None]:
# window length: number of consecutive time steps in each rolling window fed to the LSTM
lahead = 10 # 60 yields no classification results
# samples per batch; the per-city row counts are truncated to a multiple of this value
batch_size = 16 # smaller batches lead to less loss of data when truncating non-multiples of batch_size

## create rolling windows for LSTM

In [None]:
def stride_group(group, n_back):
    """Turn a row-ordered frame into overlapping windows of ``n_back`` rows.

    Parameters
    ----------
    group : pandas.DataFrame
        One row per time step, one column per feature.
    n_back : int
        Number of rows in each window.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(group) - n_back + 1, n_back, n_columns)``.
        Window ``k`` holds rows ``k .. k + n_back - 1`` of ``group``; position 0
        on axis 1 is the oldest row and position ``n_back - 1`` the latest.
    """
    # lagged[i] is the frame delayed by i rows, i.e. lagged[i][t] == row t - i
    lagged = [group.shift(lag).values for lag in range(n_back)]

    # axes: (row, column, lag)
    windows = np.stack(lagged, axis=2)
    # the first n_back - 1 rows have an incomplete history (NaN from shift)
    windows = windows[(n_back - 1):, :, :]
    # axes: (row, lag, column)
    windows = np.swapaxes(windows, 1, 2)
    # reverse the lag axis so that time runs forward inside each window
    return np.flip(windows, axis=1)


def stride_group_2(x):
    """Apply ``stride_group`` with the notebook-wide window length ``lahead``."""
    return stride_group(x, lahead)

## drop the first rows of each city so that its row count is a multiple of batch_size


In [None]:
def my_truncate(df, multiple=None):
    """Drop leading rows of every city so its row count is a multiple of ``multiple``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index has a level named ``'city'``.
    multiple : int, optional
        Required divisor of the per-city row count. Defaults to the
        notebook-wide ``batch_size``.

    Returns
    -------
    pandas.DataFrame
        For each city, the last ``n - n % multiple`` rows (``n`` = rows of that
        city), carrying the same index levels as ``df``.
    """
    if multiple is None:
        multiple = batch_size

    def _keep_tail(group):
        # keep the trailing rows; a group shorter than `multiple` becomes empty
        n_rows = group.shape[0]
        return group.tail(n_rows - n_rows % multiple)

    # group_keys=False keeps the original index untouched in every pandas version.
    # The previous `as_index=False` + `reset_index(level=0, drop=True)` relied on
    # apply prepending a key level, which pandas < 2.0 skips when no row is
    # dropped -- the reset_index then removed the 'city' level instead.
    return df.groupby(level='city', group_keys=False).apply(_keep_tail)

## prepare data to fit

In [None]:
df_train = df_feat_2.loc[~df_meta['submit']]

In [None]:
# training features: the selected columns of the non-submission rows
x_retrain = df_train[selected_features].copy()
# training targets: the rows of df_targ that are not flagged as submission rows
y_retrain = df_targ[~df_meta['submit']].copy()
y_retrain['is_epidemic'] = y_retrain['is_epidemic'].astype('int') # [['total_cases']]
# sanity check: shapes of both frames and number of target rows per city
x_retrain.shape, y_retrain.shape, y_retrain.groupby('city').size()

In [None]:
# is_epidemic over time for both cities on one axis;
# the 'iq' curve is shifted up by 1.2 so the two step curves do not overlap
y_retrain['is_epidemic'].loc['sj'].plot(label='sj')
(y_retrain['is_epidemic']+1.2).loc['iq'].plot(label='iq+1.2')
plt.legend()
plt.show()

## label each epidemic event

In [None]:
# An epidemic starts on a row where is_epidemic is 1 and the previous row OF THE
# SAME CITY is 0 (or does not exist). Looking at the previous row per city keeps
# an epidemic at the end of one city from sharing an id with one at the start of
# the next city, and gives an epidemic on the very first row its own id.
is_epi = y_retrain['is_epidemic'].astype('int')
prev_epi = is_epi.groupby(level='city').shift(1).fillna(0)
epidemic_start = (is_epi.astype(bool) & (prev_epi == 0)).astype('int')

# running count of epidemic starts = id of the epidemic a row belongs to (1, 2, ...)
y_retrain['epidemic_id'] = epidemic_start.cumsum(axis=0).astype('float')
# rows outside any epidemic carry no id
y_retrain.loc[~is_epi.astype(bool), 'epidemic_id'] = np.nan

In [None]:
y_retrain[~y_retrain['is_epidemic'].astype(bool)].head(n=2)

In [None]:
y_retrain[ y_retrain['is_epidemic'].astype(bool)].head(n=2)

In [None]:
y_retrain[ y_retrain['is_epidemic'].astype(bool)].tail(n=2)

In [None]:
y_retrain.groupby('epidemic_id').size()

## filter

In [None]:
# rolling feature windows: one 3-D array (window, time step, feature) per city
xretrain_roll = x_retrain.groupby(level='city').apply(stride_group_2)

# drop the first (lahead-1) rows per city: they have no complete window, so the
# remaining target rows line up with the last time step of each window.
# group_keys=False leaves the (city, week_start_date) index untouched; the former
# `as_index=False` + `reset_index(level=0, drop=True)` dropped the 'city' level on
# pandas < 2.0 whenever no row was removed (lahead == 1).
yretrain_roll = (y_retrain
                 .groupby(level='city', group_keys=False)
                 .apply(lambda group: group.iloc[(lahead-1):])
                )

In [None]:
# make the number of windows per city a multiple of batch_size by dropping the
# leading windows; the targets are trimmed the same way by my_truncate
for city in ['sj', 'iq']:
    city_windows = xretrain_roll.loc[city]
    to_drop = city_windows.shape[0] % batch_size
    print('drop non-multiple', city, to_drop)
    xretrain_roll.loc[city] = city_windows[to_drop:]

yretrain_roll = my_truncate(yretrain_roll)

In [None]:
xretrain_roll.loc['sj'].shape, xretrain_roll.loc['iq'].shape, yretrain_roll.shape

## calculate the maximum amplitude (peak `total_cases`) of each epidemic

In [None]:
# peak number of cases of every labelled epidemic, as a two-column frame
# (epidemic_id, epidemic_max); rows without an epidemic_id are ignored by groupby
yretrain_epidemicmax = (
    yretrain_roll
    .groupby('epidemic_id')['total_cases']
    .max()
    .rename('epidemic_max')
    .reset_index()
)
yretrain_epidemicmax

In [None]:
# attach each row's epidemic peak (NaN for rows outside an epidemic).
# Dropping a previously merged 'epidemic_max' first makes the cell safe to re-run:
# without it a second run produces 'epidemic_max_x' / 'epidemic_max_y' columns.
yretrain_roll = (
    yretrain_roll
    .drop(columns=['epidemic_max'], errors='ignore')
    .reset_index()
    .merge(yretrain_epidemicmax, on='epidemic_id', how='left')
    .set_index(['city', 'week_start_date'])
)

In [None]:
yretrain_roll.head(n=2)

In [None]:
yretrain_roll[yretrain_roll['epidemic_id']==2].head(n=2)

## ~~fit model: lstm regression on epidemic_max or total_cases~~

Both only detected the global average

## plot trained result

## fit model: AE coupled with regression on target

In [None]:
def create_coupled():
    """Build and compile the coupled autoencoder + regressor Keras model.

    The model has three named inputs and two outputs:

    * input ``raw_features``: windows of shape ``(lahead, len(selected_features))``
    * input ``is_epidemic``: shape ``(1,)`` -- declared and listed as a model
      input, but not connected to any output (its embedding is disabled below)
    * input ``weekofyear``: shape ``(1,)``, fed to an embedding
    * output ``reconstructed_features``: the decoder's reconstruction of the
      input window, one vector of ``len(selected_features)`` values per time step
    * output ``regressed_output``: a single regressed value per sample

    Reads the notebook globals ``selected_features``, ``lahead`` and ``batch_size``.

    Returns
    -------
    keras.models.Model
        Model compiled with the ``'mae'`` loss and the ``'adam'`` optimizer.
    """
    # size of the encoding and of the decoder's hidden state
    lstm_dim_1 = 15
    len_feat = len(selected_features)
    input_shape = (lahead, len_feat, )

    # inputs
    feat_raw = Input(shape=input_shape, name='raw_features')
    
    # encoder: collapses the whole window into one vector of lstm_dim_1 values
    feat_enc = feat_raw
    feat_enc = LSTM(
              lstm_dim_1,
              #input_shape=input_shape,
              batch_size=batch_size,
              return_sequences=False,
              activation='tanh',
              name='encoded_features')(feat_enc)

    # decoder: repeat the encoding lahead times, run an LSTM over the copies and
    # map every time step back to len_feat values
    feat_rec = feat_enc
    feat_rec = RepeatVector(lahead, input_shape=(lstm_dim_1, ))(feat_rec)
    feat_rec = LSTM(lstm_dim_1,
              #input_shape=(lahead, len(selected_features)),
              batch_size=batch_size,
              return_sequences=True,
              dropout=0.2,
              activation='tanh')(feat_rec)
    feat_rec = TimeDistributed(
        Dense(len_feat, activation='linear'),
        name='reconstructed_features'
    )(feat_rec)

    # append to encoded features
    # 2 meta features: is_epidemic and weekofyear
    # (only weekofyear is actually concatenated; the is_epidemic embedding is
    # disabled by the bare string literal below)
    is_epidemic = Input(shape=(1, ), name='is_epidemic')
    """
    embed_epi = is_epidemic
    # 2 is vocabulary length, i.e. (0,1)
    # 4 is dimensions to use in embedding
    embed_epi = Embedding(2, 4, input_length=1, name='embed_epi_matrix')(embed_epi)
    embed_epi = Flatten(name='embed_epi_flat')(embed_epi)
    """

    weekofyear = Input(shape=(1, ), name='weekofyear')
    embed_woy = weekofyear
    # 53+1 is vocabulary length ... remember that weekofyear is not 0-based
    # 4 is dimensions to use in embedding
    embed_woy = Embedding(53+1, 4, input_length=1, name='embed_woy_matrix')(embed_woy)
    embed_woy = Flatten(name='embed_woy_flat')(embed_woy)

    feat_enc_and_meta = Concatenate(axis=-1, name='enc_and_meta')([feat_enc, embed_woy]) # embed_epi

    # regressor: small dense head on top of [encoding, week-of-year embedding]
    out = feat_enc_and_meta # feat_enc
    out = Dense(5, activation='relu')(out)
    out = Dense(1, activation='linear')(out)
    # multiply the linear output by a fixed factor of 10
    out = Lambda(lambda x: x*10, name='regressed_output')(out)
    
    # create model
    model_all = Model(inputs = [feat_raw, is_epidemic, weekofyear], outputs = [feat_rec, out])
    # a single loss name is given, so 'mae' is used for both outputs
    model_all.compile(loss='mae', optimizer='adam')
    return model_all

In [None]:
# initialize
mod2 = {}

In [None]:
# build one separate model per city, stored in mod2 under the city key,
# and print its layer summary
for city in ['sj', 'iq']:
    print(city)
    #if city=='sj': continue # FIXME fitting sj model
    mod2[city] = create_coupled()
    #if city=='iq': continue # FIXME skipping iq model
    mod2[city].summary()
    

In [None]:
# actual fit: one model per city, trained jointly on window reconstruction and
# on the regression target
for city in ['sj', 'iq']:
    # if city=='iq': continue # FIXME skipping iq for now
    print(city)
    print(time.ctime(),'fit start')

    # inputs, keyed by the names of the model's Input layers
    # (filter each entry with yretrain_roll['is_epidemic'] to only train on the
    # subset of epidemics)
    fit_inputs = {
        'raw_features': xretrain_roll.loc[city],
        'is_epidemic': yretrain_roll.loc[city, ['is_epidemic']],
        'weekofyear': yretrain_roll.loc[city, ['weekofyear']],
    }
    # targets, keyed by the names of the model's output layers
    # (alternative regression target that was tried: 'epidemic_max')
    fit_targets = {
        'reconstructed_features': xretrain_roll.loc[city],
        'regressed_output': yretrain_roll.loc[city, 'total_cases'],
    }

    # NOTE(review): with initial_epoch=200 and epochs=300 Keras only runs epochs
    # 201..300, i.e. 100 passes. On a model freshly created by create_coupled()
    # this looks like a leftover from resuming an earlier run -- confirm whether
    # initial_epoch should be 0 for a Restart & Run All.
    history = mod2[city].fit(
        fit_inputs,
        fit_targets,
        batch_size=batch_size,
        epochs=300,  # other values tried: 250, 500, 1000
        initial_epoch=200,
        verbose=2,
        #validation_data=None,
        shuffle=False,
    )
    print(time.ctime(),'fit end')

    # ignore first few points since large relative to others
    loss_curve = history.history['loss'][5:]
    plt.plot(loss_curve, label='loss')
    #plt.plot(history.history['val_loss'], label='val_loss')
    plt.legend()
    plt.title(city)
    plt.show()

## save model

## plot trained result

In [None]:
def my_predict(city, np_in, index):
    """Run the model of ``city`` on ``np_in``, plot the reconstruction, return the regression.

    Parameters
    ----------
    city : str
        Key of the model in ``mod2``.
    np_in : dict
        Model inputs keyed by input-layer name; must contain ``'raw_features'``.
    index : pandas.Index
        One label per sample, used as the x axis of the plots and as the
        ``week_start_date`` level of the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(city, week_start_date)`` with the regressed output in the
        column ``'epidemic_max'``.
    """
    reconstructed, regressed = mod2[city].predict(np_in, batch_size=batch_size)
    raw_windows = np_in['raw_features']

    # one figure per feature: position 0 of every window, input vs reconstruction
    for feat_pos, _ in enumerate(selected_features):
        comparison = pd.DataFrame({
            'actual': pd.Series(raw_windows[:, 0, feat_pos], index=index),
            'pred': pd.Series(reconstructed[:, 0, feat_pos], index=index),
        })
        comparison.plot(figsize=(20, 3))
        plt.title('%s / feat %i:' % (city, feat_pos))
        plt.legend()
        plt.show()

    predictions = pd.DataFrame({
        'epidemic_max': regressed.squeeze(),
        'city': city,
        'week_start_date': index,
    })
    return predictions.set_index(['city', 'week_start_date'])

# predict on the training windows of both cities and stack the per-city results
# (to restrict to epidemics, filter every input and the index with
#  yretrain_roll.loc[city, 'is_epidemic'].astype('bool'))
yretrain_pred = pd.concat(
    [
        my_predict(
            city,
            {
                'raw_features': xretrain_roll.loc[city],
                'is_epidemic': yretrain_roll.loc[city, ['is_epidemic']],
                'weekofyear': yretrain_roll.loc[city, ['weekofyear']],
            },
            yretrain_roll.loc[city].index,
        )
        for city in ['sj', 'iq']
    ],
    axis=0,
)

# reverse log10 transform
# y_pred['total_cases'] = ((10**((y_pred['total_cases']).clip(upper=3)))-1).astype(int)

In [None]:
# one figure per city: regressed output vs. actual total_cases on the training rows
for city in ['sj','iq']:
    (yretrain_pred.loc[city]['epidemic_max']).plot(label='predicted', style='.')
    # alternative reference series that was tried: epidemic_max
    yretrain_roll.loc[city]['total_cases'].astype('int').plot(label='actual', figsize=(20,3), style='.')
    plt.legend()
    plt.show()

## load predicted `is_epidemic` for submission

In [None]:
# lookup table week_start_date -> weekofyear, one row per distinct pair
df_dates = df_targ.reset_index()[['week_start_date', 'weekofyear']].drop_duplicates()

isepi_pred = (
    pd.read_pickle('data/processed/4.1A-ysubmit_pred.pkl')
    .reset_index()
    # fix index name
    .rename(columns={'week_of_year': 'week_start_date'})
    # append weekofyear
    .merge(df_dates, how='left', on='week_start_date')
    # set index again
    .set_index(['city', 'week_start_date'])
)
# threshold probability: 1 when the predicted value is at least 0.5, else 0
isepi_pred['is_epidemic'] = (isepi_pred['is_epidemic'] >= 0.5).astype('int')

isepi_pred.shape

## predict `total_cases` on submission (using the predicted `is_epidemic`)

In [None]:
# features of the rows flagged for submission
x_submit = df_feat_2.loc[df_meta['submit'], selected_features].copy()

# rolling feature windows: one 3-D array per city
xsubmit_roll = x_submit.groupby(level='city').apply(stride_group_2)

# drop non-batch_size multiple (leading windows) per city
for city in ['sj','iq']:
    to_drop = xsubmit_roll.loc[city].shape[0]%batch_size
    print('non multiple', city, to_drop)
    xsubmit_roll.loc[city] = xsubmit_roll.loc[city][to_drop:]

# choose any field from x_submit just to get the index (values are zeroed out).
# group_keys=False leaves the (city, week_start_date) index untouched; the former
# `as_index=False` + `reset_index(level=0, drop=True)` dropped the 'city' level on
# pandas < 2.0 whenever no row was removed (lahead == 1).
ysubmit_roll = (x_submit[x_submit.columns[:1]]
                .groupby(level='city', group_keys=False)
                .apply(lambda group: group.iloc[(lahead-1):])
                * 0
               )
ysubmit_roll = my_truncate(ysubmit_roll)

#  get the is_epidemic prediction, for the same index as above
isepipred_roll = isepi_pred.loc[ysubmit_roll.index]

x_submit.shape, xsubmit_roll.loc['sj'].shape, xsubmit_roll.loc['iq'].shape, ysubmit_roll.shape, isepi_pred.shape, isepipred_roll.shape

In [None]:
# predict per city on the submission windows, then stack the per-city frames
submit_pred_parts = []
for city in ['sj', 'iq']:
    print('shapes', isepipred_roll.loc[city].shape, xsubmit_roll.loc[city][:,-1:,0].shape)
    submit_inputs = {
        'raw_features': xsubmit_roll.loc[city],
        'is_epidemic': isepipred_roll.loc[city, ['is_epidemic']],
        'weekofyear': isepipred_roll.loc[city, ['weekofyear']],
    }
    submit_pred_parts.append(
        my_predict(city, submit_inputs, ysubmit_roll.loc[city].index)
    )

ysubmit_pred = pd.concat(submit_pred_parts, axis=0)

# reverse log10 transform
# y_pred['total_cases'] = ((10**((y_pred['total_cases']).clip(upper=3)))-1).astype(int)

In [None]:
# regressed output on the submission rows, both cities on one axis
for city in ['sj','iq']:
    (ysubmit_pred.loc[city]['epidemic_max']).plot(figsize=(20,3), label=city)

plt.title('submission')
plt.legend()
plt.show()

In [None]:
# thresholded (0/1) is_epidemic prediction that was loaded for the submission rows
for city in ['sj','iq']:
    isepi_pred.loc[city, 'is_epidemic'].plot(figsize=(20,3), label=city)

plt.title('is_epidemic')
plt.legend()
plt.show()

## set in submission

In [None]:
from src.features.build_features import load_raw
df_all = load_raw()

submit = df_all['submission'].copy()
# TODO if this matches indices properly, review the complicated merge in 3.1
# (the assignment aligns on the index; rows without a prediction become NaN)
submit['total_cases'] = ysubmit_pred['epidemic_max']
# rows that got no prediction are submitted as 0
submit = submit.fillna(value=0)
# case counts must be non-negative integers: the linear output can go below 0,
# and astype('int') alone would truncate toward zero instead of rounding
submit['total_cases'] = submit['total_cases'].clip(lower=0).round().astype('int')

## plot

In [None]:
# final total_cases column of the submission frame, both cities on one axis
for city in ['sj','iq']:
    submit.loc[city, 'total_cases'].plot(label=city, figsize=(20,3))
    
plt.legend()
plt.show()

## generate submission file

In [None]:
from src.features.build_features import make_submission

In [None]:
make_submission(submit.reset_index())