Pipeline
- deseason features (from statsmodels.tsa and notebook 3.3.1)
- [lasso linear](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)
  - good at filtering out features when there are many of them
- either fit lasso directly on the target, or use lasso only for feature reduction and fit another model (e.g. RF or OLS) on the selected features

TODO
- correlation matrix like in the [benchmark](https://github.com/drivendata/benchmarks/blob/master/dengue-benchmark-statsmodels.ipynb)

In [None]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

In [None]:
from src.features.build_features import load_raw

# Load the raw data. The result is used throughout this notebook as a mapping
# keyed by dataset name ('features_train', 'features_test', 'labels_train',
# 'submission').
df_all = load_raw()

# replace with 0.2 output
# NOTE(review): "0.2" presumably refers to an earlier notebook - confirm.
# df_all['labels_train'] = pd.read_pickle('data/processed/is_epidemic.pkl')

# Show which datasets were loaded.
df_all.keys()

In [None]:
# Preview the first two rows of the training features.
df_all['features_train'].head(n=2)

In [None]:
# Preview the first two rows of the training labels.
df_all['labels_train'].head(n=2)

## fillna

In [None]:
# Forward-fill missing values separately for each city, so that a value from
# one city's series is never carried over into the other's.
for k in ['features_train', 'features_test']:
    # group_keys=False keeps the original index: without it, newer pandas
    # versions prepend the group key as an extra index level.
    df_all[k] = (
        df_all[k]
        .groupby('city', group_keys=False)
        .apply(lambda group: group.ffill())
    )
    # Forward-filling cannot repair NaNs at the very start of a series, so
    # check explicitly. `not` replaces `~` here: on a plain Python bool
    # `~True` evaluates to -2, which is truthy, so the old check could pass
    # even when NaNs were still present.
    assert not df_all[k].isnull().any().any(), "NaNs remain in %s after ffill" % k
    print(df_all[k].shape)

## choose features

In [None]:
# Candidate feature sets - keep exactly one assignment active.
#
# (a) subset used by the benchmark
# https://github.com/drivendata/benchmarks/blob/master/dengue-benchmark-statsmodels.ipynb
# selected_features = ['reanalysis_specific_humidity_g_per_kg',
#                      'reanalysis_dew_point_temp_k',
#                      'station_avg_temp_c',
#                      'station_min_temp_c']
#
# (b) all features
# selected_features = df_all['features_train'].columns
#
# (c) all original/trend/seasonal features
# selected_features = df_train.columns
#
# (d) only trend (+ weekofyear)
# selected_features = np.array([x for x in df_train.columns if x.endswith('_trend')])# or x=='weekofyear'])

# Active choice: everything except year and weekofyear.
# The columns are filtered in their original order rather than through a set
# difference: set iteration order for strings changes between interpreter
# sessions, which would silently reshuffle the features (and therefore the
# positional "<j>_trend"/"<j>_resid" names and any lookup by position).
excluded_columns = {'year', 'weekofyear'}
selected_features = np.array(
    [col for col in df_all['features_train'].columns if col not in excluded_columns]
)

# check no missing
# assert len(set(selected_features) - set(df_all['features_train'].columns))==0

selected_features

## plot

In [None]:
# One figure per selected feature, with both cities overlaid.
df_train = df_all['features_train']

cities = ('sj', 'iq')
for feature in selected_features:
    feature_series = df_train[feature]
    for city in cities:
        feature_series.loc[city].plot(figsize=(20, 3), label=city)
    plt.title(feature)
    plt.legend()
    plt.show()

## train/test split

In [None]:
def _split_per_city(frame):
    """Split `frame` into (first 3/4, remainder) separately for each city.

    The cut is positional, so the split is only chronological if the rows of
    each city are already ordered by date (assumed, not checked here).
    The second part is taken as "everything after the cut" rather than
    `tail(n // 4)`, so no row is dropped when a city's row count is not a
    multiple of 4.
    """
    first_parts, last_parts = [], []
    for _, group in frame.groupby(level='city'):
        cut = group.shape[0] * 3 // 4
        first_parts.append(group.iloc[:cut])
        last_parts.append(group.iloc[cut:])
    return pd.concat(first_parts), pd.concat(last_parts)


# split per city
# (alternative source for the features: df_train)
x_train, x_test = (
    part[selected_features] for part in _split_per_city(df_all['features_train'])
)

# (alternative source for the labels: df_all['labels_train'].loc[df_train.index];
#  alternative target: ['is_epidemic'].astype('int'))
y_train, y_test = (
    part['total_cases'] for part in _split_per_city(df_all['labels_train'])
)

# y_train = np.log10(y_train+1)
# y_test = np.log10(y_test+1)

x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
# First two training rows of each city.
x_train.groupby('city').head(n=2)

In [None]:
# First two hold-out rows of each city.
x_test.groupby('city').head(n=2)

In [None]:
# Distinct cities present in the training labels.
set(y_train.reset_index()['city'])

In [None]:
# Summary statistics of the training target, per city.
y_train.groupby('city').describe()#tail(n=15)

## define custom model for deseasoning

In [None]:
from sklearn.base import BaseEstimator
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.utils import check_array


class DeSeason(BaseEstimator):
    """Replace every input column by its trend and residual components.

    Each column is centred on its own mean and decomposed with
    ``seasonal_decompose`` (``two_sided=False``). Only the trend and the
    residual are returned; the seasonal component is not emitted.
    Output columns are named ``<j>_trend`` and ``<j>_resid``, where ``<j>``
    is the positional index of the input column.

    The transformer is stateless: ``fit`` learns nothing, and ``transform``
    decomposes whatever data it is given from scratch.

    Parameters
    ----------
    freq : int
        Seasonal period, forwarded to ``seasonal_decompose`` as ``freq``.
    """

    def __init__(self, freq):
        self.freq = freq

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``.

        Accepts (and ignores) ``X`` and ``y`` so the estimator can be called
        as ``fit(X, y)`` like any other scikit-learn estimator; both are
        optional so the previous zero-argument call keeps working.
        """
        return self

    def transform(self, df_in):
        """Decompose ``df_in``; identical to ``fit_transform`` (no state)."""
        return self.fit_transform(df_in)

    def fit_transform(self, X, y=None):
        """Return a DataFrame with ``<j>_trend`` / ``<j>_resid`` per column.

        Parameters
        ----------
        X : array-like
            Validated with ``check_array``; each column is treated as one
            series.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            One row per input row (default integer index). Missing values
            produced by the decomposition are replaced by 0.
        """
        X = check_array(X)
        pieces = []
        for jjj in range(X.shape[1]):
            column = X[:, jjj]
            centred = column - column.mean()
            decomposition = seasonal_decompose(centred, freq=self.freq, two_sided=False)
            # FIXME (kept from the original): the seasonal component and the
            # original series are currently not part of the output.
            pieces.append(pd.DataFrame({
                "%s_trend" % jjj: decomposition.trend,
                "%s_resid" % jjj: decomposition.resid,
            }))

        # NaNs left by the decomposition are zero-filled instead of dropped,
        # so the number of rows always matches the input.
        return pd.concat(pieces, axis=1).fillna(value=0)
    
# Smoke test: 8 rows x 3 columns. Every column alternates between two levels
# (period 2); the first column additionally drifts upwards by 0.1 per cycle.
mdl = DeSeason(freq=2)
df_in = np.array([
    [1.0,2.0,3.0],[4.0,5.0,6.0],
    [1.1,2.0,3.0],[4.1,5.0,6.0],
    [1.2,2.0,3.0],[4.2,5.0,6.0],
    [1.3,2.0,3.0],[4.3,5.0,6.0],
])
df_out = mdl.fit_transform(df_in)
# Display the trend/resid columns for visual inspection.
df_out

## fit


In [None]:
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel

#import statsmodels.api as sm
#import statsmodels.formula.api as smf

In [None]:
def create_model(alpha, random_state=None):
    """Build the (unfitted) modelling pipeline.

    Steps, in order:
      1. ``deseason``  - DeSeason(freq=52): trend/residual per feature
      2. ``scaler``    - MinMaxScaler
      3. ``poly``      - PolynomialFeatures with its default degree
      4. ``reducer``   - SelectFromModel around a positive-coefficient Lasso
      5. ``regressor`` - RandomForestRegressor on the surviving features

    Parameters
    ----------
    alpha : float
        Regularisation strength of the Lasso used for feature selection.
    random_state : int or None, optional
        Seed forwarded to the random forest. The default ``None`` keeps the
        previous (unseeded, non-reproducible) behaviour.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    # Alternatives tried earlier as the whole model:
    #   RandomForestRegressor / RandomForestClassifier(n_estimators=100, min_samples_split=5, min_samples_leaf=3)
    #   Lasso(alpha=1.)
    deseason = DeSeason(freq=52)

    # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html#sklearn.preprocessing.MinMaxScaler
    scaler = MinMaxScaler()

    poly = PolynomialFeatures()  # degree=2

    # `normalize` is no longer passed: False was its default, and the
    # argument has been removed from recent scikit-learn releases.
    lasso = Lasso(alpha=alpha, positive=True)

    # http://scikit-learn.org/stable/modules/feature_selection.html#l1-based-feature-selection
    reducer = SelectFromModel(lasso, prefit=False)

    # Alternatives for the final step: LinearRegression(),
    # FunctionTransformer(lambda X: np.log10(X+1))
    regressor = RandomForestRegressor(
        n_estimators=100,
        min_samples_split=5,
        min_samples_leaf=3,
        random_state=random_state,
    )

    return Pipeline([
        ('deseason', deseason),
        ('scaler', scaler),
        ('poly', poly),
        ('reducer', reducer),
        ('regressor', regressor),
    ])


# One pipeline per city, each with its own Lasso alpha; fit() hands back the
# fitted pipeline, so construction and fitting are chained.
mod1_sj = create_model(alpha=1.).fit(x_train.loc['sj'], y_train.loc['sj'])
mod1_iq = create_model(alpha=.1).fit(x_train.loc['iq'], y_train.loc['iq'])
mod1_iq

## check feature importances

In [None]:
# The final pipeline step is a RandomForestRegressor, which exposes
# `feature_importances_` (it has no `coef_`). The two model lengths are the
# numbers of polynomial features kept by each city's reducer, so they need
# not equal len(selected_features) or each other.
len(selected_features), len(mod1_sj.named_steps['regressor'].feature_importances_), len(mod1_iq.named_steps['regressor'].feature_importances_)

In [None]:
def _regressor_importances(model):
    """Importances of the final regressor, labelled by polynomial feature name.

    The regressor only sees the polynomial features kept by the 'reducer'
    step, so the names are filtered with the reducer's support mask before
    being paired with the importances.
    """
    poly = model.named_steps['poly']
    # Newer scikit-learn renamed get_feature_names to get_feature_names_out.
    name_getter = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
    names = np.asarray(name_getter())
    kept = model.named_steps['reducer'].get_support()
    importances = model.named_steps['regressor'].feature_importances_
    return pd.Series(importances, index=names[kept])


# Each city's reducer may keep a different subset of features; aligning the
# two Series on the feature name and filling the gaps with 0 marks a feature
# that was dropped for a city as "no importance" there.
df_coef = pd.DataFrame({
    'sj1': _regressor_importances(mod1_sj),
    'iq1': _regressor_importances(mod1_iq),
}).fillna(0)
df_coef.index.name = 'f'

# Show only features that matter for at least one city
# (0.02 is the threshold used for random-forest importances; 0.1 was for lasso).
df_coef[(abs(df_coef['iq1'])>.02) | (abs(df_coef['sj1'])>.02)]

In [None]:
# Look up the names of the features at positions 1 and 10 of
# selected_features (the positions are hard-coded, so the result depends on
# the order in which selected_features was built).
selected_features[[1,10]]

## predict on train to visualize

In [None]:
# In-sample predictions, only to eyeball the fit (alpha = 1 was used here).

# Zero-filled integer Series with the same index as the training target.
predictions = (y_train.copy() * 0).astype('int')

for city_code, city_model in (('sj', mod1_sj), ('iq', mod1_iq)):
    predictions.loc[city_code] = city_model.predict(x_train.loc[city_code])
# (log-target variant: predictions = 10**predictions.astype('int'))

for city in ('sj', 'iq'):
    actual_cases = y_train.loc[city]
    predicted_cases = predictions.loc[city]
    plt.plot(actual_cases, label='actual')
    plt.plot(predicted_cases, label='predicted')
    plt.title(city)
    plt.legend()
    plt.show()

## predict on test set

In [None]:
# Hold-out predictions: start from a zero-filled integer Series indexed like
# the hold-out target, then fill in each city's block.
predictions = (y_test.copy() * 0).astype('int')

for city_code, city_model in (('sj', mod1_sj), ('iq', mod1_iq)):
    predictions.loc[city_code] = city_model.predict(x_test.loc[city_code])

# (log-target variant: predictions = (10**predictions).astype('int'))
predictions.loc['sj'].head()

In [None]:
# Hold-out score of each city's pipeline (the pipeline's score() on the
# hold-out features and target).
'sj', mod1_sj.score(x_test.loc['sj'], y_test.loc['sj']), 'iq', mod1_iq.score(x_test.loc['iq'], y_test.loc['iq'])

## plot

In [None]:
# Actual vs. predicted cases on the hold-out part, one figure per city.
for city in ('sj', 'iq'):
    actual_cases = y_test.loc[city]
    predicted_cases = predictions.loc[city]
    plt.plot(actual_cases, label='actual')
    plt.plot(predicted_cases, label='predicted')
    plt.title(city)
    plt.legend()
    plt.show()

## re-fit on complete dataset

In [None]:
# Features of the submission period.
df_test = df_all['features_test']

# Compare the size of the submission features with the training target.
df_test.shape, df_all['labels_train']['total_cases'].shape

In [None]:
# Check whether 'weekofyear' is a column of the test and training features.
'weekofyear' in df_test.columns, 'weekofyear' in df_train.columns

In [None]:
# Re-fit both city models on the complete training set.
# note avoiding class bias
x_retrain = df_all['features_train'][selected_features]  # or df_train[selected_features]

# Alternative target that skips the first 52 rows of each city:
#y_retrain = ( df_all['labels_train']
#          .groupby('city', as_index=False)
#          .apply(lambda group: group.tail(n=group.shape[0]-52))
#          .reset_index(level=0, drop=True)
#         )['total_cases']
y_retrain = df_all['labels_train']['total_cases']
# y_retrain = np.log10(y_retrain+1)

# fit() hands back the fitted pipeline, so construction and fitting are chained.
mod1_sj = create_model(alpha=1).fit(x_retrain.loc['sj'], y_retrain.loc['sj'])
mod1_iq = create_model(alpha=.1).fit(x_retrain.loc['iq'], y_retrain.loc['iq'])
mod1_iq

## set in submission

In [None]:
# Compare the shapes of the submission template, training and test features.
df_all['submission'].shape, df_train.shape, df_test.shape

In [None]:
# Zero-filled integer frame shaped like the submission's total_cases column.
predictions = (df_all['submission'][['total_cases']] * 0).astype('int')

index_columns = ['city', 'week_start_date']

# Predict each city with its own model and label the result with
# (city, week_start_date) so it can be aligned with the submission index.
sj_pred = mod1_sj.predict(df_test.loc['sj', selected_features])
p1 = pd.DataFrame({
    'city': 'sj',
    'week_start_date': df_test.loc['sj'].index,
    'pred': sj_pred,
}).set_index(index_columns)

iq_pred = mod1_iq.predict(df_test.loc['iq', selected_features])
p2 = pd.DataFrame({
    'city': 'iq',
    'week_start_date': df_test.loc['iq'].index,
    'pred': iq_pred,
}).set_index(index_columns)

p3 = pd.concat([p1, p2], axis=0)

# Left-join on the index; rows without a prediction end up as 0.
merged = predictions.merge(p3, how='left', left_index=True, right_index=True)
merged = merged.fillna(value=0)
# (log-target variant: merged['pred'] = 10**merged['pred'].astype('int'))

# Move the predictions into total_cases and drop the helper column.
merged['total_cases'] = merged.pop('pred')
predictions = merged

In [None]:
# Rows 55-59 of the predictions (the last five of the first sixty).
predictions.head(n=60).tail(n=5)

In [None]:
# Start from the submission template without its placeholder total_cases
# column (drop returns a new frame, so the template itself is untouched).
# TODO Will this match indices properly?
# (direct assignment alternative: submit['total_cases'] = predictions)
submit = df_all['submission'].drop('total_cases', axis=1)

# Bring the predicted total_cases in through an index-aligned left join.
submit = submit.merge(predictions, how='left', left_index=True, right_index=True)

# Template rows without a matching prediction get 0.
submit['total_cases'] = submit['total_cases'].fillna(value=0)
# (log-target variant: submit['total_cases'] = (10**submit['total_cases']).astype('int'))

In [None]:
# Size of the final submission frame.
submit.shape

In [None]:
# First two submission rows of each city.
submit.groupby('city').head(n=2)

## plot

In [None]:
# Predicted total_cases of both cities in a single figure.
figure_size = (20, 3)
for city in ('sj', 'iq'):
    city_cases = submit.loc[city, 'total_cases']
    city_cases.plot(figsize=figure_size, label=city)

plt.legend()
plt.show()

## generate submission file

In [None]:
from src.features.build_features import make_submission