# Data Preprocessing

In [None]:
''' Loading packages '''
import gc
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential
from keras.layers import SimpleRNN, LSTM, Dense
from keras.callbacks import EarlyStopping

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

In [None]:
''' Loading data '''
# Directory holding the CSV files and the two file names read from it.
FILE_PATH = ".//train_test//"

TRAIN_FILE = "en_train.csv"
TEST_FILE = "en_test.csv"

# Read the training file first, then the test file.
train_set, test_set = (
    pd.read_csv(FILE_PATH + csv_name) for csv_name in (TRAIN_FILE, TEST_FILE)
)

# Remove the 'lang' column from both frames in place.
train_set.drop('lang', axis=1, inplace=True)
test_set.drop('lang', axis=1, inplace=True)

In [None]:
''' Selecting number of days for training '''
NUM_DAYS = 180

# Keep 'Page' followed by the last NUM_DAYS columns of the training frame.
date_range = ['Page'] + train_set.columns[-NUM_DAYS:].tolist()
train_set = train_set.loc[:, date_range]

In [None]:
''' Replacing null values with median or logmean '''
def logmean(x):
    '''
    Return expm1(mean(log1p(x))): the mean taken in log1p space and mapped back.

    Any NaN in `x` propagates to the result because np.mean is used.
    '''
    log_values = np.log1p(x)
    mean_of_logs = np.mean(log_values)
    return np.expm1(mean_of_logs)


def fill_median():
    '''
    Build one imputation value per row of the module-level `train_set`.

    Every column after the first is treated as data. A row that is entirely
    NaN receives the NaN-ignoring median of the whole data matrix; any other
    row receives its own NaN-ignoring median.

    Returns a 1-D float array with one entry per row of `train_set`.
    '''
    day_values = train_set.iloc[:, 1:].values
    overall_median = np.nanmedian(day_values.ravel())

    def _row_value(row_values):
        # Fall back to the overall median when the row has no observations.
        if np.isnan(row_values).all():
            return overall_median
        return np.nanmedian(row_values)

    return np.array([_row_value(row_values) for row_values in day_values], dtype=float)


def fill_logmean():
    '''
    Build one log-mean imputation value per row of the module-level `train_set`.

    Every column after the first is treated as data. A row with at least one
    observed (non-NaN) value receives the logmean of its observed values; a
    row that is entirely NaN receives the logmean of all observed values in
    the matrix.

    Returns a 1-D float array with one entry per row of `train_set`. The
    fallback is NaN only when the matrix contains no observed value at all.
    '''
    data_values = np.array(train_set.iloc[:, 1:])
    observed = ~np.isnan(data_values)

    # The overall fallback must ignore NaNs; otherwise np.mean propagates them
    # and fully-missing rows would be "imputed" with NaN.
    if observed.any():
        overall_logmean = logmean(data_values[observed])
    else:
        overall_logmean = np.nan

    impute_values = np.empty(shape=(train_set.shape[0],))
    for row, (row_values, row_observed) in enumerate(zip(data_values, observed)):
        if row_observed.any():
            # Select the observed entries without writing them back into the
            # fixed-width row (a shorter array cannot be assigned to it).
            impute_values[row] = logmean(row_values[row_observed])
        else:
            impute_values[row] = overall_logmean

    return impute_values


# Alternative imputation (per-row median) kept for switching:
# train_set['impute_values'] = fill_median()
train_set['impute_values'] = fill_logmean()

# Column by column, fill missing entries from the temporary per-row
# 'impute_values' column (fillna with a Series matches on the row index),
# then remove that helper column again.
train_set.iloc[:, 1:] = train_set.iloc[:, 1:].apply(
    lambda column: column.fillna(value=train_set['impute_values'])
)
train_set.drop('impute_values', axis=1, inplace=True)

In [None]:
''' Applying Min/Max Scaler '''
# Fit the scaler on every column after the first, then store the scaled
# values in a new frame with default integer row and column labels.
sc = MinMaxScaler().fit(train_set.iloc[:, 1:].values)
train_set_sc = pd.DataFrame(sc.transform(train_set.iloc[:, 1:].values))

In [None]:
''' Clean Memory '''
# Unbind the unscaled frame and run a garbage-collection pass.
# NOTE(review): once this runs, the name train_set no longer exists; any cell
# executed afterwards that still reads it raises NameError.
del train_set
gc.collect()

# RNN Model


In [None]:
def rnn_model():
    '''
    Forecast 60 steps ahead, one step at a time, with a SimpleRNN.

    Reads the module-level `train_set_sc` frame (scaled values, one row per
    series). The first 179 columns are the initial features, column 179 is the
    initial target, and columns 1..179 are the initial prediction window.
    After every step the newest prediction is appended to both windows, the
    oldest column is dropped, and the same model is trained further.

    Returns a DataFrame with columns 'pred_0' .. 'pred_59', one row per row of
    `train_set_sc`, in the scaled space.
    '''
    n_lags = 179   # width of the feature window fed to the network
    horizon = 60   # number of steps to forecast

    # Model architecture
    model = Sequential()
    model.add(SimpleRNN(units=128, input_dim=n_lags, activation='tanh', dropout=0.2, recurrent_dropout=0.0))
    model.add(Dense(units=1, activation='linear'))
    model.compile(optimizer='adam', loss='mse')

    # train/test split
    X_train = train_set_sc.iloc[:, 0:n_lags]
    y_train = train_set_sc.iloc[:, n_lags]  # 12/31/16
    X_test = train_set_sc.iloc[:, 1:n_lags + 1]

    # Built up column by column; no pre-allocation is needed (and the unscaled
    # training frame is not available here to size one).
    predictions = None

    # predict one day at a time
    for i in range(horizon):

        if i % 5 == 0:
            print('Predicting Day {0}'.format(i))

        # reshape data for neural net: (samples, 1 timestep, n_lags features)
        X_tr = X_train.values.reshape(-1, 1, n_lags)
        X_te = X_test.values.reshape(-1, 1, n_lags)
        y_tr = y_train.values.reshape(-1, 1)

        # train and predict
        model.fit(X_tr, y_tr, epochs=50, batch_size=1000, verbose=0)
        preds = model.predict(X_te)

        # collect this step's prediction
        step = pd.DataFrame(preds, columns=['pred_' + str(i)])
        if predictions is None:
            predictions = step
        else:
            predictions = pd.concat((predictions, step), axis=1)

        # Slide both windows forward by exactly one column and append only the
        # newest prediction. The windows already contain earlier predictions,
        # so re-appending all of them would duplicate columns and skip real days.
        # NOTE(review): after this update the last feature of X_train equals
        # the next y_train (both are this step's prediction) - looks like
        # target leakage; confirm the intended training window.
        X_train = pd.concat((X_train.iloc[:, 1:], step), axis=1)
        X_test = pd.concat((X_test.iloc[:, 1:], step), axis=1)
        y_train = step

    return predictions

In [None]:
# Run the day-by-day RNN forecast and keep what it returns for the next cells.
preds = rnn_model()
print('Done.')

In [None]:
# Pad and inverse transform predictions
# The predictions are placed after 120 filler columns so the array has the
# width `sc` expects (assumes sc was fitted on 120 + preds.shape[1] columns).
# The row count comes from `preds` itself, since the unscaled training frame
# has already been deleted. Zeros are used as filler because uninitialised
# memory may contain inf, which scikit-learn rejects; plain arrays avoid
# passing mixed int/str column labels to the scaler.
# NOTE(review): column j of the predictions is un-scaled with the min/max of
# fitted column 120 + j, not with the column the model was trained to predict;
# confirm this is the intended mapping.
pads = np.zeros((preds.shape[0], 120))
padded_preds = np.hstack((pads, np.asarray(preds)))
final_preds = sc.inverse_transform(padded_preds)[:, 120:]

del pads, padded_preds
gc.collect()

In [None]:
# Make sure predictions are in a similar range to training data
# The unscaled training frame has been deleted by this point, so the first
# row's original-scale values are recovered from the scaled frame instead;
# the last 10 columns are shown next to that row's predictions.
print(sc.inverse_transform(train_set_sc.iloc[[0]].values)[0, -10:])
print(final_preds[0])

In [None]:
def page_model():
    '''
    RNN that models pages sequentially for all days with time-shifted data; does not perform well

    For every unique page in the module-level `train_flattened` frame, a
    60-lag shift matrix of the 'Visits' column is built, scaled to [0, 1] with
    a per-page MinMaxScaler, used to train the shared LSTM further, and the
    last 60 rows are predicted. Predictions are mapped back to the original
    scale with the scaler of the page they belong to.

    Returns an array of shape (number of unique pages, 60), rows ordered like
    np.unique of the 'Page' column.

    NOTE(review): assumes that dropping the first two columns of
    `train_flattened` leaves only 'Visits', and that every page has at least
    60 rows once the shifted rows containing NaN are dropped - confirm against
    the code that builds `train_flattened`.
    '''

    # Neural Network Architecture
    model = Sequential()
    model.add(LSTM(10, input_shape=(1, 60)))
    model.add(Dense(1, activation='tanh'))
    model.compile(optimizer='adam', loss='mse')

    # NOTE(review): 'val_loss' is only produced when validation data is given;
    # fit() below passes validation_data=None, so this callback likely never
    # triggers. Left unchanged to keep the training schedule as written.
    callbacks = [EarlyStopping(monitor='val_loss', patience=5, verbose=0)]

    # Collecting pages for looping
    unique_pages = np.unique(train_flattened[['Page']].values)

    # One output row per page that is actually modelled
    predictions = np.empty(shape=(len(unique_pages), 60))

    # Looping through unique pages
    for i, page in enumerate(unique_pages):

        # Rows of this page, all columns from position 2 onwards; copy so the
        # shifted columns are added to an independent frame.
        train_page = train_flattened.loc[train_flattened['Page'] == page].iloc[:, 2:].copy()

        # Create 60-day shift matrix
        for s in range(1, 61):
            train_page['Visits_{}'.format(s)] = train_page['Visits'].shift(s)
        shift_values = train_page.dropna()

        # Split train/test data; targets are cut like the features so both
        # have the same number of samples
        features = shift_values.drop('Visits', axis=1).values
        X_train = features[:-60]
        X_test = features[-60:]
        y_train = shift_values[['Visits']].values[:-60]

        # Normalize with one scaler per page, fitted once on every value used
        # for that page. The scaler needs 2-D input, hence the single-column
        # reshape before and the network reshape after.
        sc = MinMaxScaler(feature_range=(0, 1))
        all_values = np.concatenate((X_train.ravel(), y_train.ravel(), X_test.ravel()))
        sc.fit(all_values.reshape(-1, 1))
        X_train = sc.transform(X_train.reshape(-1, 1)).reshape(-1, 1, 60)
        X_test = sc.transform(X_test.reshape(-1, 1)).reshape(-1, 1, 60)
        y_train = sc.transform(y_train)

        # Batch training
        model.fit(X_train, y_train, epochs=20, batch_size=20000, verbose=0, validation_data=None, callbacks=callbacks)

        # Predict page, undo this page's scaling and store
        preds = model.predict(X_test)
        predictions[i] = sc.inverse_transform(preds.reshape(-1, 1)).ravel()

        # Tracking progress
        if i % 1000 == 0:
            print('{0} rows predicted'.format(i))

    return predictions