# Data Preprocessing

In [1]:
''' Loading packages '''
import gc
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

import keras.backend as K
from keras.models import Sequential, Model
from keras.layers import LSTM, Dense
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import optimizers, losses

warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [3]:
''' Loading data '''
FILE_PATH = ".//train_test//"
FILE_PATH = "C:/Users/Rishabh Khurana/Documents/Kaggle/Web traffic time series/"

TRAIN_FILE = "en_train.csv"
TEST_FILE = "en_test.csv"

train_set = pd.read_csv(FILE_PATH + TRAIN_FILE)
test_set = pd.read_csv(FILE_PATH + TEST_FILE)

# Dropping language columns
train_set.drop('lang', axis=1, inplace=True)
test_set.drop('lang', axis=1, inplace=True)


In [4]:
''' Selecting number of days for training '''
NUM_DAYS = 180

date_range = train_set.columns[-NUM_DAYS:].tolist()
date_range.insert(0, 'Page')
train_set = train_set.loc[:, date_range]


In [6]:
''' Replacing null values with median or logmean '''
def logmean(x):
    return np.expm1(np.mean(np.log1p(x)))


def fill_median(): 
    impute_values = np.empty(shape=(train_set.shape[0],))
    data_values = np.array(train_set.iloc[:, 1:])
    MEDIAN = round(np.nanmedian(data_values.flatten()),0)
    
    for row in range(train_set.shape[0]):
        if np.sum(np.isnan(data_values[row])) == len(data_values[row]):
            impute_values[row] = MEDIAN
        else:
            impute_values[row] = np.nanmedian(data_values[row])
      
    return(impute_values)


def fill_logmean():
    impute_values = np.empty(shape=(train_set.shape[0],))
    data_values = np.array(train_set.iloc[:, 1:])
    LOGMEAN = logmean(data_values.flatten())
    
    for row in range(train_set.shape[0]):
        if np.sum(np.isnan(data_values[row])) == len(data_values[row]):
            impute_values[row] = LOGMEAN
        else:
            data_values[row] = data_values[row][~np.isnan(data_values[row])]
            impute_values[row] = logmean(data_values[row])

    return(impute_values)


train_set['impute_values'] = fill_median()
# train_set['impute_values'] = fill_logmean()

train_set.iloc[:, 1:] = train_set.iloc[:, 1:].apply(lambda x: x.fillna(value=train_set['impute_values']))
train_set.drop('impute_values', axis=1, inplace=True)


In [7]:
''' Flattening training data and formatting dates '''
# transforming date rows into column
train_flattened = pd.melt(train_set, id_vars='Page', var_name='date', value_name='Visits')

# converting date object to date time
train_flattened['date']=pd.to_datetime(train_flattened['date'])
test_set['date']=pd.to_datetime(test_set['date'])


In [8]:
''' Clean Memory '''
# del train_set
# del train_flattened
gc.collect()


55

In [9]:
train_set

Unnamed: 0,Page,2016-07-05,2016-07-06,2016-07-07,2016-07-08,2016-07-09,2016-07-10,2016-07-11,2016-07-12,2016-07-13,...,2016-12-22,2016-12-23,2016-12-24,2016-12-25,2016-12-26,2016-12-27,2016-12-28,2016-12-29,2016-12-30,2016-12-31
0,2,2.0,4.0,2.0,7.0,2.0,3.0,1.0,2.0,2.0,...,3.0,1.0,6.0,3.0,1.0,1.0,3.0,3.0,1.0,0.0
1,5,3726.5,3726.5,3726.5,3726.5,3726.5,3726.5,3726.5,3726.5,3726.5,...,2898.0,2676.0,2032.0,2294.0,2074.0,2175.0,2156.0,2302.0,2098.0,1770.0
2,8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
3,18,1730.0,1596.0,1638.0,1606.0,2334.0,1396.0,1620.0,2449.0,1585.0,...,1401.0,1238.0,976.0,810.0,989.0,1332.0,1243.0,1681.0,1303.0,1098.0
4,107,1495.0,1131.0,1160.0,1779.0,1166.0,1155.0,1574.0,1283.0,1141.0,...,404.0,344.0,225.0,229.0,335.0,313.0,379.0,393.0,354.0,272.0
5,147,8876.0,6810.0,5668.0,6031.0,6784.0,7065.0,8267.0,6081.0,5485.0,...,3266.0,3172.0,2772.0,3632.0,3896.0,4504.0,4726.0,4241.0,4325.0,3868.0
6,158,1107.0,1107.0,1094.0,934.0,314.0,313.0,1033.0,1360.0,1185.0,...,1004.0,613.0,289.0,264.0,439.0,687.0,713.0,701.0,510.0,298.0
7,324,8367.0,5645.0,4835.0,4506.0,4703.0,4905.0,5367.0,4409.0,4180.0,...,1809.0,1706.0,1484.0,1530.0,1742.0,2263.0,2205.0,2041.0,1976.0,2148.0
8,377,329.0,23.0,18.0,47.0,25.0,20.0,19.0,16.0,21.0,...,17.0,5.0,19.0,11.0,16.0,10.0,27.0,15.0,26.0,28.0
9,429,1163.0,789.0,889.0,637.0,513.0,612.0,1376.0,991.0,867.0,...,290.0,814.0,757.0,190.0,273.0,252.0,255.0,286.0,271.0,225.0


# Rishabh's Page Model

In [12]:
### TEST CODE SECTION FOR PAGE MODEL



In [None]:
def page_model():
    '''
    RNN that models pages sequentially for all days with time-shifted data 
    '''
    
    # Neural Network Architecture
    model = Sequential()
    model.add(LSTM(10, input_shape=(1, 60)))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    
    # callbacks = [EarlyStopping(monitor='val_loss', patience=5, verbose=0)]
    # ModelCheckpoint(weights_path, monitor='val_loss', save_best_only=True, verbose=0)
    
    # Collecting pages for looping
    unique_pages = np.unique( train_flattened[['Page']].values)

    # Setting up array for appending predictions
    predictions = np.empty(shape=(train_set.shape[0], 60))
    i = 0
    
    # Looping through unique pages
    for page in unique_pages:
        
        train_page = train_flattened.ix[train_flattened['Page']==page, 2:]
        test_page = test_set.loc[test_set['Page']==page, :]
        
        # Create 60-day shift matrix
        for s in range(1,61):
            train_page['Visits_{}'.format(s)]=train_page['Visits'].shift(s)
        shift_values = train_page.dropna()
    
        # Split train/test data
        X_train = shift_values.drop('Visits', axis=1).values[:-60]
        X_test = shift_values.drop('Visits', axis=1).values[-60:]
        y_train = shift_values[['Visits']].values[:60]

        # Reshape and normalize data for neural network
        X_train = X_train.reshape(-1, 1, 60)
        X_test = X_test.reshape(-1, 1, 60)

        sc = MinMaxScaler(feature_range=(0, 1))
        sc.fit(list(X_train.flatten()) + list(y_train.flatten()) + list(X_test.flatten()))
        X_train = sc.transform(X_train.flatten()).reshape(-1, 1, 60)
        X_test = sc.transform(X_test.flatten()).reshape(-1, 1, 60)
        y_train = sc.fit_transform(y_train)
        
        # Batch training
        model.fit(X_train, y_train, epochs=100, batch_size=20000, verbose=0, validation_data=None) #, callbacks=callbacks)
    
        # Predict page and append
        preds = model.predict(X_test)
        predictions[i] = preds.flatten()
        
        # Tracking progress
        if i % 1000 == 0:
            print('{0} rows predicted'.format(i))
        i += 1
    
    # Reverse tranform predictions and return output
    predictions = sc.inverse_transform(predictions)    
    return(predictions)


In [None]:
''' Predictions Out '''
preds = page_model()

# 10 epochs performs better than 100 epochs
# With linear activation and 10, there are still several negative predictions and high variability between predictions
# Also predictions are way off from training data (see below)
# I think next steps would be looking into regularization, more in-between dense layers, and/or dropout layers
# Would also like to implement EarlyStopping callback with validation data for better tuning


0 rows predicted


In [35]:
# Make sure there are no negative predictions
print(sum(np.array([min(x) for x in preds]) < 0))


2487


In [36]:
# Make sure predictions are in a similar range to training data
print(train_flattened.loc[train_flattened.Page==0, 'Visits'].values[-10:])
print(preds[0][:10])

[ 3.  1.  6.  3.  1.  1.  4.  3.  1.  1.]
[ 2079.94702381  3592.70811161  2784.32664022   383.15810477  5083.79407847
  2035.33184704  1954.99803695  1705.15940081  2892.91956478  1087.1644114 ]


In [None]:
''' Write Outfile '''
test_pages = train_flattened[['Page']].values[-(train_flattened.shape[0]//3):]
test_dates = train_flattened[['date']].values[-(train_flattened.shape[0]//3):]
preds = preds.flatten().reshape(-1, 1) ### MAY NEED TO TRIM FLOAT DECIMALS TO CUT DOWN MEMORY

merge_df = pd.DataFrame() ### WILL COMPLETE LATER, ONCE PREDICTIONS ARE REASONABLE
test = test.merge(merge_df, how='left')

OUTPATH = ""
OUTFILE = ""
test[['Id','Visits']].to_csv(OUTPATH + OUTFILE, index=False)


# Sterling's Day Model -- NOT FINISHED


In [None]:
### TEST CODE SECTION FOR DAY MODEL
# DO I NEED THE WINDOW 1 STRIDE?? -- THIS REQUIRES CHANGE TO CONTENATION OF PREDICTIONS

In [None]:
def day_model():
    '''
    RNN that does not use time-shifted data and models all pages at once, but one day at a time
    '''
    
    # Neural Network Architecture
    model = Sequential()
    model.add(LSTM(10, input_shape=(1, 60)))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')

    # callbacks = [EarlyStopping(monitor='val_loss', patience=5, verbose=0)]
    # ModelCheckpoint(weights_path, monitor='val_loss', save_best_only=True, verbose=0)
    
    # Split train/test data
    X_train = np.array(train_set.iloc[:, :60].values)
    y_train = np.array(train_set.iloc[:, 60:120].values)
    X_test = np.array(train_set.iloc[:, -60:].values)
    
    # Reshape and normalize data for neural net
    X_train = X_train.reshape(-1, 1, 60)
    X_test = X_test.reshape(-1, 1, 60)

    sc = MinMaxScaler(feature_range=(0, 1))
    sc.fit(list(X_train.flatten()) + list(y_train.flatten()) + list(X_test.flatten()))
    X_train = sc.transform(X_train.flatten()).reshape(-1, 1, 60)
    X_test = sc.transform(X_test.flatten()).reshape(-1, 1, 60)
    y_train = sc.fit_transform(y_train)
        
    # Setting up array for appending predictions
    predictions = np.empty(shape=(train_set.shape[0], 60))
    i = 0
    
    # Batch training/prediction one day at a time
    for day in range(60):

        # Select days to train/predict
        X_tr = X_train[:, -60:]
        y_tr = y_train[:, day]
        model.fit(X_tr, y_tr, epochs=1, batch_size=20000, verbose=0, validation_data=None) #, callbacks=callbacks)
    
        # Predict day and append
        preds = model.predict(X_test)
        predictions[:, i] = preds.flatten()
    
        print(X_train.shape, preds.reshape(-1, 1, 1).shape)
        print(y_train.shape, preds.flatten().shape)
        X_train = np.hstack((X_train, preds.reshape(-1, 1, 1)))
        y_train = np.hstack((y_train, preds.reshape(-1, 1)))
        
        # Tracking progress
        if i % 1 == 0:
            print('{0} days predicted'.format(i))
        i += 1
    
    # Reverse tranform predictions and return output
    predictions = sc.inverse_transform(predictions)
    return(predictions)


In [None]:
preds = day_model()


In [None]:
''' Predictions Out '''
pages = train_set[['Page']].values
dates = train_set.columns[1:]
preds = day_model()

out_df = pd.DataFrame() ### TO COMPLETE AFTER FIRST RUN
merge_df = pd.melt(out_df, id_vars='Page', var_name='date', value_name='Visits')
test = test.merge(merge_df, how='left')

OUTPATH = ".//subs//"
OUTFILE = "sub_XX.csv"
test[['Id','Visits']].to_csv("OUTPATH + OUTFILE, index=False)
