# P1 - Predict incomings in 3 weeks

## Utils

#### Hyperparameters and constants

In [75]:
PAST_TIMESTEPS = 12
FUTURE_TIMESTEPS = 3
FEATURES = 1
EPOCHS = 10
BATCH_SIZE = 100

TRAINSET_FILE = 'dades_train_P1.csv'
TESTSET_FILE = 'dades_test.csv'

#### Helpers

In [76]:
import pandas as pd
from datetime import datetime, timedelta

def preprocess(DATASET_PATH):
    df = pd.read_csv(DATASET_PATH,
                     parse_dates=['Dia_Comanda'],
                     date_parser=lambda d: datetime.strptime(d, '%Y-%m-%d'))
    df = df.drop(df.columns[5:-1], axis=1)
    
    min_date = df['Dia_Comanda'].min()
    df['Setmana'] = df['Dia_Comanda'].map(lambda e: (e.year - min_date.year) * 54 + e.isocalendar()[1])
    df = df.groupby(['Client', pd.Grouper(key='Setmana')])\
           .agg({'Preu_Total': 'sum'})
    return df

In [77]:
df_train = preprocess(TRAINSET_FILE)
print(df_train.head())

df_test = preprocess(TESTSET_FILE)
print(df_test.head())

                Preu_Total
Client Setmana            
70003  84         0.596125
       85        19.020566
       86        15.110581
       87        20.602086
       88        19.295976
                Preu_Total
Client Setmana            
70013  84         7.878390
       85         1.843815
       90         1.999404
       91         3.180924
       92         4.370194


## Preprocessing

## Model Training and Fitting

In [79]:
assert(not df_train.isnull().values.any())
assert(not df_test.isnull().values.any())
print('No NAs in the dataframe')

"""
df_train['Customer'] = df_train.index.levels[0]
df_train['Data'] = df_train.index.levels[1]


df_test['Client'] = df_test.index.levels[0]
df_test['Data'] = df_test.index.levels[1]

from sklearn.preprocessing import LabelEncoder

lenc_client = LabelEncoder()
df_train.Client = lenc_client.fit_transform(df_train.Client)
df_test.Client = lenc_client.transform(df_test.Client)

lenc_client = LabelEncoder()
df_train.Data = lenc_client.fit_transform(df_train.Data)
df_test.Data = lenc_client.transform(df_test.Data)
"""

def series_to_supervised(df, n_in=1, n_out=1, drop_nan=True):
    n_vars = df.shape[1]
    cols, names = [], []
    # Input sequence
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var{}(t-{})'.format(j + 1, i) for j in range(n_vars)]
    # Output sequence
    cols.append(df.shift(0))
    names += ['var{}(t)'.format(j + 1) for j in range(n_vars)]
    for i in range(1, n_out):
        cols.append(df.shift(-i))
        names += ['var{}(t+{})'.format(j + 1, i) for j in range(n_vars)]

    df = pd.concat(cols, axis=1)
    df.columns = names
    if drop_nan:
        df.dropna(inplace=True)
    return df

df_train_ts = series_to_supervised(df_train, PAST_TIMESTEPS, 1)
df_test_ts = series_to_supervised(df_test, PAST_TIMESTEPS, 1)

df_train_ts.head()

# df_test.head()

No NAs in the dataframe


Unnamed: 0_level_0,Unnamed: 1_level_0,var1(t-12),var1(t-11),var1(t-10),var1(t-9),var1(t-8),var1(t-7),var1(t-6),var1(t-5),var1(t-4),var1(t-3),var1(t-2),var1(t-1),var1(t)
Client,Setmana,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
70003,97,0.596125,19.020566,15.110581,20.602086,19.295976,10.538301,42.314158,15.57079,17.308495,21.202385,12.100745,22.801788,28.030999
70003,99,19.020566,15.110581,20.602086,19.295976,10.538301,42.314158,15.57079,17.308495,21.202385,12.100745,22.801788,28.030999,23.496274
70003,100,15.110581,20.602086,19.295976,10.538301,42.314158,15.57079,17.308495,21.202385,12.100745,22.801788,28.030999,23.496274,21.191654
70003,101,20.602086,19.295976,10.538301,42.314158,15.57079,17.308495,21.202385,12.100745,22.801788,28.030999,23.496274,21.191654,12.111475
70003,102,19.295976,10.538301,42.314158,15.57079,17.308495,21.202385,12.100745,22.801788,28.030999,23.496274,21.191654,12.111475,46.02623


In [83]:
def split_dataset(trainset, testset, TIMESTEPS, FEATURES):
    n_obs = TIMESTEPS * FEATURES
    
    values = trainset.values
    x_train = values[:, :n_obs]
    y_train = values[:, -1]

    values = trainset.values
    x_test = values[:, :n_obs]
    y_test = values[:, -1]

    # Reshape into (samples, timesteps, features)
    x_train = x_train.reshape((x_train.shape[0], TIMESTEPS, FEATURES))
    x_test = x_test.reshape((x_test.shape[0], TIMESTEPS, FEATURES))
    print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
    return x_train, y_train, x_test, y_test

x_train, y_train, x_test, y_test = split_dataset(df_train_ts, df_test_ts, PAST_TIMESTEPS, 1)

(30648, 12, 1) (30648,) (30648, 12, 1) (30648,)


## Training

In [87]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

predictor = Sequential()
predictor.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])))
predictor.add(LSTM(units=50, return_sequences=True))
predictor.add(LSTM(units=50))
predictor.add(Dense(units=1))
predictor.compile(optimizer='adam', loss='mean_squared_error')

In [97]:
history = predictor.fit(x_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE,
                        validation_data=(x_test, y_test), verbose=2, shuffle=False)
predictions = predictor.predict(x_test)

from sklearn.metrics import mean_squared_error
from math import sqrt

sqrt(mean_squared_error(y_test, predictions))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


10.839673779011587

## Evaluation

In [99]:
import numpy as np
import matplotlib.pyplot as plt
a = np.zeros([30648, df_train.shape[1]])
b = np.zeros([30648, df_train.shape[1]])
a[:, df_train.shape[1] - 1] = predictions[:, 0]
b[:, df_train.shape[1] - 1] = y_test

predictions_original = sc.inverse_transform(a)
predictions_original.shape
y_test_original = sc.inverse_transform(b)
print(sqrt(mean_squared_error(y_test_original[:, df_train.shape[1] - 1], predictions_original[:, df_train.shape[1] - 1])))
print(y_test_original[:, df_train.shape[1] - 1].mean())

predictions_original[:, df_train.shape[1] - 1]

NameError: name 'sc' is not defined

#### Evaluate RMSE

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

print('RMSE =', sqrt(mean_squared_error(y_test, predictions)))

#### Write results into .csv file formatted as <Client, volum>

In [None]:
df_prediction = pd.DataFrame(columns=['Client', 'volum'])
df_prediction.Client = df_test.index
df_prediction.volum = predictions_original
df_prediction.to_csv('predictions_P1.csv')