# TRADE REPUBLIC DATA SCIENCE CHALLENGE

# PREDICTING TRADES AFTER 6TH MONTH

### NOTES :

We will use Keras in our project to implement LSTM, Long Short-term Memory (LSTM) method


In [236]:
# importing all libarary functions:
from datetime import datetime, timedelta,date
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from __future__ import division

import warnings
warnings.filterwarnings("ignore")

#import Keras
import keras
from keras.layers import Dense
from keras.models import Sequential

from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.layers import LSTM
from sklearn.model_selection import KFold, cross_val_score, train_test_split

In [237]:
trades.head()

Unnamed: 0,date,customer_id,execution_size
0,2020-01-01,1,2.0
1,2020-01-01,2,4.9327
2,2020-01-01,3,98.0
3,2020-01-01,4,0.5458
4,2020-01-01,5,712.0


### Data Wrangling

In [238]:
#represent month in date field as its first day
trades['date'] = pd.to_datetime(trades['date'])
trades['date'] = trades['date'].dt.year.astype('str') + '-' + trades['date'].dt.month.astype('str') + '-01'
trades['date'] = pd.to_datetime(trades['date'])

In [239]:
#groupby date and sum the no of trades(execution_size) mt: monthly trades
mt = trades.groupby(['date']).execution_size.sum().reset_index() #mt: monthly trades
mt.columns=['Month','trades']

In [240]:
mt.head()

Unnamed: 0,date,execution_size
0,2020-01-01,604243.1
1,2020-02-01,1421208.0
2,2020-03-01,2756665.0
3,2020-04-01,2886939.0
4,2020-05-01,3580432.0


In [241]:
#add previous sales to the next row
mt['prev_trade'] = mt['trades'].shift(1)
mt=mt.dropna() #drop the null values and 

In [242]:
#calculate the difference
mt['diff'] = (mt['trades'] - mt['prev_trade'])

In [243]:
df = mt.drop(['prev_trade'],axis=1)

In [244]:
#adding lags
for inc in range(2):
    field_name = 'lag_' + str(inc)
    df[field_name] = df['diff'].shift(inc)

In [245]:
df.head()

Unnamed: 0,date,execution_size,diff,lag_0,lag_1
1,2020-02-01,1421208.0,816964.6,816964.6,
2,2020-03-01,2756665.0,1335457.0,1335457.0,816964.6
3,2020-04-01,2886939.0,130274.3,130274.3,1335457.0
4,2020-05-01,3580432.0,693492.9,693492.9,130274.3
5,2020-06-01,5975960.0,2395528.0,2395528.0,693492.9


In [246]:
#drop null values
df = df.dropna().reset_index(drop=True)

### MACHINE LEARNING MODELS

In [247]:
# Import statsmodels.formula.api
import statsmodels.formula.api as smf 

# Define the regression formula
model = smf.ols(formula='diff ~ lag_0', data=df)

# Fit the regression
model_fit = model.fit()

# Extract the adjusted r-squared
regression_adj_rsq = model_fit.rsquared_adj
print(regression_adj_rsq)

1.0


In [248]:
# Import statsmodels.formula.api
import statsmodels.formula.api as smf 

# Define the regression formula
model = smf.ols(formula='diff ~ lag_0 + lag_1', data=df)

# Fit the regression
model_fit = model.fit()

# Extract the adjusted r-squared
regression_adj_rsq = model_fit.rsquared_adj
print(regression_adj_rsq)

1.0


### VALIDATION:

since the adjusted r squared is 1, let say our model can do very good prediction

In [249]:
#import MinMaxScaler and create a new dataframe for LSTM model
from sklearn.preprocessing import MinMaxScaler
df_model = df.drop(['execution_size','date'],axis=1)

In [250]:
#split train and test set
train_set, test_set = df_model[0:-3].values, df_model[-3:].values

In [251]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   diff    10 non-null     float64
 1   lag_0   10 non-null     float64
 2   lag_1   10 non-null     float64
dtypes: float64(3)
memory usage: 368.0 bytes


In [252]:
#apply Min Max Scaler
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_set)
# reshape training set
train_set = train_set.reshape(train_set.shape[0], train_set.shape[1])
train_set_scaled = scaler.transform(train_set)

# reshape test set
test_set = test_set.reshape(test_set.shape[0], test_set.shape[1])
test_set_scaled = scaler.transform(test_set)

In [253]:
X_train, y_train = train_set_scaled[:, 1:], train_set_scaled[:, 0:1]
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])

In [254]:
X_test, y_test = test_set_scaled[:, 1:], test_set_scaled[:, 0:1]
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

In [257]:
model = Sequential()
model.add(LSTM(4, batch_input_shape=(1, X_train.shape[1], X_train.shape[2]), stateful=True))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train, epochs=100, batch_size=1, verbose=1, shuffle=False)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x22d6e0f3280>

In [258]:
y_pred = model.predict(X_test,batch_size=1)

In [259]:
y_pred

array([[-0.12795399],
       [ 0.21654549],
       [-0.03245249]], dtype=float32)

In [260]:
y_test

array([[-0.17154513],
       [ 0.26427303],
       [-0.21333622]])

In [261]:
#reshape y_pred
y_pred = y_pred.reshape(y_pred.shape[0], 1, y_pred.shape[1])

In [266]:
#rebuild test set for inverse transform
import numpy as np
pred_test_set = []
for index in range(0,len(y_pred)):
    n=np.concatenate([y_pred[index],X_test[index]],axis=1)
    pred_test_set.append(np.concatenate([y_pred[index],X_test[index]],axis=1))

n

array([[-0.03245249, -0.21333622,  0.26427303]])

In [267]:
pred_test_set[0]

array([[-0.12795399, -0.17154513,  0.29029404]])

In [268]:
#reshape pred_test_set
pred_test_set = np.array(pred_test_set)
pred_test_set = pred_test_set.reshape(pred_test_set.shape[0], pred_test_set.shape[2])

In [269]:
#inverse transform
pred_test_set_inverted = scaler.inverse_transform(pred_test_set)

In [271]:
#create dataframe that shows the predicted number of trades
result_list = []
trade_dates = list(mt[-7:].date)
act_trade = list(mt[-7:].execution_size)
for index in range(0,len(pred_test_set_inverted)):
    result_dict = {}
    result_dict['pred_value'] = int(pred_test_set_inverted[index][0] + act_trade[index])
    result_dict['date'] = trade_dates[index+1]
    result_list.append(result_dict)
df_result = pd.DataFrame(result_list)

In [272]:
df_result

Unnamed: 0,pred_value,date
0,5712380,2020-07-01
1,4205129,2020-08-01
2,2037060,2020-09-01


In [274]:
mt = trades.groupby(['date']).execution_size.sum().reset_index()
mt.head()

Unnamed: 0,date,execution_size
0,2020-01-01,604243.1
1,2020-02-01,1421208.0
2,2020-03-01,2756665.0
3,2020-04-01,2886939.0
4,2020-05-01,3580432.0


In [276]:
#merge with actual sales dataframe
df_sales_pred = pd.merge(mt,df_result,on='date',how='left')

In [277]:
df_sales_pred

Unnamed: 0,date,execution_size,pred_value
0,2020-01-01,604243.1,
1,2020-02-01,1421208.0,
2,2020-03-01,2756665.0,
3,2020-04-01,2886939.0,
4,2020-05-01,3580432.0,
5,2020-06-01,5975960.0,
6,2020-07-01,3656565.0,5712380.0
7,2020-08-01,2075499.0,4205129.0
8,2020-09-01,2797923.0,2037060.0
9,2020-10-01,2431579.0,


In [435]:
#plot actual and predicted
plot_data = [
    go.Scatter(
        x=df_sales_pred['date'],
        y=df_sales_pred['execution_size'],
        name='actual'
    ),
        go.Scatter(
        x=df_sales_pred['date'],
        y=df_sales_pred['pred_value'],
        name='predicted'
    )
    
]

plot_layout = go.Layout(
        title='Prediction of Number of Trades in Total ',
    xaxis_title='Month',
    yaxis_title='Number of Trades'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
fig.show()

# CONCLUSION

In [None]:
Forecasting the monthly Trades with LSTM
I used Tensorflow and Keras libraries in this part of the Taks project to implement LSTM.

### REFERENCE

1. https://towardsdatascience.com/predicting-sales-611cb5a252de
2. https://machinelearningmastery.com/use-timesteps-lstm-networks-time-series-forecasting/#:~:text=Specifically%2C%20a%20lag%3D1%20differencing,increasing%20trend%20in%20the%20data.&text=Transform%20the%20observations%20to%20have,function%20of%20the%20LSTM%20model.
3. https://statisticsbyjim.com/regression/interpret-adjusted-r-squared-predicted-r-squared-regression/