# Modelling

## Packages

In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import winsound
import datetime as dt

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder, normalize
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
import joblib


import tensorflow
from tensorflow.keras import layers
from tensorflow.keras import utils
from tensorflow.keras.models import load_model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.losses import CategoricalCrossentropy, BinaryCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy, BinaryAccuracy
#from tensorflow.nn import relu, softmax

from imblearn.over_sampling import SMOTE

from matplotlib import pyplot as plt

print(f" Found and Using {len(tensorflow.config.experimental.list_physical_devices('GPU'))} GPU")

 Found and Using 1 GPU


## Functions

In [2]:
def scale_x_encode_y(x, y):
    """Scale the feature table and one-hot encode the labels.

    Parameters
    ----------
    x : object exposing ``.values`` (e.g. a pandas DataFrame)
        Feature table; a ``MinMaxScaler`` is fitted on its values.
    y : array-like
        Class labels; a ``LabelEncoder`` is fitted on them.

    Returns
    -------
    tuple
        ``(x_scaled, y_encoded, y_map, scaler_obj)``: the scaled feature
        array, the one-hot label matrix, a dict mapping each integer class
        code to its original label, and the fitted scaler.
    """
    feature_values = x.values

    # The scaler is fitted on exactly the data it then transforms.
    scaler_obj = MinMaxScaler()
    x_scaled = scaler_obj.fit(feature_values).transform(feature_values)

    # Labels -> integer codes -> one-hot rows.
    encoder = LabelEncoder().fit(y)
    y_encoded = utils.to_categorical(encoder.transform(y))

    # Integer code -> original label, used to decode predictions later.
    class_codes = encoder.transform(encoder.classes_)
    y_map = {code: label for code, label in zip(class_codes, encoder.classes_)}

    return x_scaled, y_encoded, y_map, scaler_obj


def smote_data(x, y, random_state = None):
    """Oversample the minority classes with SMOTE.

    Parameters
    ----------
    x : array-like
        Feature table passed to ``SMOTE.fit_resample``.
    y : array-like
        Class labels passed to ``SMOTE.fit_resample``.
    random_state : int or None, optional
        Seed forwarded to ``SMOTE``. The default ``None`` keeps the previous
        unseeded behaviour; pass an int to make the resampling reproducible.

    Returns
    -------
    tuple
        ``(smote_x, smote_y)``, the resampled features and labels.
    """
    oversample = SMOTE(random_state=random_state)
    smote_x, smote_y = oversample.fit_resample(x, y)

    return smote_x, smote_y

def split_data(df, divisor):
    """Copy a frame and cut it positionally into a head part and a tail part.

    The cut position is ``int(len(df) / divisor)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is copied, not modified.
    divisor : number
        The head part holds the first ``int(len(df) / divisor)`` rows.

    Returns
    -------
    tuple
        ``(df_tot, df1, df2)``: the full copy, the rows from the cut position
        onwards, and the rows before the cut position.
    """
    df_tot = df.copy()
    print(f'df_tot count : {len(df_tot)}')

    # Single cut index shared by both slices.
    cut = int(len(df_tot) / divisor)

    df1 = df_tot.iloc[cut:]
    print(f'df1 count : {len(df1)}')

    df2 = df_tot.iloc[:cut]
    print(f'df2 count : {len(df2)}')

    return df_tot, df1, df2

def data_prep(df, apply_smote = False, target = None):
    """Split a frame into features and labels, then scale and encode them.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns and the label column.
    apply_smote : bool, optional
        When true, the features and labels go through ``smote_data`` before
        scaling and encoding.
    target : str or None, optional
        Name of the label column. When ``None`` (the default) the notebook
        global ``target_col`` is used, as before.

    Returns
    -------
    tuple
        ``(x_scaled, y_encoded, y_map, scaler_obj)`` exactly as returned by
        ``scale_x_encode_y``.
    """
    # The global is only read when no explicit column name is passed.
    label_col = target_col if target is None else target

    x = df.loc[:, df.columns != label_col]
    y = df[label_col]

    if apply_smote:
        x, y = smote_data(x, y)

    # Report the class balance that the model will be given.
    print(f'Record count : {len(y)}')
    print('--------------------------')
    print(y.value_counts())
    print('--------------------------')
    print(y.value_counts(normalize=True))

    x_scaled, y_encoded, y_map, scaler_obj = scale_x_encode_y(x, y)

    return x_scaled, y_encoded, y_map, scaler_obj

## Read Data

In [3]:
# Columns dropped from every CSV this notebook loads before modelling.
remove_cols = [
    'tick_avg',
    'sema',
    'ssma',
    'lema',
    'lsma',
    'max_tick',
    'min_tick',
]

In [4]:
# Load the 2019 tick table and discard the columns listed in remove_cols.
df = pd.read_csv('data/tab_tick_2019.csv').drop(remove_cols, axis=1)
df.head(5)

Unnamed: 0,spread_avg,tick_sd,sema_diff,lema_diff,diff,avg_gain,avg_loss,rs,rsi,ssma_diff,lsma_diff,sma_diff,max_gap,min_gap,ema_diff,direction
0,0.000234,1.1e-05,1.063391e-06,-3.599718e-07,-9e-06,7e-06,1e-05,0.726481,42.078708,3e-06,-2e-06,-4.6e-05,3.9e-05,-0.000124,-3.3e-05,same
1,0.000306,1.5e-05,-3.848168e-06,1.37694e-06,-5.1e-05,7e-06,1.1e-05,0.686985,40.722656,4e-06,-5e-06,-3.7e-05,9e-05,-4.7e-05,-3.8e-05,same
2,0.000335,7e-06,-5.083489e-06,3.357754e-06,-1.1e-05,7e-06,1.2e-05,0.597542,37.403846,2e-06,-3e-06,-3.3e-05,0.000101,-3.2e-05,-4.6e-05,same
3,0.000254,2.8e-05,8.506514e-07,-1.952792e-06,5e-05,1e-05,1.2e-05,0.88172,46.857143,4e-06,-8e-06,-2.1e-05,5.2e-05,-5e-05,-4.3e-05,same
4,0.000307,2.8e-05,-2.52014e-06,-5.857545e-06,-1.5e-05,1e-05,9e-06,1.205882,54.666667,-2e-06,-8e-06,-1.5e-05,6.7e-05,-3.5e-05,-4e-05,same


In [83]:
# Name of the label column; data_prep() reads this global by default.
target_col = 'direction'
x_scaled, y_encoded, y_map, scaler_obj = data_prep(df, apply_smote = False)

# Persist the fitted scaler and the code -> label map so the prediction cells
# can reload them. Create the output folder first: joblib.dump does not create
# missing directories, so this would otherwise fail on a fresh checkout.
os.makedirs('data/model', exist_ok=True)
joblib.dump(scaler_obj, 'data/model/scaler_obj.pkl')
joblib.dump(y_map, 'data/model/y_map.pkl')

Record count : 1459266
--------------------------
same        946807
decrease    256665
increase    255794
Name: direction, dtype: int64
--------------------------
same        0.648824
decrease    0.175886
increase    0.175289
Name: direction, dtype: float64


['data/model/y_map.pkl']

## Neural Network

### Network building

In [84]:
# 80/20 train/validation split with a fixed seed.
train_x, valid_x, train_y, valid_y = train_test_split(
    x_scaled,
    y_encoded,
    train_size=0.8,
    random_state=1,
)

In [120]:
# Fully connected classifier: a 2048-unit input layer, four further ReLU layers
# narrowing from 1024 to 128 units, and a 3-way softmax output.
model = Sequential(
    [Dense(units=2048, activation='relu', input_shape=(train_x.shape[1],))]
    + [Dense(units=width, activation='relu') for width in (1024, 512, 256, 128)]
    + [Dense(units=3, activation='softmax')]
)

# NOTE(review): `decay` is a legacy Keras optimizer argument - verify that the
# installed TensorFlow version still accepts it and that this schedule is intended.
model.compile(
    loss=CategoricalCrossentropy(),
    optimizer=Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, decay=0.01),
    metrics=['accuracy'],
)

model.summary()

Model: "sequential_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_117 (Dense)            (None, 2048)              32768     
_________________________________________________________________
dense_118 (Dense)            (None, 1024)              2098176   
_________________________________________________________________
dense_119 (Dense)            (None, 512)               524800    
_________________________________________________________________
dense_120 (Dense)            (None, 256)               131328    
_________________________________________________________________
dense_121 (Dense)            (None, 128)               32896     
_________________________________________________________________
dense_122 (Dense)            (None, 3)                 387       
Total params: 2,820,355
Trainable params: 2,820,355
Non-trainable params: 0
___________________________________________

### Training

In [121]:
# Training hyper-parameters passed to model.fit.
epoch_val, batch_size_val = 10, 1024

# Logging verbosity and worker count passed to model.fit.
verbose_val, workers_val = 2, -1

In [122]:
# Train on the training split and report loss/accuracy on the validation split
# after every epoch. The returned History object is the cell's output.
# NOTE(review): `workers` presumably only matters for generator inputs - confirm.
model.fit(
    train_x,
    train_y,
    batch_size=batch_size_val,
    epochs=epoch_val,
    verbose=verbose_val,
    validation_data=(valid_x, valid_y),
    workers=workers_val,
)

Epoch 1/10
1141/1141 - 9s - loss: 0.4863 - accuracy: 0.7892 - val_loss: 0.4104 - val_accuracy: 0.8244
Epoch 2/10
1141/1141 - 8s - loss: 0.3935 - accuracy: 0.8328 - val_loss: 0.3852 - val_accuracy: 0.8364
Epoch 3/10
1141/1141 - 9s - loss: 0.3807 - accuracy: 0.8383 - val_loss: 0.3785 - val_accuracy: 0.8396
Epoch 4/10
1141/1141 - 8s - loss: 0.3760 - accuracy: 0.8404 - val_loss: 0.3750 - val_accuracy: 0.8407
Epoch 5/10
1141/1141 - 8s - loss: 0.3730 - accuracy: 0.8417 - val_loss: 0.3727 - val_accuracy: 0.8419
Epoch 6/10
1141/1141 - 9s - loss: 0.3709 - accuracy: 0.8426 - val_loss: 0.3706 - val_accuracy: 0.8427
Epoch 7/10
1141/1141 - 9s - loss: 0.3691 - accuracy: 0.8433 - val_loss: 0.3690 - val_accuracy: 0.8433
Epoch 8/10
1141/1141 - 9s - loss: 0.3677 - accuracy: 0.8440 - val_loss: 0.3677 - val_accuracy: 0.8439
Epoch 9/10
1141/1141 - 9s - loss: 0.3665 - accuracy: 0.8444 - val_loss: 0.3665 - val_accuracy: 0.8444
Epoch 10/10
1141/1141 - 8s - loss: 0.3654 - accuracy: 0.8450 - val_loss: 0.3656 - 

<tensorflow.python.keras.callbacks.History at 0x25615834e10>

### DNN

In [123]:
# Class scores for the validation split.
predictions = model.predict(valid_x)

# Collapse the per-class score rows and the one-hot labels to integer class codes.
rounded_predictions = predictions.argmax(axis=-1)
rounded_valid_y = valid_y.argmax(axis=-1)

print(
    classification_report(
        y_true=rounded_valid_y,
        y_pred=rounded_predictions,
        target_names=y_map.values(),
    )
)

              precision    recall  f1-score   support

    decrease       0.81      0.71      0.76     51406
    increase       0.81      0.73      0.77     51209
        same       0.86      0.91      0.88    189239

    accuracy                           0.84    291854
   macro avg       0.83      0.79      0.81    291854
weighted avg       0.84      0.84      0.84    291854



In [124]:
# Write the trained network to disk, then read it straight back so the
# prediction cells below run against the persisted copy.
model.save(filepath='data/model/model.h5')
model_new = load_model(filepath='data/model/model.h5')

### DNN Prediction

In [125]:
# Score the live table with the persisted network, scaler and label map.
live_df = pd.read_csv('data/live_preds.csv')
live_df = live_df.drop(columns=remove_cols + ['predicted_direction'])

model_new = load_model('data/model/model.h5')
scaler_obj = joblib.load('data/model/scaler_obj.pkl')
y_map = joblib.load('data/model/y_map.pkl')

# Kept under its own name so the training matrix `x_scaled` is not overwritten;
# otherwise re-running the train/validation split after this cell would pick up
# the live features instead of the training features.
live_x_scaled = scaler_obj.transform(live_df.values)
predictions = model_new.predict(live_x_scaled)
rounded_predictions = np.argmax(predictions, axis = -1)

# Decode the integer class codes back to their labels.
live_df['predictions'] = [y_map[k] for k in rounded_predictions]

# NOTE(review): the recorded output shows every row predicted as 'increase' -
# confirm that the live CSV has the same feature columns, in the same order,
# as the training table.
print(y_map)
live_df['predictions'].value_counts()

{0: 'decrease', 1: 'increase', 2: 'same'}


increase    283
Name: predictions, dtype: int64

In [None]:
# Evaluate the persisted network on the 2018 table.
df1 = pd.read_csv('data/tab_tick_2018.csv')
df1.drop(remove_cols, axis=1, inplace=True)

print(f'Record count : {len(df1)}')
print(df1[target_col].value_counts())

# Reuse the artefacts fitted on the training data. Calling data_prep() here
# would fit a new MinMaxScaler and LabelEncoder on the 2018 table, so the
# network would be scored on inputs scaled differently from its training inputs.
scaler_obj_1 = joblib.load('data/model/scaler_obj.pkl')
y_map1 = joblib.load('data/model/y_map.pkl')

x_scaled1 = scaler_obj_1.transform(df1.loc[:, df1.columns != target_col].values)

# Encode the true labels with the training code -> label map (inverted).
label_to_code = {label: code for code, label in y_map1.items()}
unknown_labels = set(df1[target_col].unique()) - set(label_to_code)
if unknown_labels:
    raise ValueError(f'Labels not seen during training: {unknown_labels}')
rounded_valid_y = df1[target_col].map(label_to_code).to_numpy()
y_encoded1 = utils.to_categorical(rounded_valid_y, num_classes=len(y_map1))

predictions = model_new.predict(x_scaled1)
rounded_predictions = np.argmax(predictions, axis = -1)

df1['predictions'] = [y_map1[k] for k in rounded_predictions]
df1.to_csv('data/tab_tick_2018_res.csv')

# Pass the class codes explicitly so rows and names stay aligned even if a
# class does not occur in this table.
class_codes = sorted(y_map1)
class_labels = [y_map1[code] for code in class_codes]

print(classification_report(rounded_valid_y, rounded_predictions,
                            labels = class_codes, target_names = class_labels))

fig, ax = plt.subplots(figsize=(8, 8))
cm = confusion_matrix(rounded_valid_y, rounded_predictions, labels = class_codes)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(ax=ax)

## RF Results

In [None]:
%%time
# Seeded so the forest, and every result derived from it, is reproducible.
clf = RandomForestClassifier(n_jobs=-1, random_state=1)
clf.fit(train_x, train_y)

# train_y is one-hot, so the forest is fitted as a multi-output model and
# clf.predict() returns a 0/1 matrix. A row with no column set to 1 would be
# sent to class 0 by argmax, so rank the per-class probabilities instead.
# With multi-output labels predict_proba returns one (n_samples, 2) array per
# class column; column 1 is taken as the probability of that class (this
# assumes every class occurs in train_y).
class_proba = clf.predict_proba(valid_x)
if isinstance(class_proba, list):
    predictions = np.column_stack([p[:, 1] for p in class_proba])
else:
    predictions = class_proba
rounded_predictions = np.argmax(predictions, axis = -1)
rounded_valid_y = np.argmax(valid_y, axis = -1)
print(classification_report(rounded_valid_y, rounded_predictions, target_names = list(y_map.values())))

In [None]:
# Evaluate the random forest on the 2018 table.
df1 = pd.read_csv('data/tab_tick_2018.csv')
df1.drop(remove_cols, axis=1, inplace=True)

print(f'Record count : {len(df1)}')
print(df1[target_col].value_counts())

# Reuse the artefacts fitted on the training data. Calling data_prep() here
# would fit a new MinMaxScaler and LabelEncoder on the 2018 table, so the
# forest would be scored on inputs scaled differently from its training inputs.
scaler_obj_1 = joblib.load('data/model/scaler_obj.pkl')
y_map1 = joblib.load('data/model/y_map.pkl')

x_scaled1 = scaler_obj_1.transform(df1.loc[:, df1.columns != target_col].values)

# Encode the true labels with the training code -> label map (inverted).
label_to_code = {label: code for code, label in y_map1.items()}
unknown_labels = set(df1[target_col].unique()) - set(label_to_code)
if unknown_labels:
    raise ValueError(f'Labels not seen during training: {unknown_labels}')
rounded_valid_y = df1[target_col].map(label_to_code).to_numpy()
y_encoded1 = utils.to_categorical(rounded_valid_y, num_classes=len(y_map1))

# A forest fitted on one-hot labels returns a 0/1 matrix from predict(); a row
# with no column set to 1 would be sent to class 0 by argmax. Rank the per-class
# probabilities instead (one (n_samples, 2) array per class column; column 1 is
# taken as the probability of that class).
class_proba = clf.predict_proba(x_scaled1)
if isinstance(class_proba, list):
    predictions = np.column_stack([p[:, 1] for p in class_proba])
else:
    predictions = class_proba
rounded_predictions = np.argmax(predictions, axis = -1)

df1['predictions'] = [y_map1[k] for k in rounded_predictions]
# NOTE(review): this is the same path the DNN evaluation cell writes to, so the
# DNN results file is overwritten here - confirm that is intended.
df1.to_csv('data/tab_tick_2018_res.csv')

# Pass the class codes explicitly so rows and names stay aligned even if a
# class does not occur in this table.
class_codes = sorted(y_map1)
class_labels = [y_map1[code] for code in class_codes]

print(classification_report(rounded_valid_y, rounded_predictions,
                            labels = class_codes, target_names = class_labels))

fig, ax = plt.subplots(figsize=(8, 8))
cm = confusion_matrix(rounded_valid_y, rounded_predictions, labels = class_codes)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(ax=ax)

In [None]:
# Horizontal bar chart of the forest's feature importances, in percent.
x_cols = df.columns[df.columns != 'direction']

feat_importances = pd.Series(clf.feature_importances_ * 100, index=x_cols)
# Only the 30 largest importances are drawn; the annotations below use this
# same series so that every label belongs to a bar that is actually plotted.
top_importances = feat_importances.nlargest(30)

fig, ax = plt.subplots(figsize=(15, 10))
fig.suptitle('Feature importance', fontsize=20)
top_importances.plot(kind='barh', ax=ax, rot=45)

# Labels are set after plotting so the plot call cannot replace them.
ax.set_xlabel('Importance %', fontsize=16)
ax.set_ylabel('Features', fontsize=16)
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=15)

# Write the rounded percentage just past the end of each bar.
for position, value in enumerate(top_importances.to_numpy()):
    ax.text(value + 0.45, position, str(np.round(value)), fontsize=12)

In [None]:
# Score the live table with the random forest and the persisted scaler/label map.
live_df = pd.read_csv('data/live_preds.csv')
live_df = live_df.drop(columns=remove_cols + ['predicted_direction'])

scaler_obj = joblib.load('data/model/scaler_obj.pkl')
y_map = joblib.load('data/model/y_map.pkl')

# Kept under its own name so the training matrix `x_scaled` is not overwritten;
# otherwise re-running the train/validation split after this cell would pick up
# the live features instead of the training features.
live_x_scaled = scaler_obj.transform(live_df.values)

# A forest fitted on one-hot labels returns a 0/1 matrix from predict(); a row
# with no column set to 1 would be sent to class 0 by argmax. Rank the per-class
# probabilities instead (one (n_samples, 2) array per class column; column 1 is
# taken as the probability of that class).
class_proba = clf.predict_proba(live_x_scaled)
if isinstance(class_proba, list):
    predictions = np.column_stack([p[:, 1] for p in class_proba])
else:
    predictions = class_proba
rounded_predictions = np.argmax(predictions, axis = -1)

# Decode the integer class codes back to their labels.
live_df['predictions'] = [y_map[k] for k in rounded_predictions]

print(y_map)
live_df['predictions'].value_counts()