# Modelling

## Packages

In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import winsound
import datetime as dt

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder, normalize
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
import joblib

import xgboost

import tensorflow
from tensorflow.keras import layers
from tensorflow.keras import utils
from tensorflow.keras.models import load_model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.losses import CategoricalCrossentropy, BinaryCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy, BinaryAccuracy
#from tensorflow.nn import relu, softmax

from imblearn.over_sampling import SMOTE

from matplotlib import pyplot as plt

print(f" Found and Using {len(tensorflow.config.experimental.list_physical_devices('GPU'))} GPU")

 Found and Using 1 GPU


## Functions

In [2]:
def scale_x_encode_y(x, y):

    scaler_obj = StandardScaler()
    #scaler_obj = MinMaxScaler()
    scaler_obj.fit(x.values)
    x_scaled = scaler_obj.transform(x.values)
    #x_scaled = x.values
    
    encoder = LabelEncoder()
    encoder.fit(y)
    y_encoded = encoder.transform(y)
    y_encoded = utils.to_categorical(y_encoded)
    
    y_map = dict(zip(encoder.transform(encoder.classes_),encoder.classes_))
 
    return(x_scaled, y_encoded, y_map, scaler_obj)


def smote_data(x, y):
    oversample = SMOTE()
    smote_x, smote_y = oversample.fit_resample(x, y)

    return(smote_x, smote_y)

def split_data(df, divisor):
    df_tot = df.copy()
    print(f'df_tot count : {len(df_tot)}')

    df1 = df_tot.iloc[int(len(df_tot)/divisor):]
    print(f'df1 count : {len(df1)}')

    df2 = df_tot.iloc[:int(len(df_tot)/divisor)]
    print(f'df2 count : {len(df2)}')
    
    return(df_tot, df1, df2)

def data_prep(df, apply_smote = False):
    x = df.loc[:, df.columns != target_col]
    y = df[target_col]

    if apply_smote:
        x, y = smote_data(x, y)

    print(f'Record count : {len(y)}')
    print('--------------------------')
    print(y.value_counts())
    print('--------------------------')
    print(y.value_counts(normalize=True))
    
    x_scaled, y_encoded, y_map, scaler_obj = scale_x_encode_y(x, y)
    pd.DataFrame(x_scaled).head()
    
    return(x_scaled, y_encoded, y_map, scaler_obj)

## Read Data

In [3]:
#remove_cols = ['tick_avg', 'sema', 'ssma', 'lema', 'lsma', 'max_tick', 'min_tick', 'small_sema_slope', 'long_sema_slope']
remove_cols = ['tick_act','tick_avg', 'sema', 'ssma', 'lema', 'lsma', 'max_tick', 'min_tick', 'rs']

In [4]:
df = pd.read_csv('data/yearly_tick_data/tab_2020.csv')
df.drop(remove_cols, axis=1, inplace=True)
df.head(5)

Unnamed: 0,weekday,hour,spread_avg,tick_sd,candle_height,sema_diff,lema_diff,top_diff,bottom_diff,diff,...,sma_diff,max_gap,min_gap,ema_diff,small_sema_slope,long_sema_slope,slope_diff,overall_dir,dir_val,direction
0,3,1,3e-05,8e-05,0.00026,7e-05,3e-05,2e-05,0.00024,0.00016,...,-7e-05,0.0,-0.00029,-2e-05,-73.63685,21.70225,-95.3391,1,0,same
1,3,1,2e-05,6e-05,0.00025,0.00011,6e-05,1e-05,0.00025,0.00018,...,-7e-05,0.0,-0.00047,3e-05,67.66099,-23.89354,91.55453,1,0,increase
2,3,1,3e-05,5e-05,0.00024,0.00017,9e-05,3e-05,0.00021,0.00021,...,3e-05,0.0,-0.00068,0.00011,83.36255,12.4988,70.86374,2,1,increase
3,3,1,2e-05,4e-05,0.00013,0.00013,8e-05,7e-05,6e-05,6e-05,...,0.00014,0.0,-0.00061,0.00016,85.4042,62.21494,23.18926,3,2,same
4,3,1,2e-05,2e-05,0.0001,7e-05,6e-05,6e-05,4e-05,-3e-05,...,0.00023,3e-05,-0.00042,0.00017,85.51911,76.68733,8.83178,3,0,same


In [5]:
target_col = 'direction'
x_scaled, y_encoded, y_map, scaler_obj = data_prep(df, apply_smote = False)
joblib.dump(scaler_obj, 'data/model/scaler_obj.pkl') 
joblib.dump(y_map, 'data/model/y_map.pkl') 

Record count : 109200
--------------------------
same        73270
increase    18039
decrease    17891
Name: direction, dtype: int64
--------------------------
same        0.670971
increase    0.165192
decrease    0.163837
Name: direction, dtype: float64


['data/model/y_map.pkl']

## Neural Network

### Network building

In [6]:
train_x, valid_x, train_y, valid_y = train_test_split(x_scaled, y_encoded,train_size = 0.8,random_state = 1)

In [7]:
model = Sequential([
    Dense(units = 2048, activation='relu', input_shape=(len(train_x[0]),)),
    Dense(units = 1024, activation='relu',),
    Dense(units = 512, activation='relu'),    
    Dense(units = 256, activation='relu'),
    Dense(units = 128, activation='relu'),
    Dense(units = 3, activation='softmax')
])

model.compile(    
    loss=CategoricalCrossentropy(),
    optimizer=Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, decay=0.01),
    metrics=CategoricalAccuracy())

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 2048)              51200     
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              2098176   
_________________________________________________________________
dense_2 (Dense)              (None, 512)               524800    
_________________________________________________________________
dense_3 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_4 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 387       
Total params: 2,838,787
Trainable params: 2,838,787
Non-trainable params: 0
______________________________________________

### Training

In [8]:
epoch_val         = 10
batch_size_val    = 128

verbose_val       = 2
workers_val       = -1

In [9]:
model.fit(x=train_x, 
          y=train_y, 
          epochs=epoch_val,
          batch_size = batch_size_val,
          validation_data = (valid_x, valid_y),
          workers= workers_val,
          verbose=verbose_val)

Epoch 1/10
683/683 - 2s - loss: 0.5363 - categorical_accuracy: 0.7668 - val_loss: 0.5272 - val_categorical_accuracy: 0.7713
Epoch 2/10
683/683 - 2s - loss: 0.5141 - categorical_accuracy: 0.7756 - val_loss: 0.5247 - val_categorical_accuracy: 0.7720
Epoch 3/10
683/683 - 2s - loss: 0.5117 - categorical_accuracy: 0.7764 - val_loss: 0.5241 - val_categorical_accuracy: 0.7735
Epoch 4/10
683/683 - 2s - loss: 0.5102 - categorical_accuracy: 0.7774 - val_loss: 0.5234 - val_categorical_accuracy: 0.7736
Epoch 5/10
683/683 - 2s - loss: 0.5094 - categorical_accuracy: 0.7770 - val_loss: 0.5232 - val_categorical_accuracy: 0.7724
Epoch 6/10
683/683 - 2s - loss: 0.5086 - categorical_accuracy: 0.7775 - val_loss: 0.5232 - val_categorical_accuracy: 0.7722
Epoch 7/10
683/683 - 2s - loss: 0.5081 - categorical_accuracy: 0.7778 - val_loss: 0.5228 - val_categorical_accuracy: 0.7737
Epoch 8/10
683/683 - 2s - loss: 0.5077 - categorical_accuracy: 0.7779 - val_loss: 0.5227 - val_categorical_accuracy: 0.7729
Epoch 9/

<tensorflow.python.keras.callbacks.History at 0x27adab3fa90>

### DNN

In [10]:
predictions = model.predict(valid_x)
rounded_predictions = np.argmax(predictions, axis = -1)
rounded_valid_y = np.argmax(valid_y, axis = -1)

print(classification_report(rounded_valid_y, rounded_predictions, target_names = y_map.values()))

              precision    recall  f1-score   support

    decrease       0.71      0.54      0.61      3561
    increase       0.70      0.53      0.60      3673
        same       0.79      0.89      0.84     14606

    accuracy                           0.77     21840
   macro avg       0.74      0.65      0.69     21840
weighted avg       0.77      0.77      0.76     21840



In [11]:
model.save('data/model/model_dnn.h5') 
model_new = load_model('data/model/model_dnn.h5')

### DNN Prediction

In [12]:
df1 = pd.read_csv('data/yearly_tick_data/tab_2019.csv')
df1.drop(remove_cols, axis=1, inplace=True)

x_scaled1, y_encoded1, y_map1, scaler_obj_1 = data_prep(df1, apply_smote = False)

predictions = model_new.predict(x_scaled1)
rounded_predictions = np.argmax(predictions, axis = -1)
rounded_valid_y = np.argmax(y_encoded1, axis = -1)

df1['predictions'] = [y_map1[k] for k in rounded_predictions]
#df1.to_csv('data/tab_tick_2018_res.csv')

print(classification_report(rounded_valid_y, rounded_predictions, target_names = y_map1.values()))

fig, ax = plt.subplots(figsize=(8, 8))
cm = confusion_matrix(rounded_valid_y, rounded_predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=y_map1.values())
disp.plot(ax=ax)

Record count : 97275
--------------------------
same        83200
decrease     7307
increase     6768
Name: direction, dtype: int64
--------------------------
same        0.855307
decrease    0.075117
increase    0.069576
Name: direction, dtype: float64


ValueError: in user code:

    C:\Users\91989\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\keras\engine\training.py:1462 predict_function  *
        return step_function(self, iterator)
    C:\Users\91989\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\keras\engine\training.py:1452 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\91989\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\distribute\distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\91989\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\distribute\distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\91989\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\distribute\distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\91989\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\keras\engine\training.py:1445 run_step  **
        outputs = model.predict_step(data)
    C:\Users\91989\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\keras\engine\training.py:1418 predict_step
        return self(x, training=False)
    C:\Users\91989\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\keras\engine\base_layer.py:976 __call__
        self.name)
    C:\Users\91989\AppData\Roaming\Python\Python36\site-packages\tensorflow\python\keras\engine\input_spec.py:216 assert_input_compatibility
        ' but received input with shape ' + str(shape))

    ValueError: Input 0 of layer sequential is incompatible with the layer: expected axis -1 of input shape to have value 24 but received input with shape [None, 22]


In [None]:
live_df = pd.read_csv('data/live_preds.csv')
live_df.drop(remove_cols, axis=1, inplace=True)
del live_df['predicted_direction']

model_new = load_model('data/model/model_dnn.h5')    
scaler_obj = joblib.load('data/model/scaler_obj.pkl')
y_map = joblib.load('data/model/y_map.pkl') 

x_scaled = scaler_obj.transform(live_df.values)
predictions = model_new.predict(x_scaled) 
rounded_predictions = np.argmax(predictions, axis = -1)

#live_df['predictions'] = y_map[data['rounded_predictions'][0]]
live_df['predictions'] = [y_map[k] for k in rounded_predictions]

print(y_map)
live_df['predictions'].value_counts()