# Modelling

## Packages

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import winsound
import datetime as dt

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

from keras.utils import np_utils

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy

from imblearn.over_sampling import SMOTE

from matplotlib import pyplot as plt

# Report how many GPU devices TensorFlow lists in this environment.
print(f" Found and Using {len(tf.config.experimental.list_physical_devices('GPU'))} GPU")

 Found and Using 1 GPU


## Functions

In [2]:
def scale_x_encode_y(x, y, scaler=None, encoder=None):
    """Scale the features and one-hot encode the labels.

    Parameters
    ----------
    x : pandas.DataFrame or array-like
        Feature matrix, one row per sample.
    y : array-like
        Class labels, one per sample.
    scaler : optional
        An ALREADY FITTED transformer exposing ``transform``. When given it is
        only applied (never refit), so held-out data can be scaled with the
        statistics learned on the training data. When omitted, a new
        ``StandardScaler`` is fitted on ``x`` (the original behaviour).
    encoder : optional
        An ALREADY FITTED ``LabelEncoder``. When given it is only applied, so
        the index -> label mapping is shared between data sets. When omitted,
        a new ``LabelEncoder`` is fitted on ``y`` (the original behaviour).

    Returns
    -------
    tuple
        ``(x_scaled, y_encoded, y_map)`` where ``x_scaled`` is the scaled
        feature array, ``y_encoded`` is the one-hot label matrix and ``y_map``
        maps each encoded class index to its original label.
    """
    # Works for DataFrames and plain arrays alike (the original needed `.values`).
    x_values = np.asarray(x)

    if scaler is None:
        x_scaled = StandardScaler().fit_transform(x_values)
    else:
        x_scaled = scaler.transform(x_values)

    if encoder is None:
        encoder = LabelEncoder()
        encoder.fit(y)

    y_encoded = encoder.transform(y)
    # Pin the one-hot width to the number of classes the encoder knows, so a
    # data set that happens to lack the last class still gets every column.
    y_encoded = np_utils.to_categorical(y_encoded, num_classes=len(encoder.classes_))

    y_map = dict(zip(encoder.transform(encoder.classes_), encoder.classes_))

    return (x_scaled, y_encoded, y_map)

In [11]:
def smote_data(x, y, random_state=None):
    """Oversample the data with SMOTE and return the resampled pair.

    Parameters
    ----------
    x : array-like or pandas.DataFrame
        Feature matrix.
    y : array-like or pandas.Series
        Class labels aligned with ``x``.
    random_state : int or None, optional
        Seed forwarded to ``SMOTE`` so the synthetic samples are reproducible
        between runs. The default ``None`` keeps the original unseeded
        behaviour.

    Returns
    -------
    tuple
        ``(smote_x, smote_y)`` as returned by ``SMOTE.fit_resample``.
    """
    oversample = SMOTE(random_state=random_state)
    smote_x, smote_y = oversample.fit_resample(x, y)

    return (smote_x, smote_y)

In [7]:
def split_data(df):
    """Copy ``df`` and cut the copy into two consecutive halves by row position.

    The first half holds the leading ``len(df) // 2`` rows and the second half
    holds the remaining rows, so an odd row count leaves the extra row in the
    second half. The size of each piece is printed.

    Returns
    -------
    tuple
        ``(df_tot, df1, df2)``: the full copy, its first half, its second half.
    """
    whole = df.copy()
    midpoint = len(whole) // 2

    first_half = whole.iloc[:midpoint]
    second_half = whole.iloc[midpoint:]

    # Report the sizes in the same order the pieces are returned.
    pieces = (whole, first_half, second_half)
    for label, piece in zip(('df_tot', 'df1', 'df2'), pieces):
        print(f'{label} count : {len(piece)}')

    return pieces

In [12]:
def data_prep(df, apply_smote = False, target = None):
    """Split a frame into features/labels and return model-ready arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing the label column plus the feature columns.
    apply_smote : bool, optional
        When True the data is oversampled with ``smote_data`` before scaling.
    target : str or None, optional
        Name of the label column. When omitted, the notebook-level
        ``target_col`` variable is used (the original behaviour), which must
        then be defined before this function is called.

    Returns
    -------
    tuple
        ``(x_scaled, y_encoded, y_map)`` exactly as produced by
        ``scale_x_encode_y``.

    Side effects
    ------------
    Prints the record count and the absolute and relative class frequencies
    of the (possibly resampled) labels.
    """
    label_col = target_col if target is None else target

    x = df.loc[:, df.columns != label_col]
    y = df[label_col]

    # Oversampling happens on the unscaled features, before standardisation.
    if apply_smote:
        x, y = smote_data(x, y)

    print(f'Record count : {len(y)}')
    print('--------------------------')
    print(y.value_counts())
    print('--------------------------')
    print(y.value_counts(normalize=True))

    # The original also built `pd.DataFrame(x_scaled).head()` here and threw
    # the result away; inside a function that displays nothing, so it is gone.
    x_scaled, y_encoded, y_map = scale_x_encode_y(x, y)

    return (x_scaled, y_encoded, y_map)

## Read Data

In [13]:
# Alternative input files (disabled): 'data/IRIS.csv', 'data/rolled_df.csv'
target_col = 'target'
df = pd.read_csv('data/tab_df.csv')

# Total row count, then absolute followed by relative class frequencies.
print(f'Record count : {len(df)}')
for as_fraction in (False, True):
    print('--------------------------')
    print(df[target_col].value_counts(normalize=as_fraction))

df.head(5)

Record count : 186166
--------------------------
same        180369
decrease      2992
increase      2805
Name: target, dtype: int64
--------------------------
same        0.968861
decrease    0.016072
increase    0.015067
Name: target, dtype: float64


Unnamed: 0,tick,diff,gain,loss,avg_gain,avg_loss,rs,rsi,ssma,lsma,sema,lema,target
0,1.14541,9e-05,9e-05,0.0,2.9e-05,4.3e-05,0.683333,40.594059,1.1453,1.146044,1.145337,1.145868,same
1,1.14547,6e-05,6e-05,0.0,3.4e-05,4.3e-05,0.783333,43.925234,1.145296,1.146039,1.145361,1.145859,same
2,1.14541,-6e-05,0.0,6e-05,3.4e-05,4.7e-05,0.712121,41.59292,1.145295,1.146032,1.14537,1.145849,same
3,1.14533,-8e-05,0.0,8e-05,3.3e-05,5.3e-05,0.621622,38.333333,1.145301,1.146019,1.145363,1.145837,same
4,1.14526,-7e-05,0.0,7e-05,3.1e-05,5.8e-05,0.54321,35.2,1.145292,1.146011,1.145344,1.145824,same


In [14]:
# Keep a full copy plus the first and second halves of the rows (see split_data).
df_tot, df1, df2 = split_data(df)

df_tot count : 186166
df1 count : 93083
df2 count : 93083


In [25]:
# Prepare the first half (df1) without SMOTE: prints its class balance and
# returns scaled features, one-hot labels and the index -> label mapping.
x_scaled, y_encoded, y_map = data_prep(df1, apply_smote = False)

Record count : 93083
--------------------------
same        89673
decrease     1813
increase     1597
Name: target, dtype: int64
--------------------------
same        0.963366
decrease    0.019477
increase    0.017157
Name: target, dtype: float64


In [26]:
# Same preparation for the second half (df2).
# NOTE(review): each data_prep call fits its own StandardScaler and
# LabelEncoder, so df2 is standardised with df2's statistics rather than the
# ones learned on df1 - confirm this is intended for held-out evaluation.
x_scaled1, y_encoded1, y_map1 = data_prep(df2, apply_smote = False)

Record count : 93083
--------------------------
same        90696
increase     1208
decrease     1179
Name: target, dtype: int64
--------------------------
same        0.974356
increase    0.012978
decrease    0.012666
Name: target, dtype: float64


## Train test split

In [27]:
# 80 % of the prepared df1 rows go to training, the remainder to validation;
# the fixed seed makes the shuffle repeatable.
# NOTE(review): x_scaled was standardised on all of df1 before this split, so
# validation rows contributed to the scaling statistics - confirm acceptable.
# NOTE(review): the split is not stratified although the classes are heavily
# imbalanced - consider `stratify=`.
train_x, valid_x, train_y, valid_y = train_test_split(
    x_scaled,
    y_encoded,
    train_size=0.8,
    random_state=1,
)

## Neural Network

### Network building

In [28]:
# Layer widths are read from the prepared arrays instead of being hard-coded:
# one input per feature column, one softmax output per one-hot label column.
n_features = train_x.shape[1]
n_classes = train_y.shape[1]

model = Sequential([
    Dense(units = 16, activation='relu', input_shape=(n_features,)),
    Dense(units = 32, activation='relu'),
    Dense(units = n_classes, activation='softmax')
])

In [29]:
# Multi-class configuration: categorical cross-entropy on the one-hot targets,
# Adam as optimiser, accuracy as the reported metric.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [30]:
# Print the layer-by-layer architecture with parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 16)                208       
_________________________________________________________________
dense_4 (Dense)              (None, 32)                544       
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 99        
Total params: 851
Trainable params: 851
Non-trainable params: 0
_________________________________________________________________


### Parameters

In [31]:
# Settings for the training cell: number of epochs and mini-batch size.
epoch_val         = 5
batch_size_val    = 256

# Logging verbosity and worker count.
# NOTE(review): Keras documents `workers` as applying only to generator /
# Sequence inputs, and -1 is not a documented value - confirm it is needed.
verbose_val       = 2
workers_val       = -1

### Training

In [32]:
# Train on the in-memory arrays and keep the returned History object so the
# per-epoch metrics stay available (and its repr no longer becomes the cell
# output).
# `workers` is deliberately not passed: Keras applies it only to generator /
# Sequence inputs, so it had no effect here, and newer Keras releases reject
# the argument in fit().
history = model.fit(
    x=train_x,
    y=train_y,
    epochs=epoch_val,
    batch_size=batch_size_val,
    validation_data=(valid_x, valid_y),
    verbose=verbose_val,
)

Epoch 1/5
291/291 - 3s - loss: 0.2543 - accuracy: 0.9458 - val_loss: 0.1219 - val_accuracy: 0.9669
Epoch 2/5
291/291 - 2s - loss: 0.1045 - accuracy: 0.9672 - val_loss: 0.0860 - val_accuracy: 0.9712
Epoch 3/5
291/291 - 2s - loss: 0.0889 - accuracy: 0.9694 - val_loss: 0.0800 - val_accuracy: 0.9732
Epoch 4/5
291/291 - 2s - loss: 0.0855 - accuracy: 0.9698 - val_loss: 0.0782 - val_accuracy: 0.9738
Epoch 5/5
291/291 - 2s - loss: 0.0841 - accuracy: 0.9700 - val_loss: 0.0777 - val_accuracy: 0.9739


<tensorflow.python.keras.callbacks.History at 0x27d8c9006d8>

### DNN Results

In [33]:
# Validation report: collapse the softmax rows and the one-hot label rows to
# class indices, then print per-class precision / recall / F1.
predictions = model.predict(valid_x)
rounded_predictions = predictions.argmax(axis=-1)
rounded_valid_y = valid_y.argmax(axis=-1)

print(
    classification_report(
        rounded_valid_y,
        rounded_predictions,
        target_names=list(y_map.values()),
    )
)

              precision    recall  f1-score   support

    decrease       0.70      0.41      0.52       353
    increase       0.80      0.38      0.51       304
        same       0.98      0.99      0.99     17960

    accuracy                           0.97     18617
   macro avg       0.83      0.60      0.67     18617
weighted avg       0.97      0.97      0.97     18617



In [None]:
# Report on the second half of the data (df2), which the model was not fitted on.
# NOTE(review): this overwrites `predictions`, `rounded_predictions` and
# `rounded_valid_y` from the validation cell; `rounded_valid_y` holds the df2
# labels from here on.
# NOTE(review): x_scaled1 / y_map1 come from a scaler and encoder fitted on df2
# itself, not the ones used for training - confirm the comparison is valid.
predictions = model.predict(x_scaled1)
rounded_predictions = predictions.argmax(axis=-1)
rounded_valid_y = y_encoded1.argmax(axis=-1)

print(
    classification_report(
        rounded_valid_y,
        rounded_predictions,
        target_names=list(y_map1.values()),
    )
)