# Modelling

## Packages

In [17]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import winsound
import datetime as dt

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

from keras.utils import np_utils

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy

from imblearn.over_sampling import SMOTE

from matplotlib import pyplot as plt

# Ask TensorFlow which GPU devices it can see and report how many were found.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print(f" Found and Using {len(gpu_devices)} GPU")

 Found and Using 1 GPU


## Functions

In [18]:
def scale_x_encode_y(x, y):
    """Standardise the features and one-hot encode the class labels.

    Parameters
    ----------
    x : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Numeric feature matrix.
    y : array-like of shape (n_samples,)
        Class labels.

    Returns
    -------
    x_scaled : numpy.ndarray
        ``x`` after column-wise standardisation with ``StandardScaler``.
    y_encoded : numpy.ndarray
        One-hot label matrix with one column per class, in ``LabelEncoder``
        class order.
    y_map : dict
        Maps each integer class index to the original label.

    Notes
    -----
    The scaler is fitted on everything passed in and is not returned, so the
    same transformation cannot be re-applied to new data later.
    """
    # np.asarray accepts DataFrames and plain arrays alike, whereas ``x.values``
    # exists only on pandas objects.
    x_scaled = StandardScaler().fit_transform(np.asarray(x))

    encoder = LabelEncoder()
    y_indices = encoder.fit_transform(y)

    # Use the to_categorical shipped with tf.keras (the same Keras that builds
    # the model) instead of the standalone ``keras.utils.np_utils`` module,
    # which recent Keras releases no longer provide.
    y_encoded = tf.keras.utils.to_categorical(
        y_indices, num_classes=len(encoder.classes_)
    )

    # LabelEncoder assigns index i to classes_[i], so enumerating classes_
    # yields the index -> label mapping without a second transform call.
    y_map = dict(enumerate(encoder.classes_))

    return x_scaled, y_encoded, y_map

In [19]:
def smote_data(x, y, random_state=1):
    """Balance the classes with SMOTE and print the resulting class counts.

    Parameters
    ----------
    x : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Feature matrix.
    y : pandas.Series or array-like of shape (n_samples,)
        Class labels.
    random_state : int or None, default 1
        Seed handed to ``SMOTE`` so that re-running the notebook generates the
        same synthetic rows. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    smote_x, smote_y
        The resampled features and labels as returned by
        ``SMOTE.fit_resample``.
    """
    oversample = SMOTE(random_state=random_state)
    smote_x, smote_y = oversample.fit_resample(x, y)

    # value_counts is a pandas method; wrapping keeps the report working when
    # the labels come back as a plain array. The returned labels are untouched.
    label_series = pd.Series(smote_y)

    print(f'Record count : {len(smote_y)}')
    print('--------------------------')
    print(label_series.value_counts())
    print('--------------------------')
    print(label_series.value_counts(normalize=True))

    return smote_x, smote_y

## Read Data

In [20]:
# Input table for the notebook. Alternative inputs (swap the path to use them):
#   'data/IRIS.csv', 'data/rolled_df.csv'
df = pd.read_csv(filepath_or_buffer='data/tab_df.csv')

In [21]:
target_col = 'target'

# Class balance of the raw data: absolute counts first, then fractions.
print(f'Record count : {len(df)}')
for as_fraction in (False, True):
    print('--------------------------')
    print(df[target_col].value_counts(normalize=as_fraction))

# Labels are the target column; features are every other column.
y = df[target_col]
x = df.drop(columns=[target_col])

df.head(5)

Record count : 186166
--------------------------
same        180369
decrease      2992
increase      2805
Name: target, dtype: int64
--------------------------
same        0.968861
decrease    0.016072
increase    0.015067
Name: target, dtype: float64


Unnamed: 0,tick,diff,gain,loss,avg_gain,avg_loss,rs,rsi,ssma,lsma,sema,lema,target
0,1.14541,9e-05,9e-05,0.0,2.9e-05,4.3e-05,0.683333,40.594059,1.1453,1.146044,1.145337,1.145868,same
1,1.14547,6e-05,6e-05,0.0,3.4e-05,4.3e-05,0.783333,43.925234,1.145296,1.146039,1.145361,1.145859,same
2,1.14541,-6e-05,0.0,6e-05,3.4e-05,4.7e-05,0.712121,41.59292,1.145295,1.146032,1.14537,1.145849,same
3,1.14533,-8e-05,0.0,8e-05,3.3e-05,5.3e-05,0.621622,38.333333,1.145301,1.146019,1.145363,1.145837,same
4,1.14526,-7e-05,0.0,7e-05,3.1e-05,5.8e-05,0.54321,35.2,1.145292,1.146011,1.145344,1.145824,same


## Oversample minority classes with SMOTE

In [22]:
# Oversample so that every class has as many rows as the largest one.
# NOTE(review): this runs on the full dataset, before the train/validation
# split further down. Synthetic rows interpolated from what later become
# validation rows can therefore end up in the training set, and the validation
# set itself is class-balanced rather than ~97% 'same' like the raw data.
# Validation scores will be optimistic; consider splitting first and
# oversampling only the training portion.
smote_x, smote_y = smote_data(x=x, y=y)

Record count : 541107
--------------------------
same        180369
increase    180369
decrease    180369
Name: target, dtype: int64
--------------------------
same        0.333333
increase    0.333333
decrease    0.333333
Name: target, dtype: float64


## Scale and encode data

In [23]:
# Standardise the oversampled features and one-hot encode the labels;
# y_map keeps the class-index -> label lookup for the reports below.
x_scaled, y_encoded, y_map = scale_x_encode_y(x=smote_x, y=smote_y)

# Peek at the first rows of the scaled feature matrix.
pd.DataFrame(data=x_scaled).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.85158,0.301954,-0.07838,-0.555825,-0.822456,-0.646885,-0.019506,-0.414674,1.842539,1.894428,1.845742,1.882994
1,1.856249,0.200705,-0.236844,-0.555825,-0.764301,-0.646885,-0.0195,-0.269236,1.842228,1.894033,1.847623,1.882289
2,1.85158,-0.204292,-0.553772,-0.236101,-0.764301,-0.587371,-0.019504,-0.371064,1.84215,1.893529,1.848313,1.881496
3,1.845355,-0.271792,-0.553772,-0.129527,-0.773994,-0.50802,-0.019509,-0.513377,1.842617,1.892506,1.847746,1.880582
4,1.839907,-0.238042,-0.553772,-0.182814,-0.793379,-0.438587,-0.019514,-0.650177,1.841917,1.891878,1.846292,1.879569


## Train test split

In [24]:
# 80/20 train/validation split with a fixed seed so the partition is repeatable.
# NOTE(review): x_scaled / y_encoded already contain the SMOTE-generated rows
# and were scaled as a whole, so the validation part is not an untouched
# hold-out of the original data.
train_x, valid_x, train_y, valid_y = train_test_split(
    x_scaled,
    y_encoded,
    train_size=0.8,
    random_state=1,
)

## Neural Network

### Network building

In [25]:
# Small fully-connected classifier: two ReLU hidden layers (16 and 32 units)
# followed by a 3-way softmax, one output per target class.
model = Sequential()
model.add(Dense(units=16, activation='relu', input_shape=(train_x.shape[1],)))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=3, activation='softmax'))

In [26]:
# Categorical cross-entropy matches the one-hot targets and softmax output;
# accuracy is tracked alongside the loss during training.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [27]:
# Print the layer-by-layer output shapes and parameter counts as a sanity check before training.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 16)                208       
_________________________________________________________________
dense_4 (Dense)              (None, 32)                544       
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 99        
Total params: 851
Trainable params: 851
Non-trainable params: 0
_________________________________________________________________


### Parameters

In [28]:
# Training schedule: number of passes over the training set and mini-batch size.
epoch_val = 5
batch_size_val = 256

# Settings forwarded to model.fit(): verbose=2 logs one line per epoch;
# workers_val is handed to fit()'s `workers` argument.
verbose_val = 2
workers_val = -1

### Training

In [29]:
# Train the network and keep the returned History object so the per-epoch
# loss/accuracy can be inspected or plotted afterwards (it was previously
# discarded, leaving only its repr in the cell output).
#
# `workers` is deliberately not passed: Keras documents it as applying to
# generator / Sequence inputs only, so it had no effect on these in-memory
# arrays, -1 is not a meaningful worker count, and newer Keras releases no
# longer accept the argument in fit().
history = model.fit(
    x=train_x,
    y=train_y,
    epochs=epoch_val,
    batch_size=batch_size_val,
    validation_data=(valid_x, valid_y),
    verbose=verbose_val,
)

Epoch 1/5
1691/1691 - 4s - loss: 0.2650 - accuracy: 0.9097 - val_loss: 0.2260 - val_accuracy: 0.9202
Epoch 2/5
1691/1691 - 3s - loss: 0.2197 - accuracy: 0.9219 - val_loss: 0.2183 - val_accuracy: 0.9224
Epoch 3/5
1691/1691 - 4s - loss: 0.2124 - accuracy: 0.9244 - val_loss: 0.2116 - val_accuracy: 0.9252
Epoch 4/5
1691/1691 - 3s - loss: 0.2050 - accuracy: 0.9269 - val_loss: 0.2043 - val_accuracy: 0.9264
Epoch 5/5
1691/1691 - 4s - loss: 0.1963 - accuracy: 0.9302 - val_loss: 0.1955 - val_accuracy: 0.9298


<tensorflow.python.keras.callbacks.History at 0x1d58840e588>

### DNN Results

In [30]:
# Class probabilities from the network for every validation row.
predictions = model.predict(valid_x)

# Collapse both the predicted probabilities and the one-hot ground truth to
# integer class indices so they can be compared directly.
rounded_predictions = predictions.argmax(axis=-1)
rounded_valid_y = valid_y.argmax(axis=-1)

# y_map holds index -> label, so its values label the report rows.
dnn_report = classification_report(
    rounded_valid_y,
    rounded_predictions,
    target_names=y_map.values(),
)
print(dnn_report)

              precision    recall  f1-score   support

    decrease       0.92      0.96      0.94     36157
    increase       0.94      0.94      0.94     35972
        same       0.93      0.89      0.91     36093

    accuracy                           0.93    108222
   macro avg       0.93      0.93      0.93    108222
weighted avg       0.93      0.93      0.93    108222



## RF Results

%%time
clf = RandomForestClassifier() 
clf.fit(train_x, train_y)
predictions = clf.predict(valid_x)
rounded_predictions = np.argmax(predictions, axis = -1)
rounded_valid_y = np.argmax(valid_y, axis = -1)
print(classification_report(rounded_valid_y, rounded_predictions, target_names = y_map.values()))

### RF Feature importance

# Horizontal bar chart of the forest's most important features (in percent).
top_n = 15

# Feature names are every column except the target; use target_col rather than
# a second hard-coded copy of the column name.
x_cols = df.columns[df.columns != target_col]
feat_importances = pd.Series(clf.feature_importances_ * 100, index=x_cols)
top_importances = feat_importances.nlargest(top_n)

# Explicit figure/axes so titles, labels and bars all land on the same plot.
fig, ax = plt.subplots(figsize=(15, 10))
fig.suptitle('Feature importance', fontsize=20)
top_importances.plot(kind='barh', ax=ax, rot=45)
ax.set_xlabel('Importance %', fontsize=16)
ax.set_ylabel('Features', fontsize=16)
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=15)

# Annotate each bar with its rounded value. The labels are taken from the
# plotted series itself so they stay aligned with the bars and stop at top_n
# even when the model has more features than are shown.
for index, value in enumerate(np.round(top_importances.to_numpy())):
    ax.text(1.05 * value, index, str(value), fontsize=12)