# Modelling

## Packages

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import winsound
import datetime as dt

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

from keras.utils import np_utils

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy

from imblearn.over_sampling import SMOTE

from matplotlib import pyplot as plt

print(f" Found and Using {len(tf.config.experimental.list_physical_devices('GPU'))} GPU")

 Found and Using 1 GPU


## Functions

In [2]:
def scale_x_encode_y(x, y):

    x_scaled = StandardScaler().fit_transform(x.values)    
    
    encoder = LabelEncoder()
    encoder.fit(y)
    y_encoded = encoder.transform(y)
    y_encoded = np_utils.to_categorical(y_encoded)
    
    y_map = dict(zip(encoder.transform(encoder.classes_),encoder.classes_))
 
    return(x_scaled, y_encoded, y_map)

def smote_data(x, y):
    oversample = SMOTE()
    smote_x, smote_y = oversample.fit_resample(x, y)

    return(smote_x, smote_y)

def split_data(df, divisor):
    df_tot = df.copy()
    print(f'df_tot count : {len(df_tot)}')

    df1 = df_tot.iloc[int(len(df_tot)/divisor):]
    print(f'df1 count : {len(df1)}')

    df2 = df_tot.iloc[:int(len(df_tot)/divisor)]
    print(f'df2 count : {len(df2)}')
    
    return(df_tot, df1, df2)

def data_prep(df, apply_smote = False):
    x = df.loc[:, df.columns != target_col]
    y = df[target_col]

    if apply_smote:
        x, y = smote_data(x, y)

    print(f'Record count : {len(y)}')
    print('--------------------------')
    print(y.value_counts())
    print('--------------------------')
    print(y.value_counts(normalize=True))
    
    x_scaled, y_encoded, y_map = scale_x_encode_y(x, y)
    pd.DataFrame(x_scaled).head()
    
    return(x_scaled, y_encoded, y_map)

## Read Data

In [3]:
col_list = ['Open', 'High', 'Low', 'Close', 
 'diff', 'gain', 'loss', 'avg_gain', 'avg_loss', 
 'rs', 'rsi', 
 'ssma', 'lsma', 'sma_diff', 
 'sema', 'lema', 'ema_diff', 
 'slope_s', 
 'target']

In [4]:
col_list = ['diff', 'avg_gain', 'avg_loss', 
 'rs', 'rsi', 
 'sma_diff', 
 'ema_diff', 
 'slope_s', 
 'target']

In [5]:
#df = pd.read_csv('data/tab_df.csv')
#df = pd.read_csv('data/tab_M1_2019.csv')
df = pd.read_csv('data/tab_tick_2019.csv')

#df = df[col_list]

df.head(5)

Unnamed: 0,avg_gain,avg_loss,close,diff,direction,gain,high,high_diff,loss,low,low_diff,open,rs,rsi
0,4.5e-05,8.3e-05,1.14632,-1.5e-05,decrease,0.0,1.146425,3e-05,0.00015,1.146315,0.000105,1.146335,0.551515,35.546875
1,6.3e-05,8.3e-05,1.14623,-9e-05,increase,0.000175,1.14632,-0.000105,0.0,1.14617,-0.000145,1.146315,0.763636,43.298969
2,6.3e-05,5.4e-05,1.14639,0.00016,same,0.0,1.146405,8.5e-05,0.0,1.146205,3.5e-05,1.14624,1.166667,53.846154
3,8.4e-05,3.2e-05,1.14632,-7e-05,increase,0.000215,1.146405,0.0,0.0,1.14631,0.000105,1.146385,2.640625,72.532189
4,6.5e-05,3.2e-05,1.146315,-5e-06,same,0.0,1.146535,0.00013,0.0,1.146215,-9.5e-05,1.146315,2.03125,67.010309


In [6]:
target_col = 'direction'
x_scaled, y_encoded, y_map = data_prep(df)

Record count : 486429
--------------------------
same        332616
decrease     77311
increase     76502
Name: direction, dtype: int64
--------------------------
same        0.683791
decrease    0.158936
increase    0.157273
Name: direction, dtype: float64


## Neural Network

### Network building

In [7]:
train_x, valid_x, train_y, valid_y = train_test_split(x_scaled, y_encoded,train_size = 0.8,random_state = 1)

In [8]:
model = Sequential([
    Dense(units = 16, activation='relu', input_shape=[len(train_x[0])]),
    Dense(units = 32, activation='relu'),
    Dense(units = 64, activation='relu'),
    Dense(units = 32, activation='relu'),
    Dense(units = 16, activation='relu'),
    Dense(units = 3, activation='softmax')
])

model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),    
    metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 16)                224       
_________________________________________________________________
dense_1 (Dense)              (None, 32)                544       
_________________________________________________________________
dense_2 (Dense)              (None, 64)                2112      
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 51        
Total params: 5,539
Trainable params: 5,539
Non-trainable params: 0
______________________________________________________

### Training

In [9]:
epoch_val         = 10
batch_size_val    = 512

verbose_val       = 2
workers_val       = -1

In [10]:
model.fit(x=train_x, 
          y=train_y, 
          epochs=epoch_val,
          batch_size = batch_size_val,
          validation_data = (valid_x, valid_y),
          workers= workers_val,
         verbose=verbose_val)

Epoch 1/10
761/761 - 5s - loss: 0.0441 - accuracy: 0.9866 - val_loss: 5.1967e-04 - val_accuracy: 0.9999
Epoch 2/10
761/761 - 4s - loss: 1.7177e-04 - accuracy: 1.0000 - val_loss: 1.1005e-04 - val_accuracy: 1.0000
Epoch 3/10
761/761 - 4s - loss: 1.0753e-05 - accuracy: 1.0000 - val_loss: 4.5812e-06 - val_accuracy: 1.0000
Epoch 4/10
761/761 - 4s - loss: 2.9599e-06 - accuracy: 1.0000 - val_loss: 2.9166e-06 - val_accuracy: 1.0000
Epoch 5/10
761/761 - 4s - loss: 1.7262e-06 - accuracy: 1.0000 - val_loss: 1.6027e-06 - val_accuracy: 1.0000
Epoch 6/10
761/761 - 4s - loss: 1.0688e-06 - accuracy: 1.0000 - val_loss: 9.7699e-07 - val_accuracy: 1.0000
Epoch 7/10
761/761 - 4s - loss: 6.7451e-07 - accuracy: 1.0000 - val_loss: 6.1165e-07 - val_accuracy: 1.0000
Epoch 8/10
761/761 - 4s - loss: 4.2968e-07 - accuracy: 1.0000 - val_loss: 4.0018e-07 - val_accuracy: 1.0000
Epoch 9/10
761/761 - 4s - loss: 2.7697e-07 - accuracy: 1.0000 - val_loss: 2.5874e-07 - val_accuracy: 1.0000
Epoch 10/10
761/761 - 4s - loss:

<tensorflow.python.keras.callbacks.History at 0x188e5005f60>

### DNN Validation

In [None]:
predictions = model.predict(valid_x)
rounded_predictions = np.argmax(predictions, axis = -1)
rounded_valid_y = np.argmax(valid_y, axis = -1)

print(classification_report(rounded_valid_y, rounded_predictions, target_names = y_map.values()))

### DNN Prediction

In [None]:
df1 = pd.read_csv('data/tab_M1_2018.csv')
df1 = df1[col_list]
x_scaled1, y_encoded1, y_map1 = data_prep(df1, apply_smote = False)

predictions = model.predict(x_scaled1)
rounded_predictions = np.argmax(predictions, axis = -1)
rounded_valid_y = np.argmax(y_encoded1, axis = -1)

df1['predictions'] = [y_map1[k] for k in rounded_predictions]
df1.to_csv('data/tab_M1_2018_res.csv')

print(classification_report(rounded_valid_y, rounded_predictions, target_names = y_map1.values()))

## RF Results

In [None]:
%%time
clf = RandomForestClassifier() 
clf.fit(train_x, train_y)
predictions = clf.predict(valid_x)
rounded_predictions = np.argmax(predictions, axis = -1)
rounded_valid_y = np.argmax(valid_y, axis = -1)
print(classification_report(rounded_valid_y, rounded_predictions, target_names = y_map.values()))

In [None]:
x_cols = df.columns[df.columns != 'direction']
fig = plt.figure()
fig.suptitle('Feature importance', fontsize=20)
plt.xlabel('Importance %', fontsize=16)
plt.ylabel('Features', fontsize=16)
plt.tick_params(axis='x', labelsize=12)
plt.tick_params(axis='y', labelsize=15)


feat_importances = pd.Series(clf.feature_importances_ * 100, index=x_cols)
feat_importances.nlargest(24).plot(kind='barh', figsize=(15, 10), rot=45)

for index, value in enumerate(np.round(-np.sort(-clf.feature_importances_ * 100))):
    plt.text(value+0.45, index, str(value), fontsize=12)