### Load the dependencies

In [1]:
# import xlsxwriter
import pylightxl as xl
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pickle

import pn_config

from sklearn.preprocessing import MinMaxScaler

## Load the dataset

### Read the data file

In [2]:
# readxl returns a pylightxl database that holds all worksheets and its data
db = xl.readxl(fn=f'{pn_config.project_path}/{pn_config.excel_file}')

### Display the column headings of the dataset

In [3]:
print(db.ws(ws=f'{pn_config.sheet_name}').row(row=1))

['Time', 'Returns', 'dp', 'dy', 'ep', 'de', 'svar', 'bm', 'ntis', 'tbl', 'lty', 'ltr', 'tms', 'dfy', 'dfr', 'infl']


### Load the rows into a list

In [4]:
file_rows = []

for row in db.ws(ws=f'{pn_config.sheet_name}').rows:
       file_rows.append(row)


### Load the rows into a pandas dataframe

In [5]:
df = pd.DataFrame(file_rows[1:])
df.columns = file_rows[0]
df.head()

Unnamed: 0,Time,Returns,dp,dy,ep,de,svar,bm,ntis,tbl,lty,ltr,tms,dfy,dfr,infl
0,1945-01,1,-3.041609,-3.027403,-2.66234,-0.379269,0.000924,0.735342,0.016454,0.0038,0.024,0.0127,0.0202,0.0077,-0.0051,0.0
1,1945-02,0,-3.096132,-3.036338,-2.711553,-0.384579,0.000655,0.704489,0.014836,0.0038,0.0236,0.0077,0.0198,0.0076,-0.0031,0.0
2,1945-03,1,-3.04379,-3.091042,-2.653829,-0.389961,0.001887,0.767883,0.015963,0.0038,0.0236,0.0021,0.0198,0.0076,-0.0003,0.0
3,1945-04,1,-3.128109,-3.04379,-2.724389,-0.40372,0.001398,0.715063,0.015086,0.0038,0.0228,0.016,0.019,0.0075,-0.0142,0.0
4,1945-05,0,-3.1395,-3.128109,-2.722106,-0.417394,0.000921,0.702911,0.019773,0.0038,0.0226,0.0056,0.0188,0.007,-0.0067,0.005618


### Drop dataframe rows with null values

In [None]:
df.dropna(axis=0, how='any', inplace=True)

### Check if the classes are unbalanced

In [6]:
len(df[df["Returns"] == 0]), len(df[df["Returns"] == 1]) 

(383, 516)

### Create the features database, X

In [7]:
X = df.drop(columns=["Time","Returns"])
X

Unnamed: 0,dp,dy,ep,de,svar,bm,ntis,tbl,lty,ltr,tms,dfy,dfr,infl
0,-3.041609,-3.027403,-2.662340,-0.379269,0.000924,0.735342,0.016454,0.0038,0.0240,0.0127,0.0202,0.0077,-0.0051,0.000000
1,-3.096132,-3.036338,-2.711553,-0.384579,0.000655,0.704489,0.014836,0.0038,0.0236,0.0077,0.0198,0.0076,-0.0031,0.000000
2,-3.043790,-3.091042,-2.653829,-0.389961,0.001887,0.767883,0.015963,0.0038,0.0236,0.0021,0.0198,0.0076,-0.0003,0.000000
3,-3.128109,-3.043790,-2.724389,-0.403720,0.001398,0.715063,0.015086,0.0038,0.0228,0.0160,0.0190,0.0075,-0.0142,0.000000
4,-3.139500,-3.128109,-2.722106,-0.417394,0.000921,0.702911,0.019773,0.0038,0.0226,0.0056,0.0188,0.0070,-0.0067,0.005618
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,-3.966309,-3.953266,-3.098391,-0.867918,0.000594,0.233834,-0.012703,0.0210,0.0206,0.0024,-0.0004,0.0099,0.0060,0.001671
895,-3.941330,-3.959587,-3.086025,-0.855304,0.004318,0.237917,-0.010244,0.0195,0.0163,0.0797,-0.0032,0.0089,-0.0059,-0.000051
896,-3.951689,-3.934654,-3.108987,-0.842702,0.000605,0.233377,-0.010959,0.0189,0.0170,-0.0192,-0.0019,0.0088,0.0002,0.000783
897,-3.965984,-3.945758,-3.112869,-0.853115,0.001510,0.232261,-0.013267,0.0165,0.0171,-0.0052,0.0006,0.0091,0.0058,0.002286


### Create the target column, Y

In [8]:
y = df["Returns"]

### Split the dataset into training and testing

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.05)

### Create variables that will hold the information for the best performing model

In [10]:
global max_score, max_model_name, max_model
max_score = 0
max_model = None
max_model_name = ""

### Create a function that implements the following steps:
* that fits a given model to the data; both passed in as arguments 
* test the training and testing performances of the model
* display the confusion matrix for actual vs predicted for the test dataset
* if the test performance is better than the current best performance, update the information for best performing model

In [11]:
def train_test(model, model_name, X_train, X_test, y_train, y_test):
    
    global max_score, max_model_name, max_model 
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    test_score = model.score(X_test, y_test)
    print("score on test: " + str(test_score))
    print("score on train: "+ str(model.score(X_train, y_train)))
    
    print(confusion_matrix(y_test, y_pred))
    
    if test_score > max_score:
        max_score = test_score
        max_model_name = model_name
        max_model = model
    

### Create a logistic regression model, then call train_test to implement the steps explained above

In [12]:
do_lr = True

if do_lr:

    lr = LogisticRegression()    
    train_test(lr, "Logistic Regression", X_train, X_test, y_train, y_test)

score on test: 0.6444444444444445
score on train: 0.5761124121779859
[[ 1 14]
 [ 2 28]]


### Create a Random Forest Classifier model, then call train_test to implement the steps explained above

In [13]:
do_rf = True

if do_rf:
    
    rf = RandomForestClassifier(n_estimators=300, criterion="gini", max_depth=10, n_jobs=5) 
    train_test(rf, "Random Forest", X_train, X_test, y_train, y_test)

score on test: 0.5555555555555556
score on train: 0.9461358313817331
[[ 4 11]
 [ 9 21]]


### Create a Support Vector - RBF model, then call train_test to implement the steps explained above

In [14]:
do_svc = True

if do_svc:
    
    svc = svm.SVC(kernel="rbf",max_iter=-1,C=10**9, gamma="auto")
    train_test(svc, "Support Vector - RBF", X_train, X_test, y_train, y_test)

score on test: 0.5777777777777777
score on train: 0.6768149882903981
[[ 7  8]
 [11 19]]


### Create a Naive Bayes model, then call train_test to implement the steps explained above

In [15]:
do_bayes = True

if do_bayes:
    scaler = MinMaxScaler()
    fit = scaler.fit(X_train)
    X_train_m = fit.transform(X_train)
    X_test_m = fit.transform(X_test)

    mnb = MultinomialNB()
    train_test(mnb, "Naive Bayes", X_train_m, X_test_m, y_train, y_test)

score on test: 0.6666666666666666
score on train: 0.5690866510538641
[[ 0 15]
 [ 0 30]]


### Create a K Nearest Neighbour model, then call train_test to implement the steps explained above

In [16]:
do_knn = True

if do_knn:

    knn = KNeighborsClassifier(algorithm = 'brute', n_jobs=-1)
    train_test(knn, "K Nearest Neighbour", X_train, X_test, y_train, y_test)

score on test: 0.7111111111111111
score on train: 0.7330210772833724
[[12  3]
 [10 20]]


### Create a Support Vector - Linear model, then call train_test to implement the steps explained above

In [17]:
do_svm = True

if do_svm:
    
    svm=LinearSVC(C=0.0001)
    train_test(svm, "Support Vector - Linear", X_train, X_test, y_train, y_test)

score on test: 0.6666666666666666
score on train: 0.5690866510538641
[[ 0 15]
 [ 0 30]]


### Create a Decision Trees model, then call train_test to implement the steps explained above

In [18]:
do_dt = True

if do_dt:
    
    clf = DecisionTreeClassifier()
    train_test(clf, "Decision Trees",X_train, X_test, y_train, y_test)

score on test: 0.7555555555555555
score on train: 1.0
[[ 8  7]
 [ 4 26]]


### Create a Bagging Classifier model, then call train_test to implement the steps explained above

In [19]:
do_bg = True

if do_bg:
    
    bg=BaggingClassifier(DecisionTreeClassifier(),max_samples=0.5,max_features=1.0,n_estimators=1000)
    train_test(bg, "Bagging", X_train, X_test, y_train, y_test)

score on test: 0.5111111111111111
score on train: 0.9836065573770492
[[ 3 12]
 [10 20]]


### Create a AdaBoost Classifier model, then call train_test to implement the steps explained above

In [20]:
do_ab = True

if do_ab:
    
    adb = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=10,max_depth=4),n_estimators=1000,learning_rate=0.6)
    train_test(bg, "AdaBoost", X_train, X_test, y_train, y_test)

score on test: 0.4888888888888889
score on train: 0.990632318501171
[[ 3 12]
 [11 19]]


### Create a Tensorflow Neural Network model, then call train_test to implement the steps explained above

In [21]:
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras.layers import Dropout

nn = True

if nn:
    # X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.05)
    x_partial_train, x_validation, y_partial_train, y_validation = train_test_split(X_train, y_train, test_size=0.3)
    model=models.Sequential()
    model.add(layers.Dense(4096,activation='relu',input_shape=(14,)))
    model.add(Dropout(0.2))
    model.add(layers.Dense(2048,activation='relu'))
    model.add(Dropout(0.2))
    model.add(layers.Dense(1024,activation='relu'))
    model.add(Dropout(0.2))
    model.add(layers.Dense(512,activation='relu'))
    model.add(Dropout(0.2))
    model.add(layers.Dense(256,activation='relu'))
    model.add(Dropout(0.2))
    model.add(layers.Dense(128,activation='relu'))
    model.add(Dropout(0.2))
    model.add(layers.Dense(64,activation='relu'))
    model.add(Dropout(0.2))
    model.add(layers.Dense(32,activation='relu'))
    model.add(Dropout(0.2))
    model.add(layers.Dense(16,activation='relu'))
    model.add(Dropout(0.2))
    model.add(layers.Dense(1,activation='sigmoid'))

    sgd = optimizers.SGD(lr=0.01)

    model.compile(optimizer=sgd,loss='binary_crossentropy',metrics=['accuracy'])  # rmsprop
    
    model.fit(x_partial_train,y_partial_train,epochs=150,validation_data=(x_validation,y_validation))
    print("score on test: " + str(model.evaluate(X_test,y_test)[1]))

    y_pred = model.predict(X_test)
    y_pred = [0 if y < 0.5 else 1 for y in y_pred]
    
    print(sum(y_pred == y_test))
    print(len(y_test))
    
    y_test.reset_index(drop=True,inplace=True)
    print(confusion_matrix(y_test, y_pred))
    
    test_score = sum(y_pred == y_test)/len(y_test)
    
    if test_score > max_score:
        print("NN")
        max_score = test_score
        max_model_name = "Neural Network"
        max_model = model

2021-10-28 10:32:32.832113: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-10-28 10:32:32.832242: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-10-28 10:32:32.832438: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2021-10-28 10:32:32.952911: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-10-28 10:32:32.971808: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2299965000 Hz


Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150
score on test: 0.6666666865348816
30
45
[[ 0 15]
 [ 0 30]]


### Save the mode so it can be retrieved by the prediction service

In [22]:
filename = "predict_now.sav"

with open(filename,'wb') as f:
    pickle.dump(max_model, f)

In [24]:
max_model_name, max_score

('Decision Trees', 0.7555555555555555)