In [438]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [439]:
df = pd.read_csv('gold_posneg.csv', index_col="Date", infer_datetime_format=True, parse_dates=True)
df = df.dropna()
df = df.drop(columns=[])
df

Unnamed: 0_level_0,High,Low,High/Close,Low/Close,Spread,Spread Change,Volume,Volume Diff,Volume change,Return ewm,...,Gold Close,gld Return,CAD Close,Cad Return,TSX Close,Tsx Return,Return,Bar Close,Day of the Week,Pos
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-06,37.424048,36.465993,0.013201,-0.012737,0.958055,0.204300,2944700.0,-51300.0,-0.017123,0.014154,...,1135.900024,0.015920,1.03190,-0.007311,11944.500000,0.002772,0.016478,36.936466,2.0,1.0
2010-01-07,36.782484,36.226468,0.008443,-0.006801,0.556016,-0.419641,2420100.0,-524600.0,-0.178151,-0.003619,...,1133.099976,-0.002465,1.03400,0.002035,11887.500000,0.000258,-0.012506,36.474537,3.0,1.0
2010-01-08,36.842383,36.089624,0.007721,-0.012869,0.752759,0.353845,2648000.0,227900.0,0.094170,0.000357,...,1138.199951,0.004501,1.02980,-0.004062,11953.799805,0.002031,0.002346,36.560101,4.0,0.0
2010-01-11,37.637908,36.534432,0.023256,-0.006744,1.103476,0.465908,2363100.0,-284900.0,-0.107591,0.004175,...,1150.699951,0.010982,1.03380,0.003884,11947.099609,0.001167,0.006083,36.782501,0.0,0.0
2010-01-12,36.825265,35.525046,0.026956,-0.009303,1.300219,0.178294,3054900.0,691800.0,0.292751,-0.015353,...,1128.900024,-0.018945,1.03920,0.005223,11820.200195,-0.002763,-0.025116,35.858654,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-01-13,30.389999,29.940001,0.014691,-0.000334,0.449999,-0.274196,3180400.0,-1345000.0,-0.297211,-0.002230,...,1853.599976,0.005806,1.27077,-0.004988,17934.699219,0.000252,0.000000,29.950001,2.0,0.0
2021-01-14,30.320000,29.760000,0.013708,-0.005015,0.559999,0.244446,3226400.0,46000.0,0.014464,-0.001634,...,1850.300049,-0.001780,1.26981,-0.000755,17958.099609,0.000603,-0.001336,29.910000,3.0,0.0
2021-01-15,30.160000,29.430000,0.023761,-0.001018,0.730000,0.303572,3259100.0,32700.0,0.010135,-0.010575,...,1829.300049,-0.011350,1.26477,-0.003969,17909.000000,-0.000509,-0.015045,29.459999,4.0,1.0
2021-01-19,29.889999,29.459999,0.002684,-0.011741,0.430000,0.303032,3225700.0,2411300.0,2.960830,0.004762,...,1839.500000,0.005576,1.27480,-0.001308,17957.400391,0.000451,0.007435,29.809999,1.0,0.0


In [440]:
X = df[["Return ewm", "Return", "Tsx Return", "Low/Close", "gld Return", "Spread Change"]]
X.head()

Unnamed: 0_level_0,Return ewm,Return,Tsx Return,Low/Close,gld Return,Spread Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-06,0.014154,0.016478,0.002772,-0.012737,0.01592,0.2043
2010-01-07,-0.003619,-0.012506,0.000258,-0.006801,-0.002465,-0.419641
2010-01-08,0.000357,0.002346,0.002031,-0.012869,0.004501,0.353845
2010-01-11,0.004175,0.006083,0.001167,-0.006744,0.010982,0.465908
2010-01-12,-0.015353,-0.025116,-0.002763,-0.009303,-0.018945,0.178294


In [441]:
y = df["Pos"].values.reshape(-1, 1)
y[:5]


array([[1.],
       [1.],
       [0.],
       [0.],
       [0.]])

In [442]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [443]:
scaler = StandardScaler()

x_train_scaler = scaler.fit(x_train)
x_test_scaler = scaler.fit(x_test)

x_train = x_train_scaler.transform(x_train)
x_test = x_test_scaler.transform(x_test)


In [458]:
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=30,
                                            learning_rate=learning_rate,
                                            max_features=6,
                                            max_depth=4,
                                            random_state=42)

    # Fit the model
    classifier.fit(x_train, y_train.ravel())
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            x_train,
            y_train.ravel())))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            x_test,
            y_test.ravel())))
    print()

Learning rate:  0.05
Accuracy score (training): 0.792
Accuracy score (validation): 0.739

Learning rate:  0.1
Accuracy score (training): 0.841
Accuracy score (validation): 0.749

Learning rate:  0.25
Accuracy score (training): 0.882
Accuracy score (validation): 0.756

Learning rate:  0.5
Accuracy score (training): 0.929
Accuracy score (validation): 0.729

Learning rate:  0.75
Accuracy score (training): 0.941
Accuracy score (validation): 0.732

Learning rate:  1
Accuracy score (training): 0.964
Accuracy score (validation): 0.696



In [460]:
classifier = GradientBoostingClassifier(n_estimators=30,
                                        learning_rate=0.25,
                                        max_features=6,
                                        max_depth=4,
                                        random_state=0)

classifier.fit(x_train, y_train.ravel())

predictions = classifier.predict(x_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test.ravel()}).head(20)

Unnamed: 0,Prediction,Actual
0,0.0,0.0
1,1.0,1.0
2,0.0,0.0
3,0.0,1.0
4,1.0,0.0
5,0.0,0.0
6,0.0,0.0
7,1.0,1.0
8,0.0,0.0
9,0.0,1.0


In [461]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

acc_score = accuracy_score(y_test, predictions)


In [462]:
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,343,80
Actual 1,116,270


Accuracy Score : 0.757725587144623
Classification Report
              precision    recall  f1-score   support

         0.0       0.75      0.81      0.78       423
         1.0       0.77      0.70      0.73       386

    accuracy                           0.76       809
   macro avg       0.76      0.76      0.76       809
weighted avg       0.76      0.76      0.76       809



In [463]:
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted

[(0.24647248012168255, 'Return ewm'),
 (0.23722369625159054, 'Return'),
 (0.1307386870736431, 'Spread Change'),
 (0.13050780538661347, 'Tsx Return'),
 (0.12773981572158374, 'Low/Close'),
 (0.1273175154448865, 'gld Return')]