In [2]:
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import joblib
import datetime
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, RocCurveDisplay


# Feature set uses the periods m ∈ {1, 2, ..., 20} ∪ {40, 60, ..., 240}

In [4]:
def df_cum_process(df, n_returns=240, n_daily=20, window=20):
    """Compress the leading return columns of ``df`` into a smaller feature set.

    The first ``n_returns`` columns of ``df`` are treated (by position) as the
    return columns. The result keeps the first ``n_daily`` of them unchanged
    and replaces every following block of ``window`` columns by its row-wise
    sum, stored as ``Cumulative_Return_<end>`` where ``<end>`` is the
    (exclusive) end position of the block. With the defaults this yields
    20 + 11 = 31 feature columns named ``..., Cumulative_Return_40, ...,
    Cumulative_Return_240``. All columns of ``df`` from position ``n_returns``
    onwards are appended unchanged after the features.

    NOTE(review): each ``Cumulative_Return_<end>`` sums only the ``window``
    columns at positions ``[end - window, end)``, not all columns up to
    ``end`` -- confirm this matches the intended feature definition.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame with at least ``n_returns`` columns. It is not modified.
    n_returns : int, default 240
        Number of leading columns that hold returns.
    n_daily : int, default 20
        Number of leading return columns kept as individual features.
    window : int, default 20
        Width of each summed block.

    Returns
    -------
    pandas.DataFrame
        ``n_daily`` original columns, then one block-sum column per window
        (labels of these feature columns are cast to ``str``), then the
        remaining columns of ``df`` with their original labels.

    Raises
    ------
    ValueError
        If the window parameters are inconsistent or ``df`` has fewer than
        ``n_returns`` columns.
    """
    if n_daily < 0 or window <= 0 or n_returns < n_daily:
        raise ValueError(
            "expected 0 <= n_daily <= n_returns and window > 0, got "
            f"n_returns={n_returns}, n_daily={n_daily}, window={window}"
        )
    if (n_returns - n_daily) % window != 0:
        raise ValueError(
            f"(n_returns - n_daily) = {n_returns - n_daily} is not a multiple "
            f"of window = {window}"
        )
    if df.shape[1] < n_returns:
        raise ValueError(
            f"df has {df.shape[1]} columns but n_returns = {n_returns}"
        )

    returns = df.iloc[:, :n_returns]

    # Select by position (not by label) so repeated column labels cannot make
    # the kept columns disappear; the explicit copy keeps ``df`` untouched.
    x = returns.iloc[:, :n_daily].copy()
    for end in range(n_daily + window, n_returns + 1, window):
        x[f'Cumulative_Return_{end}'] = returns.iloc[:, end - window:end].sum(axis=1)

    # Feature labels may be a mix of ints and strings; normalise them to str.
    x.columns = x.columns.astype(str)

    # Everything after the return columns is passed through unchanged.
    y = df.iloc[:, n_returns:]
    return pd.concat([x, y], axis=1)

In [5]:
# Number of leading columns used as model inputs. df_cum_process (default
# layout) emits 20 single-period columns followed by 11 block sums, i.e. 31.
features = 31
# NOTE(review): the columns after the inputs appear to be
# target, ticker, target_date, sector (the label is read by position
# `features` below and by the name 'target' for the output) -- confirm.

# NOTE(review): these lists are never filled in the visible code; they are
# kept only in case another cell still refers to them.
training_data = []
training_label = []
testing_data = []
testing_label = []

# Candidate hyper-parameter values for the randomized search.
n_estimators = [10, 100, 500, 1000]
max_features = ['sqrt', 'log2']
max_depth = [3, 5, 7, 10, 50, None]
bootstrap = [True, False]
criterion = ['gini', 'entropy']

# Loop-invariant settings: input/output locations and the search space.
path = '/SP500/data/'
pred_path = '/SP500/3_RF/rf_pred/'
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'criterion': criterion,
               'bootstrap': bootstrap}

separator = "-" * 103

for i in range(5):
    # Read one train/test split, drop incomplete rows, build the features.
    train = pd.read_csv(f'{path}Set_{i}_Train.csv', index_col=0).dropna()
    train = df_cum_process(train)
    test = pd.read_csv(f'{path}Set_{i}_Test.csv', index_col=0).dropna()
    test = df_cum_process(test)

    # Inputs are the first `features` columns; the label is the next column.
    x_train = train.iloc[:, :features]
    y_train = train.iloc[:, features]
    x_test = test.iloc[:, :features]
    y_test = test.iloc[:, features]

    print(x_train.shape)
    print(y_train.shape)
    print(x_test.shape)
    print(y_test.shape)

    print('completed' + str(i))

    print(separator)
    print("Training the model for Training Set " + str(i) + " from " +
          datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    print(separator)

    print(random_grid)

    # Randomized search over the grid: 30 sampled candidates, 3-fold CV,
    # scored by accuracy; refit=True retrains the best candidate on all of
    # x_train so best_estimator_ is ready for prediction.
    rf = RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=0)
    rf_random = RandomizedSearchCV(rf, random_grid, n_iter=30, scoring='accuracy', n_jobs=-1, cv=3,
                                   random_state=0, verbose=1, refit=True)
    rf_random.fit(x_train, y_train)
    print(rf_random.best_params_)

    # Evaluate the refitted best model on the held-out split.
    rf = rf_random.best_estimator_
    y_pred = rf.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Persist first, then report, so the message is only shown on success.
    joblib.dump(rf, "random_forest_" + str(i) + ".joblib")
    print("Model Saved")

    print("Accuracy for round ", i, ": ", accuracy)

    # Store the second column of predict_proba (index 1) together with the
    # identifying columns of each test row.
    predict_prob = rf.predict_proba(x_test)
    pred = predict_prob[:, 1]
    output_data = pd.DataFrame({'y_prob': pred, 'y_true': test['target'], 'Ticker': test['ticker'],
                                'Date': test['target_date'], 'Sector': test['sector'], })
    output_data.to_csv(f'{pred_path}rf_pred_{i}.csv')
    print('Completed')

(245464, 31)
(245464,)
(120328, 31)
(120328,)
completed0
-------------------------------------------------------------------------------------------------------
Training the model for Training Set 0 from 2024-03-26 12:24:32
-------------------------------------------------------------------------------------------------------
{'n_estimators': [10, 100, 500, 1000], 'max_features': ['sqrt', 'log2'], 'max_depth': [3, 5, 7, 10, 50, None], 'criterion': ['gini', 'entropy'], 'bootstrap': [True, False]}
Fitting 3 folds for each of 30 candidates, totalling 90 fits




{'n_estimators': 1000, 'max_features': 'log2', 'max_depth': None, 'criterion': 'entropy', 'bootstrap': False}
Model Saved
Accuracy for round  0 :  0.5025762914699821
Completed
(245171, 31)
(245171,)
(120080, 31)
(120080,)
completed1
-------------------------------------------------------------------------------------------------------
Training the model for Training Set 1 from 2024-03-26 13:03:25
-------------------------------------------------------------------------------------------------------
{'n_estimators': [10, 100, 500, 1000], 'max_features': ['sqrt', 'log2'], 'max_depth': [3, 5, 7, 10, 50, None], 'criterion': ['gini', 'entropy'], 'bootstrap': [True, False]}
Fitting 3 folds for each of 30 candidates, totalling 90 fits




{'n_estimators': 1000, 'max_features': 'log2', 'max_depth': None, 'criterion': 'entropy', 'bootstrap': False}
Model Saved
Accuracy for round  1 :  0.5026315789473684
Completed
(245224, 31)
(245224,)
(120274, 31)
(120274,)
completed2
-------------------------------------------------------------------------------------------------------
Training the model for Training Set 2 from 2024-03-26 13:46:16
-------------------------------------------------------------------------------------------------------
{'n_estimators': [10, 100, 500, 1000], 'max_features': ['sqrt', 'log2'], 'max_depth': [3, 5, 7, 10, 50, None], 'criterion': ['gini', 'entropy'], 'bootstrap': [True, False]}
Fitting 3 folds for each of 30 candidates, totalling 90 fits




{'n_estimators': 1000, 'max_features': 'log2', 'max_depth': None, 'criterion': 'entropy', 'bootstrap': False}
Model Saved
Accuracy for round  2 :  0.5024693616242912
Completed
(245140, 31)
(245140,)
(119243, 31)
(119243,)
completed3
-------------------------------------------------------------------------------------------------------
Training the model for Training Set 3 from 2024-03-26 14:26:25
-------------------------------------------------------------------------------------------------------
{'n_estimators': [10, 100, 500, 1000], 'max_features': ['sqrt', 'log2'], 'max_depth': [3, 5, 7, 10, 50, None], 'criterion': ['gini', 'entropy'], 'bootstrap': [True, False]}
Fitting 3 folds for each of 30 candidates, totalling 90 fits




{'n_estimators': 1000, 'max_features': 'sqrt', 'max_depth': None, 'criterion': 'gini', 'bootstrap': False}
Model Saved
Accuracy for round  3 :  0.5005073673087729
Completed
(244381, 31)
(244381,)
(119325, 31)
(119325,)
completed4
-------------------------------------------------------------------------------------------------------
Training the model for Training Set 4 from 2024-03-26 15:05:38
-------------------------------------------------------------------------------------------------------
{'n_estimators': [10, 100, 500, 1000], 'max_features': ['sqrt', 'log2'], 'max_depth': [3, 5, 7, 10, 50, None], 'criterion': ['gini', 'entropy'], 'bootstrap': [True, False]}
Fitting 3 folds for each of 30 candidates, totalling 90 fits




{'n_estimators': 1000, 'max_features': 'log2', 'max_depth': None, 'criterion': 'entropy', 'bootstrap': False}
Model Saved
Accuracy for round  4 :  0.500322648229625
Completed


In [6]:
# Importance scores of the forest currently bound to `rf`; the training loop
# rebinds `rf` every round, so this is whichever model was assigned last.
feature_importances = rf.feature_importances_

# Pair every score with the name of the input column it belongs to.
column_names = x_train.columns
feature_importance_dict = {
    name: score for name, score in zip(column_names, feature_importances)
}

# Rank the (name, score) pairs from the highest score to the lowest.
sorted_feature_importance = sorted(
    feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# One line per feature, score shown with six decimals.
for feature, importance in sorted_feature_importance:
    print("{}: {:.6f}".format(feature, importance))

Cumulative_Return_240: 0.032826
Cumulative_Return_180: 0.032576
Cumulative_Return_160: 0.032424
Cumulative_Return_140: 0.032403
Cumulative_Return_200: 0.032377
Cumulative_Return_220: 0.032365
Normalized_Price_Return.13: 0.032331
Normalized_Price_Return.18: 0.032307
Normalized_Price_Return: 0.032286
Normalized_Price_Return.11: 0.032280
Cumulative_Return_100: 0.032266
Normalized_Price_Return.4: 0.032264
Normalized_Price_Return.17: 0.032263
Cumulative_Return_120: 0.032258
Normalized_Price_Return.3: 0.032231
Normalized_Price_Return.10: 0.032207
Normalized_Price_Return.2: 0.032203
Normalized_Price_Return.5: 0.032198
Normalized_Price_Return.9: 0.032187
Normalized_Price_Return.1: 0.032184
Normalized_Price_Return.19: 0.032177
Normalized_Price_Return.7: 0.032164
Normalized_Price_Return.6: 0.032162
Normalized_Price_Return.14: 0.032151
Normalized_Price_Return.15: 0.032142
Cumulative_Return_60: 0.032139
Normalized_Price_Return.8: 0.032137
Cumulative_Return_40: 0.032136
Normalized_Price_Return.16: 