In [13]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.preprocessing import StandardScaler

import pickle


In [14]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
df = pd.read_csv('StockAIClean3.csv')
# Review the DataFrame
df.head()

Unnamed: 0,No.,Ticker,Company,Sector,Mkt Cap Cat,For P/E Cat,Dividend,EPS growth this year (%) Cat,EPS growth next year (%) Cat,EPS growth past 5 years (%) Cat,...,Float Short (%) Cat,Profit Margin (%) Cat,Performance (Year) (%) Cat,Employees Cat,Analyst Rec Cat,Team Rec Cat,Risk,Volume Cat,Var % Cat,Var%
0,1,AAPL,Apple Inc.,Technology,Mg Cap,Med,Yes,Yes,Yes,Yes,...,Lo,Pos,Up,Hi,Buy,Buy,Lo,Hi,Neg,-1.2
1,2,ACN,Accenture plc,Technology,Lg Cap,Med,Yes,Yes,Yes,Yes,...,Lo,Pos,Up,Hi,Buy,Buy,Lo,Med,Pos,5.3
2,3,ADBE,Adobe Inc.,Technology,Lg Cap,Hi,No,Yes,Yes,Yes,...,Lo,Pos,Up,Med,Buy,Buy,Lo,Med,Neg,-0.3
3,4,ADSK,"Autodesk, Inc.",Technology,Lg Cap,Med,No,Yes,Yes,Yes,...,Lo,Pos,Down,Med,Buy,Buy,Lo,Lo,Pos,9.1
4,5,AI,"C3.ai, Inc.",Technology,Mid Cap,Hi,No,No,Yes,No,...,Hi,Neg,Up,Low,Hold,Hold,Med,Hi,Neg,-36.0


BELOW MODEL WITH 6 TOP PARAMETERS PER FEATURE IMPORTANCE

In [15]:
# Based on feature importance results, we will only use top columns for our model
df2 = df[[
    'Performance (Year) (%) Cat',
    'Risk',
    'Sales Cat',
    'Employees Cat',
    'For P/E Cat',
    'Team Rec Cat'
]]
df2.head()

Unnamed: 0,Performance (Year) (%) Cat,Risk,Sales Cat,Employees Cat,For P/E Cat,Team Rec Cat
0,Up,Lo,Hi,Hi,Med,Buy
1,Up,Lo,Hi,Hi,Med,Buy
2,Up,Lo,Med,Med,Hi,Buy
3,Down,Lo,Med,Med,Med,Buy
4,Up,Med,Lo,Low,Hi,Hold


In [16]:
stock_df2 = pd.get_dummies(df2, columns=[ 
    'Performance (Year) (%) Cat',
    'Risk',
    'Sales Cat',
    'Employees Cat',
    'For P/E Cat',
    ])
stock_df2

Unnamed: 0,Team Rec Cat,Performance (Year) (%) Cat_Down,Performance (Year) (%) Cat_Up,Risk_Hi,Risk_Lo,Risk_Med,Sales Cat_Hi,Sales Cat_Lo,Sales Cat_Med,Employees Cat_Hi,Employees Cat_Low,Employees Cat_Med,For P/E Cat_Hi,For P/E Cat_Low,For P/E Cat_Med
0,Buy,0,1,0,1,0,1,0,0,1,0,0,0,0,1
1,Buy,0,1,0,1,0,1,0,0,1,0,0,0,0,1
2,Buy,0,1,0,1,0,0,0,1,0,0,1,1,0,0
3,Buy,1,0,0,1,0,0,0,1,0,0,1,0,0,1
4,Hold,0,1,0,0,1,0,1,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,Hold,0,1,0,0,1,1,0,0,0,0,1,1,0,0
68,Buy,0,1,0,1,0,0,1,0,0,1,0,1,0,0
69,Sell,1,0,1,0,0,0,1,0,0,1,0,0,1,0
70,Hold,0,1,0,0,1,0,0,1,0,0,1,1,0,0


In [17]:
# Split the data into features (X) and target (y)
X = stock_df2.drop('Team Rec Cat', axis=1)
y = stock_df2['Team Rec Cat']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [18]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [19]:
y_pred = rf.predict(X_test)
rf_result = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
rf_result

Unnamed: 0,Prediction,Actual
0,Buy,Buy
1,Hold,Hold
2,Hold,Hold
3,Buy,Buy
4,Sell,Sell
5,Buy,Buy
6,Buy,Buy
7,Hold,Hold
8,Hold,Hold
9,Buy,Buy


In [20]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [21]:
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2"], columns=["Predicted 0", "Predicted 1", "Prediction 2"]
)
# Displaying results
print("Confusion Matrix")
display(cm_df)
print("Classification Report")
print(classification_report(y_test, y_pred))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Prediction 2
Actual 0,7,0,0
Actual 1,0,7,0
Actual 2,0,0,4


Classification Report
              precision    recall  f1-score   support

         Buy       1.00      1.00      1.00         7
        Hold       1.00      1.00      1.00         7
        Sell       1.00      1.00      1.00         4

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18



In [22]:
# # save the model to curent directory
# filename = 'rf_model.pkl'
# pickle.dump(rf, open(filename, 'wb'))

In [23]:
# # load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, y_test)
# loaded_model

In [24]:
import pickle

with open("rf_model.pkl", "wb") as f:
    pickle.dump(rf,f)