In [1]:
import pandas as pd
from pathlib import Path
import sklearn as skl
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from collections import Counter
import pickle

In [2]:
stock_data = Path('StockAIClean2.csv')
stock_df = pd.read_csv(stock_data)
stock_df.head()

Unnamed: 0,No.,Ticker,Company,Sector,Mkt Cap Cat,For P/E Cat,Dividend,EPS growth this year (%) Cat,EPS growth next year (%) Cat,EPS growth past 5 years (%) Cat,...,Sales growth past 5 years (%) Cat,Sales Cat,Float Short (%) Cat,Profit Margin (%) Cat,Performance (Year) (%) Cat,Employees Cat,Analyst Rec Cat,Volume Cat,Var % Cat,Var%
0,1,AAPL,Apple Inc.,Technology,Mg Cap,Med,Yes,Yes,Yes,Yes,...,Yes,Hi,Lo,Pos,Up,Hi,Buy,Hi,Neg,-1.2
1,2,ACN,Accenture plc,Technology,Lg Cap,Med,Yes,Yes,Yes,Yes,...,Yes,Hi,Lo,Pos,Up,Hi,Buy,Med,Pos,5.3
2,3,ADBE,Adobe Inc.,Technology,Lg Cap,Hi,No,Yes,Yes,Yes,...,Yes,Med,Lo,Pos,Up,Med,Buy,Med,Neg,-0.3
3,4,ADSK,"Autodesk, Inc.",Technology,Lg Cap,Med,No,Yes,Yes,Yes,...,Yes,Med,Lo,Pos,Down,Med,Buy,Lo,Pos,9.1
4,5,AI,"C3.ai, Inc.",Technology,Mid Cap,Hi,No,No,Yes,No,...,Yes,Lo,Hi,Neg,Up,Sm,Hold,Hi,Neg,-36.0


In [3]:
stock_df1 = stock_df.drop(columns=['No.', 'Ticker', 'Company', 'Sector'])
stock_df1.head()

Unnamed: 0,Mkt Cap Cat,For P/E Cat,Dividend,EPS growth this year (%) Cat,EPS growth next year (%) Cat,EPS growth past 5 years (%) Cat,EPS growth next 5 years (%) Cat,Sales growth past 5 years (%) Cat,Sales Cat,Float Short (%) Cat,Profit Margin (%) Cat,Performance (Year) (%) Cat,Employees Cat,Analyst Rec Cat,Volume Cat,Var % Cat,Var%
0,Mg Cap,Med,Yes,Yes,Yes,Yes,Yes,Yes,Hi,Lo,Pos,Up,Hi,Buy,Hi,Neg,-1.2
1,Lg Cap,Med,Yes,Yes,Yes,Yes,Yes,Yes,Hi,Lo,Pos,Up,Hi,Buy,Med,Pos,5.3
2,Lg Cap,Hi,No,Yes,Yes,Yes,Yes,Yes,Med,Lo,Pos,Up,Med,Buy,Med,Neg,-0.3
3,Lg Cap,Med,No,Yes,Yes,Yes,Yes,Yes,Med,Lo,Pos,Down,Med,Buy,Lo,Pos,9.1
4,Mid Cap,Hi,No,No,Yes,No,Yes,Yes,Lo,Hi,Neg,Up,Sm,Hold,Hi,Neg,-36.0


In [4]:
stock_df2 = pd.get_dummies(stock_df1, columns=["Mkt Cap Cat", "For P/E Cat", "Dividend", "EPS growth this year (%) Cat", "EPS growth next year (%) Cat", "EPS growth past 5 years (%) Cat", "EPS growth next 5 years (%) Cat", "Sales growth past 5 years (%) Cat", "Sales Cat", "Float Short (%) Cat", "Profit Margin (%) Cat", "Performance (Year) (%) Cat", "Employees Cat", "Volume Cat", "Var % Cat"])
stock_df2

Unnamed: 0,Analyst Rec Cat,Var%,Mkt Cap Cat_Lg Cap,Mkt Cap Cat_Mg Cap,Mkt Cap Cat_Mic Cap,Mkt Cap Cat_Mid Cap,Mkt Cap Cat_Sm Cap,For P/E Cat_Hi,For P/E Cat_Low,For P/E Cat_Med,...,Performance (Year) (%) Cat_Down,Performance (Year) (%) Cat_Up,Employees Cat_Hi,Employees Cat_Med,Employees Cat_Sm,Volume Cat_Hi,Volume Cat_Lo,Volume Cat_Med,Var % Cat_Neg,Var % Cat_Pos
0,Buy,-1.2,0,1,0,0,0,0,0,1,...,0,1,1,0,0,1,0,0,1,0
1,Buy,5.3,1,0,0,0,0,0,0,1,...,0,1,1,0,0,0,0,1,0,1
2,Buy,-0.3,1,0,0,0,0,1,0,0,...,0,1,0,1,0,0,0,1,1,0
3,Buy,9.1,1,0,0,0,0,0,0,1,...,1,0,0,1,0,0,1,0,0,1
4,Hold,-36.0,0,0,0,1,0,1,0,0,...,0,1,0,0,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,Buy,12.7,1,0,0,0,0,1,0,0,...,0,1,0,1,0,1,0,0,0,1
68,Hold,-68.2,0,0,0,1,0,1,0,0,...,0,1,0,0,1,1,0,0,1,0
69,Hold,5.8,0,0,1,0,0,0,1,0,...,1,0,0,0,1,0,1,0,0,1
70,Buy,1.6,1,0,0,0,0,1,0,0,...,0,1,0,1,0,0,0,1,0,1


In [5]:
X = stock_df2.drop(columns=['Analyst Rec Cat'])
y = stock_df2['Analyst Rec Cat']

In [6]:
X.columns

Index(['Var%', 'Mkt Cap Cat_Lg Cap', 'Mkt Cap Cat_Mg Cap',
       'Mkt Cap Cat_Mic Cap', 'Mkt Cap Cat_Mid Cap', 'Mkt Cap Cat_Sm Cap',
       'For P/E Cat_Hi', 'For P/E Cat_Low', 'For P/E Cat_Med', 'Dividend_No',
       'Dividend_Yes', 'EPS growth this year (%) Cat_No',
       'EPS growth this year (%) Cat_Yes', 'EPS growth next year (%) Cat_No',
       'EPS growth next year (%) Cat_Yes',
       'EPS growth past 5 years (%) Cat_No',
       'EPS growth past 5 years (%) Cat_Yes',
       'EPS growth next 5 years (%) Cat_No',
       'EPS growth next 5 years (%) Cat_Yes',
       'Sales growth past 5 years (%) Cat_No',
       'Sales growth past 5 years (%) Cat_Yes', 'Sales Cat_Hi', 'Sales Cat_Lo',
       'Sales Cat_Med', 'Float Short (%) Cat_Hi', 'Float Short (%) Cat_Lo',
       'Float Short (%) Cat_Med', 'Profit Margin (%) Cat_Neg',
       'Profit Margin (%) Cat_Pos', 'Performance (Year) (%) Cat_Down',
       'Performance (Year) (%) Cat_Up', 'Employees Cat_Hi',
       'Employees Cat_Med', 'E

In [7]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1, stratify=y)

In [8]:
# create a scaler instance
scaler = StandardScaler()

In [9]:
# fit/train the scaler
X_scaler = scaler.fit(X_train)

In [10]:
# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [11]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [12]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)


In [13]:
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.9629629629629629
Testing Data Score: 0.8333333333333334




In [14]:
predictions = model.predict(X_test_scaled)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,Buy,Buy
1,Buy,Hold
2,Hold,Buy
3,Buy,Buy
4,Buy,Buy
5,Buy,Buy
6,Buy,Buy
7,Buy,Buy
8,Buy,Buy
9,Buy,Hold


In [15]:
# Display the accuracy score for the test dataset.
score = balanced_accuracy_score(y_test, predictions)
print('Accuracy score: ', score)

Accuracy score:  0.4666666666666667


In [16]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,14,1
Actual 1,3,0


In [17]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         Buy       0.82      0.93      0.87        15
        Hold       0.00      0.00      0.00         3

    accuracy                           0.78        18
   macro avg       0.41      0.47      0.44        18
weighted avg       0.69      0.78      0.73        18



In [18]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [19]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [20]:
# Making predictions using the testing data
rf_predictions = rf_model.predict(X_test_scaled)
rf_results = pd.DataFrame({"Prediction": rf_predictions, "Actual": y_test}).reset_index(drop=True)
rf_results

Unnamed: 0,Prediction,Actual
0,Buy,Buy
1,Buy,Hold
2,Buy,Buy
3,Buy,Buy
4,Buy,Buy
5,Buy,Buy
6,Buy,Buy
7,Buy,Buy
8,Buy,Buy
9,Buy,Hold


In [21]:
# Display the accuracy score for the test dataset.
rf_score = balanced_accuracy_score(y_test, rf_predictions)
print('Accuracy score: ', rf_score)

Accuracy score:  0.5


In [22]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, rf_predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,15,0
Actual 1,3,0


In [23]:
print(classification_report(y_test, rf_predictions))

              precision    recall  f1-score   support

         Buy       0.83      1.00      0.91        15
        Hold       0.00      0.00      0.00         3

    accuracy                           0.83        18
   macro avg       0.42      0.50      0.45        18
weighted avg       0.69      0.83      0.76        18



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resample, y_resample = ros.fit_resample(X_train, y_train)

Counter(y_resample)
y_resample.value_counts()


Buy     47
Hold    47
Name: Analyst Rec Cat, dtype: int64

In [25]:
# Creating the decision tree classifier instance
model1 = tree.DecisionTreeClassifier()
model1 = model1.fit(X_train_scaled, y_train)


In [26]:
predictions1 = model.predict(X_test_scaled)
results1 = pd.DataFrame({"Prediction": predictions1, "Actual": y_test}).reset_index(drop=True)
results1.head(10)

Unnamed: 0,Prediction,Actual
0,Buy,Buy
1,Buy,Hold
2,Hold,Buy
3,Buy,Buy
4,Buy,Buy
5,Buy,Buy
6,Buy,Buy
7,Buy,Buy
8,Buy,Buy
9,Buy,Hold


In [27]:
# Display the accuracy score for the test dataset.
score1 = balanced_accuracy_score(y_test, predictions1)
print('Accuracy score: ', score1)

Accuracy score:  0.4666666666666667


In [28]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, predictions1)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,14,1
Actual 1,3,0


In [29]:
print(classification_report(y_test, predictions1))

              precision    recall  f1-score   support

         Buy       0.82      0.93      0.87        15
        Hold       0.00      0.00      0.00         3

    accuracy                           0.78        18
   macro avg       0.41      0.47      0.44        18
weighted avg       0.69      0.78      0.73        18



In [30]:
# Create a random forest classifier
rf_model1 = RandomForestClassifier(n_estimators=500, random_state=78)

In [31]:
# Fitting the model
rf_model1 = rf_model1.fit(X_train_scaled, y_train)


In [32]:
# Making predictions using the testing data
rf_predictions1 = rf_model1.predict(X_test_scaled)
rf_results1 = pd.DataFrame({"Prediction": rf_predictions1, "Actual": y_test}).reset_index(drop=True)
rf_results1

Unnamed: 0,Prediction,Actual
0,Buy,Buy
1,Buy,Hold
2,Buy,Buy
3,Buy,Buy
4,Buy,Buy
5,Buy,Buy
6,Buy,Buy
7,Buy,Buy
8,Buy,Buy
9,Buy,Hold


In [33]:
# Display the accuracy score for the test dataset.
rf_score1 = balanced_accuracy_score(y_test, rf_predictions1)
print('Accuracy score: ', rf_score1)

Accuracy score:  0.5


In [34]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, rf_predictions1)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,15,0
Actual 1,3,0


In [35]:
print(classification_report(y_test, rf_predictions1))

              precision    recall  f1-score   support

         Buy       0.83      1.00      0.91        15
        Hold       0.00      0.00      0.00         3

    accuracy                           0.83        18
   macro avg       0.42      0.50      0.45        18
weighted avg       0.69      0.83      0.76        18



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
# import pickle

# with open("scaler.pkl", "wb") as f:
#     pickle.dump(X_scaler,f)

# with open("model.pkl", "wb") as f:
#     pickle.dump(model,f)

# with open("model1.pkl", "wb") as f:
#     pickle.dump(model1,f)