In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from pathlib import Path
import sklearn as skl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

import pickle

In [2]:
stock_data = Path('AI_Stocks.csv')
stock_df = pd.read_csv(stock_data)
stock_df.head()

Unnamed: 0,No.,Ticker,Company,Market Cap,Forward P/E,EPS growth next year,EPS growth past 5 years,EPS growth next 5 years,Sales growth past 5 years,Sales,Total Debt/Equity,Profit Margin,Average True Range,IPO Date,Employees,Analyst Recom,Volume,Target Price,Price
0,1,AAPL,Apple Inc.,3048721.25,29.81,10.15,21.6,7.86,11.5,385095.0,1.76,24.5,2.9,12/12/1980,164000,BUY,38786913,193.16,196.45
1,2,ACN,Accenture plc,212334.12,25.39,7.42,14.5,8.64,11.2,63550.2,0.0,11.3,5.59,7/19/2001,721000,BUY,1351376,337.95,316.35
2,3,ADBE,Adobe Inc.,241931.46,30.71,13.04,24.5,14.07,19.2,18429.0,0.24,26.3,13.98,8/13/1986,29239,BUY,4060003,547.43,546.17
3,4,ADSK,"Autodesk, Inc.",45236.55,25.32,14.92,48.5,14.38,19.5,5104.0,0.0,16.4,5.61,6/28/1985,13700,BUY,1302035,231.7,211.99
4,5,AI,"C3.ai, Inc.",4666.62,358.97,140.0,35.2,12.13,27.3,266.8,0.0,9.05,3.26,12/9/2020,914,HOLD,18445319,28.4,42.0


In [3]:
stock_df = stock_df.drop(columns=['No.', 'Ticker', 'Company', 'IPO Date'])
stock_df.head()

Unnamed: 0,Market Cap,Forward P/E,EPS growth next year,EPS growth past 5 years,EPS growth next 5 years,Sales growth past 5 years,Sales,Total Debt/Equity,Profit Margin,Average True Range,Employees,Analyst Recom,Volume,Target Price,Price
0,3048721.25,29.81,10.15,21.6,7.86,11.5,385095.0,1.76,24.5,2.9,164000,BUY,38786913,193.16,196.45
1,212334.12,25.39,7.42,14.5,8.64,11.2,63550.2,0.0,11.3,5.59,721000,BUY,1351376,337.95,316.35
2,241931.46,30.71,13.04,24.5,14.07,19.2,18429.0,0.24,26.3,13.98,29239,BUY,4060003,547.43,546.17
3,45236.55,25.32,14.92,48.5,14.38,19.5,5104.0,0.0,16.4,5.61,13700,BUY,1302035,231.7,211.99
4,4666.62,358.97,140.0,35.2,12.13,27.3,266.8,0.0,9.05,3.26,914,HOLD,18445319,28.4,42.0


In [4]:
y = stock_df['Analyst Recom']
X = stock_df.drop(columns=['Analyst Recom'])

In [5]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(29, 14)

In [6]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)
classifier

In [7]:
classifier.fit(X_train, y_train)

In [8]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9310344827586207
Testing Data Score: 0.9


In [9]:
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,BUY,BUY
1,BUY,BUY
2,BUY,HOLD
3,BUY,BUY
4,BUY,BUY
5,BUY,BUY
6,BUY,BUY
7,BUY,BUY
8,BUY,BUY
9,BUY,BUY


In [10]:
from sklearn.metrics import accuracy_score
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.9

In [13]:
# Create and save the confusion matrix for the training data
cm = confusion_matrix(y_test, predictions)

# Print the confusion matrix for the training data
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted0", "Predicted 1"]
)

display(cm_df)

Unnamed: 0,Predicted0,Predicted 1
Actual 0,9,0
Actual 1,1,0


In [14]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         BUY       0.90      1.00      0.95         9
        HOLD       0.00      0.00      0.00         1

    accuracy                           0.90        10
   macro avg       0.45      0.50      0.47        10
weighted avg       0.81      0.90      0.85        10



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
random_model = RandomOverSampler(random_state=1)


# Fit the original training data to the random_oversampler model
x_resample, y_resample = random_model.fit_resample(X_train, y_train)

In [16]:
# Count the distinct values of the resampled labels data
y_resample.value_counts()

BUY     25
HOLD    25
Name: Analyst Recom, dtype: int64

In [17]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier1 = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using the resampled training data
classifier1.fit(x_resample, y_resample)

# Make a prediction using the testing data
predictions1= classifier1.predict(X_test)
pd.DataFrame({"Prediction": predictions1, "Actual": y_test})

Unnamed: 0,Prediction,Actual
18,BUY,BUY
17,BUY,BUY
16,BUY,HOLD
10,HOLD,BUY
12,BUY,BUY
6,BUY,BUY
0,BUY,BUY
35,BUY,BUY
13,BUY,BUY
9,BUY,BUY


In [18]:
# Print the balanced_accuracy score of the model 
balanced_accuracy_score(y_test, predictions1)

0.4444444444444444

In [19]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, predictions1)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,8,1
Actual 1,1,0


In [20]:
# Print the classification report for the model
print(classification_report(y_test, predictions1))

              precision    recall  f1-score   support

         BUY       0.89      0.89      0.89         9
        HOLD       0.00      0.00      0.00         1

    accuracy                           0.80        10
   macro avg       0.44      0.44      0.44        10
weighted avg       0.80      0.80      0.80        10

