In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from pathlib import Path
import sklearn as skl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

import pickle

In [2]:
stock_data = Path('AI_Stocks.csv')
stock_df = pd.read_csv(stock_data)
stock_df.head()

Unnamed: 0,No.,Ticker,Company,Market Cap,Forward P/E,EPS growth next year,EPS growth past 5 years,EPS growth next 5 years,Sales growth past 5 years,Sales,Total Debt/Equity,Profit Margin,Average True Range,IPO Date,Employees,Analyst Recom,Volume,Target Price,Price
0,1,AAPL,Apple Inc.,3048721.25,29.81,10.15,21.6,7.86,11.5,385095.0,1.76,24.5,2.9,12/12/1980,164000,BUY,38786913,193.16,196.45
1,2,ACN,Accenture plc,212334.12,25.39,7.42,14.5,8.64,11.2,63550.2,0.0,11.3,5.59,7/19/2001,721000,BUY,1351376,337.95,316.35
2,3,ADBE,Adobe Inc.,241931.46,30.71,13.04,24.5,14.07,19.2,18429.0,0.24,26.3,13.98,8/13/1986,29239,BUY,4060003,547.43,546.17
3,4,ADSK,"Autodesk, Inc.",45236.55,25.32,14.92,48.5,14.38,19.5,5104.0,0.0,16.4,5.61,6/28/1985,13700,BUY,1302035,231.7,211.99
4,5,AI,"C3.ai, Inc.",4666.62,358.97,140.0,35.2,12.13,27.3,266.8,0.0,9.05,3.26,12/9/2020,914,HOLD,18445319,28.4,42.0


In [3]:
stock_df1 = stock_df.drop(columns=['No.', 'Ticker', 'Company', 'IPO Date'])
stock_df1.head()

Unnamed: 0,Market Cap,Forward P/E,EPS growth next year,EPS growth past 5 years,EPS growth next 5 years,Sales growth past 5 years,Sales,Total Debt/Equity,Profit Margin,Average True Range,Employees,Analyst Recom,Volume,Target Price,Price
0,3048721.25,29.81,10.15,21.6,7.86,11.5,385095.0,1.76,24.5,2.9,164000,BUY,38786913,193.16,196.45
1,212334.12,25.39,7.42,14.5,8.64,11.2,63550.2,0.0,11.3,5.59,721000,BUY,1351376,337.95,316.35
2,241931.46,30.71,13.04,24.5,14.07,19.2,18429.0,0.24,26.3,13.98,29239,BUY,4060003,547.43,546.17
3,45236.55,25.32,14.92,48.5,14.38,19.5,5104.0,0.0,16.4,5.61,13700,BUY,1302035,231.7,211.99
4,4666.62,358.97,140.0,35.2,12.13,27.3,266.8,0.0,9.05,3.26,914,HOLD,18445319,28.4,42.0


In [4]:
stock_df2 = pd.get_dummies(stock_df1, columns=["Analyst Recom"])
stock_df2

Unnamed: 0,Market Cap,Forward P/E,EPS growth next year,EPS growth past 5 years,EPS growth next 5 years,Sales growth past 5 years,Sales,Total Debt/Equity,Profit Margin,Average True Range,Employees,Volume,Target Price,Price,Analyst Recom_BUY,Analyst Recom_HOLD
0,3048721.25,29.81,10.15,21.6,7.86,11.5,385095.0,1.76,24.5,2.9,164000,38786913,193.16,196.45,1,0
1,212334.12,25.39,7.42,14.5,8.64,11.2,63550.2,0.0,11.3,5.59,721000,1351376,337.95,316.35,1,0
2,241931.46,30.71,13.04,24.5,14.07,19.2,18429.0,0.24,26.3,13.98,29239,4060003,547.43,546.17,1,0
3,45236.55,25.32,14.92,48.5,14.38,19.5,5104.0,0.0,16.4,5.61,13700,1302035,231.7,211.99,1,0
4,4666.62,358.97,140.0,35.2,12.13,27.3,266.8,0.0,9.05,3.26,914,18445319,28.4,42.0,0,1
5,181192.44,26.99,48.95,21.9,9.87,35.1,23067.0,0.0,1.7,4.21,25000,54041667,133.65,114.4,1,0
6,1330520.98,51.77,62.19,-26.0,9.09,23.6,524897.0,0.56,0.8,3.17,1541000,41832022,143.17,133.68,1,0
7,29304.29,35.36,11.63,13.6,8.3,13.5,2149.9,0.0,25.7,7.19,5600,398081,338.36,342.1,1,0
8,290876.78,29.58,14.1,24.2,21.67,18.8,28372.8,0.0,28.7,19.53,38866,789125,758.86,716.41,1,0
9,368275.77,19.91,7.61,45.8,10.6,13.5,35042.0,1.79,38.7,19.55,20000,1495229,880.43,898.65,1,0


In [5]:
X = stock_df2.drop(columns=['Analyst Recom_BUY'])
y = stock_df2['Analyst Recom_BUY']

In [6]:
y

0     1
1     1
2     1
3     1
4     0
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    0
17    1
18    1
19    1
20    1
21    0
22    1
23    1
24    1
25    1
26    1
27    0
28    1
29    1
30    1
31    1
32    1
33    1
34    1
35    1
36    1
37    0
38    1
Name: Analyst Recom_BUY, dtype: uint8

In [7]:
X.columns

Index(['Market Cap', 'Forward P/E', 'EPS growth next year',
       'EPS growth past 5 years', 'EPS growth next 5 years',
       'Sales growth past 5 years', 'Sales', 'Total Debt/Equity',
       'Profit Margin', 'Average True Range', 'Employees', 'Volume',
       'Target Price', 'Price', 'Analyst Recom_HOLD'],
      dtype='object')

In [8]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1, stratify=y)

In [9]:
# create a scaler instance
X_scaler = skl.preprocessing.StandardScaler()

In [10]:
# fit/train the scaler
X_scaler.fit(X_train)

In [11]:
# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# create a logistic regression model
#model = LogisticRegression(solver='lbfgs', random_state=1)
from sklearn.linear_model import LogisticRegressionCV
classifier = LogisticRegressionCV(cv=5, penalty='l1', solver='saga', 
            max_iter=10000, random_state=1)
classifier

In [13]:
classifier.fit(X_train, y_train)



In [14]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8620689655172413
Testing Data Score: 0.9


In [15]:
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,1
4,1,0
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1


In [16]:
# Display the accuracy score for the test dataset.
score = balanced_accuracy_score(y_test, predictions)
print('Accuracy score: ', score)

Accuracy score:  0.5


In [17]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.90      1.00      0.95         9

    accuracy                           0.90        10
   macro avg       0.45      0.50      0.47        10
weighted avg       0.81      0.90      0.85        10



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resample, y_resample = ros.fit_resample(X_train, y_train)

Counter(y_resample)
y_resample.value_counts()


1    25
0    25
Name: Analyst Recom_BUY, dtype: int64

In [19]:
classifier1 = LogisticRegressionCV(cv=5, penalty='l1', solver='saga', 
            max_iter=10000, random_state=1)
classifier1

In [20]:
# Fit the model using the resampled training data
classifier1.fit(X_resample, y_resample)

In [21]:
# Make a prediction using the testing data
predictions1= classifier1.predict(X_test)
pd.DataFrame({"Prediction": predictions1, "Actual": y_test})

Unnamed: 0,Prediction,Actual
18,1,1
1,1,1
0,1,1
17,1,1
27,0,0
8,1,1
12,1,1
6,1,1
13,0,1
10,0,1


In [22]:
# Print the balanced_accuracy score of the model 
score1 = balanced_accuracy_score(y_test, predictions1)
print('New accuracy score: ', score1)


New accuracy score:  0.8888888888888888


In [23]:
print(classification_report(y_test, predictions1))

              precision    recall  f1-score   support

           0       0.33      1.00      0.50         1
           1       1.00      0.78      0.88         9

    accuracy                           0.80        10
   macro avg       0.67      0.89      0.69        10
weighted avg       0.93      0.80      0.84        10



In [24]:
# save the model to curent directory
filename = 'classifier1.pkl'
pickle.dump(classifier1, open(filename, 'wb'))

In [25]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
loaded_model