In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from pathlib import Path
import sklearn as skl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

import pickle

In [2]:
stock_data = Path('AI_Stocks.csv')
stock_df = pd.read_csv(stock_data)
stock_df.head()

Unnamed: 0,No.,Ticker,Company,Sector,Industry,Market Cap,Forward P/E,Dividend,EPS (ttm),EPS growth this year (%),...,Average True Range,52-Week High (%),52-Week Low (%),Relative Strength Index (14),IPO Date,Employees,Analyst Recom,Volume,Target Price,Price
0,1,AAPL,Apple Inc.,Technology,Consumer Electronics,3035607.62,29.69,YES,5.89,8.9,...,2.79,-1.32,57.53,63.46,12/12/1980,164000,BUY,35200436,193.16,195.6
1,2,ACN,Accenture plc,Technology,Information Technology Services,215435.06,25.76,YES,11.22,17.0,...,5.7,-2.12,32.2,60.35,7/19/2001,721000,BUY,1913805,337.95,320.97
2,3,ADBE,Adobe Inc.,Technology,Software - Infrastructure,243229.33,30.88,NO,10.47,0.7,...,13.72,-0.69,99.87,73.02,8/13/1986,29239,BUY,2274261,547.43,549.1
3,4,ADSK,"Autodesk, Inc.",Technology,Software - Application,45334.7,25.38,NO,3.85,68.6,...,5.51,-9.6,18.28,54.2,6/28/1985,13700,BUY,757302,231.7,212.45
4,5,AI,"C3.ai, Inc.",Technology,Software - Application,4929.95,379.23,NO,-2.45,-33.0,...,3.36,-9.21,336.71,62.38,12/9/2020,914,HOLD,28183500,28.4,44.37


In [3]:
len(stock_df)

72

In [4]:
stock_df1 = stock_df.drop(columns=['No.', 'Ticker', 'Company', 'Sector', 'Industry', 'IPO Date'])
stock_df1.head()

Unnamed: 0,Market Cap,Forward P/E,Dividend,EPS (ttm),EPS growth this year (%),EPS next year,EPS growth next year (%),EPS growth past 5 years (%),EPS growth next 5 years (%),Sales growth past 5 years (%),...,Performance (Year) (%),Average True Range,52-Week High (%),52-Week Low (%),Relative Strength Index (14),Employees,Analyst Recom,Volume,Target Price,Price
0,3035607.62,29.69,YES,5.89,8.9,58.31,10.15,21.6,7.86,11.5,...,20.36,2.79,-1.32,57.53,63.46,164000,BUY,35200436,193.16,195.6
1,215435.06,25.76,YES,11.22,17.0,201.96,7.42,14.5,8.64,11.2,...,4.8,5.7,-2.12,32.2,60.35,721000,BUY,1913805,337.95,320.97
2,243229.33,30.88,NO,10.47,0.7,17.8,13.04,24.5,14.07,19.2,...,33.89,13.72,-0.69,99.87,73.02,29239,BUY,2274261,547.43,549.1
3,45334.7,25.38,NO,3.85,68.6,267.96,14.92,48.5,14.38,19.5,...,-1.79,5.51,-9.6,18.28,54.2,13700,BUY,757302,231.7,212.45
4,4929.95,379.23,NO,-2.45,-33.0,78.4,140.0,-90.0,210.5,200.0,...,141.01,3.36,-9.21,336.71,62.38,914,HOLD,28183500,28.4,44.37


In [5]:
stock_df2 = pd.get_dummies(stock_df1, columns=["Dividend"])
stock_df2

Unnamed: 0,Market Cap,Forward P/E,EPS (ttm),EPS growth this year (%),EPS next year,EPS growth next year (%),EPS growth past 5 years (%),EPS growth next 5 years (%),Sales growth past 5 years (%),Sales,...,52-Week High (%),52-Week Low (%),Relative Strength Index (14),Employees,Analyst Recom,Volume,Target Price,Price,Dividend_NO,Dividend_YES
0,3035607.62,29.69,5.89,8.9,58.31,10.15,21.6,7.86,11.5,385095.0,...,-1.32,57.53,63.46,164000,BUY,35200436,193.16,195.60,0,1
1,215435.06,25.76,11.22,17.0,201.96,7.42,14.5,8.64,11.2,63550.2,...,-2.12,32.20,60.35,721000,BUY,1913805,337.95,320.97,0,1
2,243229.33,30.88,10.47,0.7,17.80,13.04,24.5,14.07,19.2,18429.0,...,-0.69,99.87,73.02,29239,BUY,2274261,547.43,549.10,1,0
3,45334.70,25.38,3.85,68.6,267.96,14.92,48.5,14.38,19.5,5104.0,...,-9.60,18.28,54.20,13700,BUY,757302,231.70,212.45,1,0
4,4929.95,379.23,-2.45,-33.0,78.40,140.00,-90.0,210.50,200.0,266.8,...,-9.21,336.71,62.38,914,HOLD,28183500,28.40,44.37,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,91411.14,50.71,-1.72,100.2,-174.06,6976.92,-10.4,23.17,32.1,33846.0,...,-5.74,103.85,55.47,32800,BUY,82176369,52.59,46.65,1,0
68,5389.45,111.42,-3.30,-191.9,629.97,266.70,-65.3,30.00,77.7,710.4,...,4.60,504.27,77.13,1875,HOLD,12501299,22.93,72.09,1,0
69,160.82,-0.10,-0.72,63.5,-46.44,88.10,35.2,100.00,59.7,145.6,...,-57.96,55.03,61.88,661,HOLD,284324,4.89,4.62,1,0
70,33087.91,35.59,4.66,76.4,360.68,16.48,18.9,10.50,3.1,2373.1,...,-4.02,40.68,51.72,7000,BUY,1165595,232.79,229.22,0,1


In [6]:
stock_df3 = pd.get_dummies(stock_df2, columns=["Analyst Recom"])
stock_df3

Unnamed: 0,Market Cap,Forward P/E,EPS (ttm),EPS growth this year (%),EPS next year,EPS growth next year (%),EPS growth past 5 years (%),EPS growth next 5 years (%),Sales growth past 5 years (%),Sales,...,52-Week Low (%),Relative Strength Index (14),Employees,Volume,Target Price,Price,Dividend_NO,Dividend_YES,Analyst Recom_BUY,Analyst Recom_HOLD
0,3035607.62,29.69,5.89,8.9,58.31,10.15,21.6,7.86,11.5,385095.0,...,57.53,63.46,164000,35200436,193.16,195.60,0,1,1,0
1,215435.06,25.76,11.22,17.0,201.96,7.42,14.5,8.64,11.2,63550.2,...,32.20,60.35,721000,1913805,337.95,320.97,0,1,1,0
2,243229.33,30.88,10.47,0.7,17.80,13.04,24.5,14.07,19.2,18429.0,...,99.87,73.02,29239,2274261,547.43,549.10,1,0,1,0
3,45334.70,25.38,3.85,68.6,267.96,14.92,48.5,14.38,19.5,5104.0,...,18.28,54.20,13700,757302,231.70,212.45,1,0,1,0
4,4929.95,379.23,-2.45,-33.0,78.40,140.00,-90.0,210.50,200.0,266.8,...,336.71,62.38,914,28183500,28.40,44.37,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,91411.14,50.71,-1.72,100.2,-174.06,6976.92,-10.4,23.17,32.1,33846.0,...,103.85,55.47,32800,82176369,52.59,46.65,1,0,1,0
68,5389.45,111.42,-3.30,-191.9,629.97,266.70,-65.3,30.00,77.7,710.4,...,504.27,77.13,1875,12501299,22.93,72.09,1,0,0,1
69,160.82,-0.10,-0.72,63.5,-46.44,88.10,35.2,100.00,59.7,145.6,...,55.03,61.88,661,284324,4.89,4.62,1,0,0,1
70,33087.91,35.59,4.66,76.4,360.68,16.48,18.9,10.50,3.1,2373.1,...,40.68,51.72,7000,1165595,232.79,229.22,0,1,1,0


In [7]:
X = stock_df3.drop(columns=['Analyst Recom_BUY'])
y = stock_df3['Analyst Recom_BUY']

In [8]:
y

0     1
1     1
2     1
3     1
4     0
     ..
67    1
68    0
69    0
70    1
71    1
Name: Analyst Recom_BUY, Length: 72, dtype: uint8

In [9]:
X.columns

Index(['Market Cap', 'Forward P/E', 'EPS (ttm)', 'EPS growth this year (%)',
       'EPS next year', 'EPS growth next year (%)',
       'EPS growth past 5 years (%)', 'EPS growth next 5 years (%)',
       'Sales growth past 5 years (%)', 'Sales', 'Float Short (%)',
       'Total Debt/Equity', 'Profit Margin (%)', 'Performance (Year) (%)',
       'Average True Range', '52-Week High (%)', '52-Week Low (%)',
       'Relative Strength Index (14)', 'Employees', 'Volume', 'Target Price',
       'Price', 'Dividend_NO', 'Dividend_YES', 'Analyst Recom_HOLD'],
      dtype='object')

In [10]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1, stratify=y)

In [11]:
# create a scaler instance
X_scaler = skl.preprocessing.StandardScaler()

In [12]:
# fit/train the scaler
X_scaler.fit(X_train)

In [13]:
# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# create a logistic regression model
#model = LogisticRegression(solver='lbfgs', random_state=1)
from sklearn.linear_model import LogisticRegressionCV
classifier = LogisticRegressionCV(cv=5, penalty='l1', solver='saga', 
            max_iter=10000, random_state=1)
classifier

In [15]:
classifier.fit(X_train, y_train)

In [16]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8518518518518519
Testing Data Score: 0.8888888888888888


In [17]:
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,1
9,1,0


In [18]:
# Display the accuracy score for the test dataset.
score = balanced_accuracy_score(y_test, predictions)
print('Accuracy score: ', score)

Accuracy score:  0.5


In [19]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.89      1.00      0.94        16

    accuracy                           0.89        18
   macro avg       0.44      0.50      0.47        18
weighted avg       0.79      0.89      0.84        18



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resample, y_resample = ros.fit_resample(X_train, y_train)

Counter(y_resample)
y_resample.value_counts()


0    46
1    46
Name: Analyst Recom_BUY, dtype: int64

In [21]:
classifier1 = LogisticRegressionCV(cv=5, penalty='l1', solver='saga', 
            max_iter=10000, random_state=1)
classifier1

In [22]:
# Fit the model using the resampled training data
classifier1.fit(X_resample, y_resample)

In [23]:
# Make a prediction using the testing data
predictions1= classifier1.predict(X_test)
pd.DataFrame({"Prediction": predictions1, "Actual": y_test})

Unnamed: 0,Prediction,Actual
66,1,1
5,1,1
15,0,1
42,1,1
27,1,1
57,1,1
55,1,1
47,1,1
22,1,1
68,0,0


In [24]:
# Print the balanced_accuracy score of the model 
score1 = balanced_accuracy_score(y_test, predictions1)
print('New accuracy score: ', score1)


New accuracy score:  0.90625


In [25]:
print(classification_report(y_test, predictions1))

              precision    recall  f1-score   support

           0       0.40      1.00      0.57         2
           1       1.00      0.81      0.90        16

    accuracy                           0.83        18
   macro avg       0.70      0.91      0.73        18
weighted avg       0.93      0.83      0.86        18



In [26]:
import pickle

with open("classifier.pkl", "wb") as f:
    pickle.dump(classifier,f)

with open("classifier1.pkl", "wb") as f:
    pickle.dump(classifier1,f)

In [27]:
# # save the model to curent directory
# filename = 'classifier1.pkl'
# pickle.dump(classifier1, open(filename, 'wb'))

In [28]:
# # load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, y_test)
# loaded_model