In [1]:
import pandas as pd
from collections import Counter
from pathlib import Path
import sklearn as skl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
import pickle

In [2]:
stock_data = Path('StockAIClean3.csv')
stock_df = pd.read_csv(stock_data)
stock_df.head()

Unnamed: 0,No.,Ticker,Company,Sector,Mkt Cap Cat,For P/E Cat,Dividend,EPS growth this year (%) Cat,EPS growth next year (%) Cat,EPS growth past 5 years (%) Cat,...,Float Short (%) Cat,Profit Margin (%) Cat,Performance (Year) (%) Cat,Employees Cat,Analyst Rec Cat,Team Rec Cat,Risk,Volume Cat,Var % Cat,Var%
0,1,AAPL,Apple Inc.,Technology,Mg Cap,Med,Yes,Yes,Yes,Yes,...,Lo,Pos,Up,Hi,Buy,Buy,Lo,Hi,Neg,-1.2
1,2,ACN,Accenture plc,Technology,Lg Cap,Med,Yes,Yes,Yes,Yes,...,Lo,Pos,Up,Hi,Buy,Buy,Lo,Med,Pos,5.3
2,3,ADBE,Adobe Inc.,Technology,Lg Cap,Hi,No,Yes,Yes,Yes,...,Lo,Pos,Up,Med,Buy,Buy,Lo,Med,Neg,-0.3
3,4,ADSK,"Autodesk, Inc.",Technology,Lg Cap,Med,No,Yes,Yes,Yes,...,Lo,Pos,Down,Med,Buy,Buy,Lo,Lo,Pos,9.1
4,5,AI,"C3.ai, Inc.",Technology,Mid Cap,Hi,No,No,Yes,No,...,Hi,Neg,Up,Low,Hold,Hold,Med,Hi,Neg,-36.0


In [3]:
stock_df1 = stock_df.drop(columns=['No.', 'Ticker', 'Company', 'Sector', 'Var%'])
stock_df1.head()

Unnamed: 0,Mkt Cap Cat,For P/E Cat,Dividend,EPS growth this year (%) Cat,EPS growth next year (%) Cat,EPS growth past 5 years (%) Cat,EPS growth next 5 years (%) Cat,Sales growth past 5 years (%) Cat,Sales Cat,Float Short (%) Cat,Profit Margin (%) Cat,Performance (Year) (%) Cat,Employees Cat,Analyst Rec Cat,Team Rec Cat,Risk,Volume Cat,Var % Cat
0,Mg Cap,Med,Yes,Yes,Yes,Yes,Yes,Yes,Hi,Lo,Pos,Up,Hi,Buy,Buy,Lo,Hi,Neg
1,Lg Cap,Med,Yes,Yes,Yes,Yes,Yes,Yes,Hi,Lo,Pos,Up,Hi,Buy,Buy,Lo,Med,Pos
2,Lg Cap,Hi,No,Yes,Yes,Yes,Yes,Yes,Med,Lo,Pos,Up,Med,Buy,Buy,Lo,Med,Neg
3,Lg Cap,Med,No,Yes,Yes,Yes,Yes,Yes,Med,Lo,Pos,Down,Med,Buy,Buy,Lo,Lo,Pos
4,Mid Cap,Hi,No,No,Yes,No,Yes,Yes,Lo,Hi,Neg,Up,Low,Hold,Hold,Med,Hi,Neg


In [4]:
stock_df2 = pd.get_dummies(stock_df1, columns=["Var % Cat", "Risk", "Analyst Rec Cat", "Mkt Cap Cat", "For P/E Cat", "Dividend", "EPS growth this year (%) Cat", "EPS growth next year (%) Cat", "EPS growth past 5 years (%) Cat", "EPS growth next 5 years (%) Cat", "Sales growth past 5 years (%) Cat", "Sales Cat", "Float Short (%) Cat", "Profit Margin (%) Cat", "Performance (Year) (%) Cat", "Employees Cat", "Volume Cat"])
stock_df2

Unnamed: 0,Team Rec Cat,Var % Cat_Neg,Var % Cat_Pos,Risk_Hi,Risk_Lo,Risk_Med,Analyst Rec Cat_Buy,Analyst Rec Cat_Hold,Mkt Cap Cat_Lg Cap,Mkt Cap Cat_Mg Cap,...,Profit Margin (%) Cat_Neg,Profit Margin (%) Cat_Pos,Performance (Year) (%) Cat_Down,Performance (Year) (%) Cat_Up,Employees Cat_Hi,Employees Cat_Low,Employees Cat_Med,Volume Cat_Hi,Volume Cat_Lo,Volume Cat_Med
0,Buy,1,0,0,1,0,1,0,0,1,...,0,1,0,1,1,0,0,1,0,0
1,Buy,0,1,0,1,0,1,0,1,0,...,0,1,0,1,1,0,0,0,0,1
2,Buy,1,0,0,1,0,1,0,1,0,...,0,1,0,1,0,0,1,0,0,1
3,Buy,0,1,0,1,0,1,0,1,0,...,0,1,1,0,0,0,1,0,1,0
4,Hold,1,0,0,0,1,0,1,0,0,...,1,0,0,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,Hold,0,1,0,0,1,1,0,1,0,...,1,0,0,1,0,0,1,1,0,0
68,Buy,1,0,0,1,0,0,1,0,0,...,1,0,0,1,0,1,0,1,0,0
69,Sell,0,1,1,0,0,0,1,0,0,...,1,0,1,0,0,1,0,0,1,0
70,Hold,0,1,0,0,1,1,0,1,0,...,0,1,0,1,0,0,1,0,0,1


In [5]:
X = stock_df2.drop(columns=['Team Rec Cat'])
y = stock_df2['Team Rec Cat']

In [6]:
y

0      Buy
1      Buy
2      Buy
3      Buy
4     Hold
      ... 
67    Hold
68     Buy
69    Sell
70    Hold
71    Hold
Name: Team Rec Cat, Length: 72, dtype: object

In [7]:
X.columns

Index(['Var % Cat_Neg', 'Var % Cat_Pos', 'Risk_Hi', 'Risk_Lo', 'Risk_Med',
       'Analyst Rec Cat_Buy', 'Analyst Rec Cat_Hold', 'Mkt Cap Cat_Lg Cap',
       'Mkt Cap Cat_Mg Cap', 'Mkt Cap Cat_Mic Cap', 'Mkt Cap Cat_Mid Cap',
       'Mkt Cap Cat_Sm Cap', 'For P/E Cat_Hi', 'For P/E Cat_Low',
       'For P/E Cat_Med', 'Dividend_No', 'Dividend_Yes',
       'EPS growth this year (%) Cat_No', 'EPS growth this year (%) Cat_Yes',
       'EPS growth next year (%) Cat_No', 'EPS growth next year (%) Cat_Yes',
       'EPS growth past 5 years (%) Cat_No',
       'EPS growth past 5 years (%) Cat_Yes',
       'EPS growth next 5 years (%) Cat_No',
       'EPS growth next 5 years (%) Cat_Yes',
       'Sales growth past 5 years (%) Cat_No',
       'Sales growth past 5 years (%) Cat_Yes', 'Sales Cat_Hi', 'Sales Cat_Lo',
       'Sales Cat_Med', 'Float Short (%) Cat_Hi', 'Float Short (%) Cat_Lo',
       'Float Short (%) Cat_Med', 'Profit Margin (%) Cat_Neg',
       'Profit Margin (%) Cat_Pos', 'Performanc

In [8]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1, stratify=y)

In [9]:
# create a scaler instance
scaler = StandardScaler()

In [10]:
# fit/train the scaler
X_scaler = scaler.fit(X_train)

In [11]:
# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# create a logistic regression model
#model = LogisticRegression(solver='lbfgs', random_state=1)
from sklearn.linear_model import LogisticRegressionCV
model = LogisticRegressionCV(cv=5, penalty='l1', solver='saga', 
            max_iter=10000, random_state=1)
model

In [13]:
model.fit(X_train_scaled, y_train)

In [14]:
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 1.0
Testing Data Score: 1.0




In [15]:
predictions = model.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)



Unnamed: 0,Prediction,Actual
0,Hold,Hold
1,Sell,Sell
2,Buy,Buy
3,Buy,Buy
4,Hold,Hold
5,Hold,Hold
6,Hold,Hold
7,Sell,Sell
8,Buy,Buy
9,Sell,Sell


In [16]:
# Display the accuracy score for the test dataset.
score = balanced_accuracy_score(y_test, predictions)
print('Accuracy score: ', score)

Accuracy score:  1.0


In [17]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2"], columns=["Predicted 0", "Predicted 1", "Predicted 2"]
)
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,6,0,0
Actual 1,0,7,0
Actual 2,0,0,5


In [18]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         Buy       1.00      1.00      1.00         6
        Hold       1.00      1.00      1.00         7
        Sell       1.00      1.00      1.00         5

    accuracy                           1.00        18
   macro avg       1.00      1.00      1.00        18
weighted avg       1.00      1.00      1.00        18



In [19]:
# save the model to curent directory
filename = 'model.pkl'
pickle.dump(model, open(filename, 'wb'))

In [20]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
loaded_model

