In [1]:
# Import libraries
import pandas as pd
import yfinance as yf
import statsmodels.api as sm

In [2]:
# Download the historical daily price data for the ticker under study.
ticker = "SPY"
# Request unadjusted columns explicitly: the 'Adj Close' column used below only
# exists when auto_adjust is False, and recent yfinance releases default it to True.
data = yf.download(ticker, start="1990-01-01", auto_adjust=False)
# Recent yfinance releases can return MultiIndex columns even for a single
# ticker; keep only the first level (the field names) so that data['Adj Close']
# is a Series, as the rest of the notebook expects.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)
# Move the downloaded index into a column so `data` is indexed 0..n-1.
data = data.reset_index()
# Daily percentage change of the adjusted close, named 'Today'.
df = data['Adj Close'].pct_change()*100
df = df.rename('Today')
# Turn the Series into a DataFrame with columns ['index', 'Today'].
df = df.reset_index()

[*********************100%***********************]  1 of 1 completed


In [3]:
# Lagged returns: column 'LagK' holds the 'Today' return from K rows
# (trading days) earlier, for K = 1..5.
for lag in range(1, 6):
    df[f'Lag{lag}'] = df['Today'].shift(periods=lag)


In [4]:
# Add the previous day's volume from data to df, scaled to millions
# (raw volume / 1,000,000). shift(1) moves each value down one row so every
# row only sees the volume of the day before it. df and data share the same
# 0..n-1 index, so the assignment lines up row by row.
df['Volume'] = data['Volume'].shift(1) / 1_000_000

In [5]:
# Create the dependent variable: 1 if the daily return is greater than 0, else 0.
# Vectorised comparison instead of a per-row df['Today'][i] lookup: that lookup
# is label-based, so it only worked while the index was exactly 0..n-1 and
# raised KeyError if this cell was re-run after the NaN rows had been dropped.
# NaN returns compare as False and therefore still map to 0.
df['Direction'] = (df['Today'] > 0).astype('int64')

In [6]:
# Intercept term: a column of ones appended to the frame so the model
# fitted later includes a constant.
df = df.assign(Constant=1)

In [7]:
# Discard every row that has a NaN in any column (for example the leading rows
# where the return, the lags or the previous-day volume are undefined).
# The surviving rows keep their original index labels.
df = df.dropna(axis='index', how='any')

In [8]:
# Design matrix (intercept, the five lagged returns, previous-day volume)
# and the binary response.
X = df[['Constant', *(f'Lag{k}' for k in range(1, 6)), 'Volume']]
y = df['Direction']

# Specify the logistic regression of Direction on X ...
logit = sm.Logit(endog=y, exog=X)

# ... and estimate it; the fitted results object is reused by later cells.
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.689101
         Iterations 4


In [9]:
# Display the fitted model's summary tables (fit statistics and per-coefficient estimates)
result.summary()

0,1,2,3
Dep. Variable:,Direction,No. Observations:,7522.0
Model:,Logit,Df Residuals:,7515.0
Method:,MLE,Df Model:,6.0
Date:,"Tue, 20 Dec 2022",Pseudo R-squ.:,0.001757
Time:,12:54:02,Log-Likelihood:,-5183.4
converged:,True,LL-Null:,-5192.5
Covariance Type:,nonrobust,LLR p-value:,0.005649

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Constant,0.1477,0.032,4.631,0.000,0.085,0.210
Lag1,-0.0703,0.020,-3.549,0.000,-0.109,-0.031
Lag2,-0.0182,0.020,-0.917,0.359,-0.057,0.021
Lag3,-0.0273,0.020,-1.377,0.168,-0.066,0.012
Lag4,-0.0206,0.020,-1.043,0.297,-0.059,0.018
Lag5,-0.0307,0.020,-1.560,0.119,-0.069,0.008
Volume,0.0001,0.000,0.500,0.617,-0.000,0.001


In [10]:
# Predict the direction of the market.
# X is the same design matrix the model was fitted on, so these are in-sample
# fitted values, not out-of-sample forecasts. They are thresholded at 0.5
# when the confusion matrix is built.
pred = result.predict(X)

In [11]:
# Evaluate using confusion matrix
def confusion_matrix(act,pred):
    """Cross-tabulate actual vs. predicted market direction and compute accuracy.

    Parameters
    ----------
    act : iterable of numbers
        Actual outcomes; a value greater than 0 is labelled 'Up', anything
        else 'Down'.
    pred : iterable of numbers
        Predicted probabilities of an up move; a value greater than 0.5 is
        labelled 'Up', anything else 'Down'. Paired with `act` by position.

    Returns
    -------
    (pandas.DataFrame, float)
        The 2x2 table (rows named 'Actual', columns named 'Predicted', both
        ordered 'Down', 'Up') and the fraction of observations on its diagonal.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    labels = ['Down', 'Up']
    # Transform the actual and predicted values into direction labels
    predicted = ['Up' if p > 0.5 else 'Down' for p in pred]
    actual = ['Up' if a > 0 else 'Down' for a in act]
    n_obs = len(predicted)
    if n_obs == 0 or len(actual) != n_obs:
        raise ValueError(
            "act and pred must be non-empty and of equal length "
            f"(got {len(actual)} and {n_obs})"
        )
    # Create the confusion matrix
    table = pd.crosstab(pd.Series(actual), pd.Series(predicted), rownames=['Actual'], colnames=['Predicted'])
    # crosstab omits a label that never occurs (e.g. a model that always
    # predicts 'Up'); force the full 2x2 layout so the lookups below cannot fail.
    table = table.reindex(
        index=pd.Index(labels, name='Actual'),
        columns=pd.Index(labels, name='Predicted'),
        fill_value=0,
    )
    # Accuracy = share of observations on the diagonal
    accuracy = (table.loc['Up', 'Up'] + table.loc['Down', 'Down']) / n_obs
    # Return the confusion matrix and accuracy
    return table, accuracy

    

In [12]:
# Confusion matrix and accuracy of the fitted values against the actual
# directions, computed on the same rows the model was fitted on
confusion_matrix(df['Direction'],pred)


(Predicted  Down    Up
 Actual               
 Down        143  3335
 Up          120  3924,
 0.5406806700345653)