In [None]:
import pandas as pd
import yfinance as yf
import statsmodels.api as sm

In [None]:
data = yf.download("^GSPC",start='2001-01-03')

In [None]:
data

In [None]:
df = data['Adj Close'].pct_change() * 100

In [None]:
df

In [None]:
df = df.rename("Today")

In [None]:
df

In [None]:
df = df.reset_index()

In [None]:
df

In [None]:
for i in range(1,6):
    df['Lag '+str(i)] = df['Today'].shift(i)

In [None]:
df

In [None]:
df['Volume'] = data.Volume.shift(1).values/1_000_000_000

In [None]:
df

In [None]:
df.dropna()

In [None]:
df['Direction'] = [1 if i > 0 else 0 for i in df['Today']]

In [None]:
df

In [None]:
# For the stats model we are adding a constant to the dataset
# This is required as otherwise the model does not have an intercept
df = sm.add_constant(df)

In [None]:
df = df.dropna()

In [None]:
# Independent variables (predictors): the intercept column, the five lagged
# returns and the lagged volume.
# Dependent variable (what the logistic regression predicts): Direction.
predictor_cols = ['const', 'Lag 1', 'Lag 2', 'Lag 3', 'Lag 4', 'Lag 5', 'Volume']

X = df.loc[:, predictor_cols]
y = df.loc[:, 'Direction']

In [None]:
# Defining our model
# Before running the model need to make sure that no values are NaN
model = sm.Logit(y,X)

In [None]:
# Now fitting the model to those variables
result = model.fit()

In [None]:
# Summary of the logistic regression
# As per the summary the lowest P value is for the Lag 1
result.summary()


In [None]:
# Making prediction with the model
prediction = result.predict(X)

In [None]:
prediction

In [None]:
y

In [None]:
# Logistic regression outputs probabilities, so they are turned into a binary
# class first: prediction > 0.5 means 'Up', anything else 'Down'.
def confusion_matrix(act,pred):
    """Cross-tabulate actual against predicted market direction.

    Parameters
    ----------
    act : iterable of numbers
        Actual outcomes; a value > 0 counts as 'Up', anything else as 'Down'.
    pred : iterable of numbers
        Predicted probabilities; a value > 0.5 counts as 'Up', anything else
        as 'Down'. Paired with `act` by position.

    Returns
    -------
    pandas.DataFrame
        Table of counts with rows 'Actual' and columns 'Predicted', both in
        the fixed order ['Down', 'Up']. A class that never occurs gets an
        explicit row/column of zeros, so the table is 2x2 even when the
        model only ever predicts one direction.
    """
    labels = ['Down', 'Up']
    predicted = pd.Series(['Up' if p > 0.5 else 'Down' for p in pred])
    actual = pd.Series(['Up' if a > 0 else 'Down' for a in act])
    table = pd.crosstab(actual,
                        predicted,
                        rownames=['Actual'],
                        colnames=['Predicted'])
    # crosstab only emits labels it has seen; force both classes on both axes
    # so positional/diagonal reads never hit a missing row or column.
    table = table.reindex(index=labels, columns=labels, fill_value=0)
    return table.rename_axis(index='Actual', columns='Predicted')


In [None]:
confusion_matrix(y,prediction)

In [None]:
# finding out how many times the model has predicted correctly
# Adding the diagonal values and dividing by the total observations
# This model is overly optimistic as it was trained and tested on the same set of data.
(197+2582)/len(df)

In [None]:
# Split the data chronologically into a train and a test set
# Train: every row dated before 2020
# Test: every row dated 2020 or later
predictors = ['const','Lag 1','Lag 2','Lag 3','Lag 4','Lag 5','Volume']
before_2020 = df.Date.dt.year < 2020
from_2020 = df.Date.dt.year >= 2020

x_train = df.loc[before_2020, predictors]
y_train = df.loc[before_2020, 'Direction']

x_test = df.loc[from_2020, predictors]
y_test = df.loc[from_2020, 'Direction']


In [None]:
# defining the model and provide the training data in order to train the model

model = sm.Logit(y_train,x_train)


In [None]:
# now fitting the model to the training data
result = model.fit()

In [None]:
# make prediction on the Test data
prediction = result.predict(x_test)

In [None]:
# Calling the confusion matrix and passing the actual variables and the prediction
confusion_matrix(y_test,prediction)

In [None]:
# Total number of observation is
(13+173)/len(x_test)

In [None]:
# Same chronological split (train: before 2020, test: 2020 onwards), but for
# a smaller model that keeps only the intercept and the two most recent lags
reduced_predictors = ['const','Lag 1','Lag 2']
train_rows = df[df.Date.dt.year < 2020]
test_rows = df[df.Date.dt.year >= 2020]

x_train = train_rows[reduced_predictors]
y_train = train_rows['Direction']

x_test = test_rows[reduced_predictors]
y_test = test_rows['Direction']

In [None]:
model = sm.Logit(y_train,x_train)
result = model.fit()
prediction = result.predict(x_test)
confusion_matrix(y_test,prediction)

In [None]:
(20+170)/len(x_test)