# ISLP - Chapter 5 - Exercise 7
### Author: pzuehlke

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

__7 (a):__

In [4]:
Weekly = pd.read_csv("Weekly.csv")
print(Weekly.info())
Weekly.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1089 entries, 0 to 1088
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Year       1089 non-null   int64  
 1   Lag1       1089 non-null   float64
 2   Lag2       1089 non-null   float64
 3   Lag3       1089 non-null   float64
 4   Lag4       1089 non-null   float64
 5   Lag5       1089 non-null   float64
 6   Volume     1089 non-null   float64
 7   Today      1089 non-null   float64
 8   Direction  1089 non-null   object 
dtypes: float64(7), int64(1), object(1)
memory usage: 76.7+ KB
None


Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,1990,0.816,1.572,-3.936,-0.229,-3.484,0.154976,-0.27,Down
1,1990,-0.27,0.816,1.572,-3.936,-0.229,0.148574,-2.576,Down
2,1990,-2.576,-0.27,0.816,1.572,-3.936,0.159837,3.514,Up
3,1990,3.514,-2.576,-0.27,0.816,1.572,0.16163,0.712,Up
4,1990,0.712,3.514,-2.576,-0.27,0.816,0.153728,1.178,Up


In [7]:
# Converting direction to a binary variable:
Weekly["direction"] = (Weekly["Direction"] == "Up").astype(int)
Weekly.head()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction,direction
0,1990,0.816,1.572,-3.936,-0.229,-3.484,0.154976,-0.27,Down,0
1,1990,-0.27,0.816,1.572,-3.936,-0.229,0.148574,-2.576,Down,0
2,1990,-2.576,-0.27,0.816,1.572,-3.936,0.159837,3.514,Up,1
3,1990,3.514,-2.576,-0.27,0.816,1.572,0.16163,0.712,Up,1
4,1990,0.712,3.514,-2.576,-0.27,0.816,0.153728,1.178,Up,1


In [10]:
y = Weekly["direction"]
X = sm.add_constant(Weekly[["Lag1", "Lag2"]])
model = sm.GLM(y, X, family=sm.families.Binomial())
results = model.fit()
print(results.summary())


                 Generalized Linear Model Regression Results                  
Dep. Variable:              direction   No. Observations:                 1089
Model:                            GLM   Df Residuals:                     1086
Model Family:                Binomial   Df Model:                            2
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -744.11
Date:                Mon, 10 Feb 2025   Deviance:                       1488.2
Time:                        17:26:23   Pearson chi2:                 1.09e+03
No. Iterations:                     4   Pseudo R-squ. (CS):           0.007303
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2212      0.061      3.599      0.0

__7 (b):__

In [32]:
X = sm.add_constant(Weekly[["Lag1", "Lag2"]])
y = Weekly["direction"]

y_train = y.iloc[1:]
X_train = X.iloc[1:]

model_0 = sm.GLM(y_train, X_train, family=sm.families.Binomial())
results_0 = model.fit()
print(results_0.summary())


                 Generalized Linear Model Regression Results                  
Dep. Variable:              direction   No. Observations:                 1089
Model:                            GLM   Df Residuals:                     1086
Model Family:                Binomial   Df Model:                            2
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -744.11
Date:                Mon, 10 Feb 2025   Deviance:                       1488.2
Time:                        18:20:52   Pearson chi2:                 1.09e+03
No. Iterations:                     4   Pseudo R-squ. (CS):           0.007303
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2212      0.061      3.599      0.0

__7 (c):__ No, we can see from the output below that this observation was incorrectly classified.

In [None]:
X_test_0 = X.iloc[[0]] 
y_test_0 = y.iloc[0]
prediction = results_0.predict(X_test)
print(prediction[0])  # prediction is an array having a single element
print(y_test)
print((prediction[0] > 0.5) and bool(y_test))

0.5706091879031766
0
False


__7 (d):__

In [38]:
n = len(Weekly)
y = Weekly["direction"]
X = sm.add_constant(Weekly[["Lag1", "Lag2"]])
errors = []
for i in range(n):
    X_train_i = X.drop(i)
    y_train_i = y.drop(i)
    X_test_i  = X.iloc[[i]]
    y_test_i  = y.iloc[i]

    model_i = sm.GLM(y_train_i, X_train_i, sm.families.Binomial()).fit()
    prediction_i = (model_i.predict(X_test_i) > 0.5)
    error_i = int(prediction_i.iloc[0] != y_test_i)  
    errors.append(error_i)

__7 (e):__ The error rate obtained below suggests that our model correctly
predicts the direction of the market about $ 55\% $ of the time, which is not
that much better than random guessing. On the other hand, it is quite
unreasonable to expect that it would be so easy to construct a model that
correctly predicts the market's direction, otherwise several people would
already have used it to make money, which would then be reflected in the price
and behavior of stocks, making it harder for that model to turn a profit. In
other words, the seemingly high error rate aligns with the efficient market
hypothesis.

In [41]:
LOOCV_error_rate = np.mean(np.array(errors))
print(LOOCV_error_rate)

0.44995408631772266
