### Preprocessing

In [1]:
# import relevant statistical packages
import numpy as np
import pandas as pd

In [2]:
# import relevant data visualisation packages
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [9]:
# load Default dataset
url = "/Users/arpanganguli/Documents/Professional/Finance/ISLR/Datasets/Weekly.csv"
Weekly = pd.read_csv(url, index_col = 'Unnamed: 0')

In [10]:
Weekly.head()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
1,1990,0.816,1.572,-3.936,-0.229,-3.484,0.154976,-0.27,Down
2,1990,-0.27,0.816,1.572,-3.936,-0.229,0.148574,-2.576,Down
3,1990,-2.576,-0.27,0.816,1.572,-3.936,0.159837,3.514,Up
4,1990,3.514,-2.576,-0.27,0.816,1.572,0.16163,0.712,Up
5,1990,0.712,3.514,-2.576,-0.27,0.816,0.153728,1.178,Up


In [11]:
Weekly.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1089 entries, 1 to 1089
Data columns (total 9 columns):
Year         1089 non-null int64
Lag1         1089 non-null float64
Lag2         1089 non-null float64
Lag3         1089 non-null float64
Lag4         1089 non-null float64
Lag5         1089 non-null float64
Volume       1089 non-null float64
Today        1089 non-null float64
Direction    1089 non-null object
dtypes: float64(7), int64(1), object(1)
memory usage: 85.1+ KB


In [12]:
dfX = Weekly.drop(columns='Direction')
dfy = Weekly['Direction']

In [135]:
dfX.head()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today
1,1990,0.816,1.572,-3.936,-0.229,-3.484,0.154976,-0.27
2,1990,-0.27,0.816,1.572,-3.936,-0.229,0.148574,-2.576
3,1990,-2.576,-0.27,0.816,1.572,-3.936,0.159837,3.514
4,1990,3.514,-2.576,-0.27,0.816,1.572,0.16163,0.712
5,1990,0.712,3.514,-2.576,-0.27,0.816,0.153728,1.178


In [14]:
dfy.head()

1    Down
2    Down
3      Up
4      Up
5      Up
Name: Direction, dtype: object

### 7.a. Logistic regression model predicting Direction using Lag1 and Lag2

In [27]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [28]:
X = dfX[['Lag1', 'Lag2']]
y = dfy

In [29]:
glmfit = LogisticRegression(solver='liblinear').fit(X, y)

In [30]:
glmpred = glmfit.predict(X)

In [31]:
from sklearn.metrics import confusion_matrix

In [32]:
conf_mat = confusion_matrix(y, glmpred)
conf_mat

array([[ 38, 446],
       [ 38, 567]])

In [33]:
round((conf_mat[0][1] + conf_mat[1][0]) / y.shape[0], 4)

0.4444

### 7.b. Logistic regression model predicting Direction using Lag1 and Lag2 *using all but the first observation*

In [108]:
X_train = dfX[['Lag1', 'Lag2']].iloc[1:]
y_train = dfy.iloc[1:]

In [118]:
X_test = np.asarray(dfX[['Lag1', 'Lag2']].iloc[0]).reshape(-1,2)
y_test = np.asarray(dfy.iloc[0])

In [119]:
y_test

array('Down', dtype='<U4')

In [111]:
glmfit = LogisticRegression(solver='liblinear').fit(X_train, y_train)

In [114]:
glmpred = glmfit.predict(X_test)
glmpred

array(['Up'], dtype=object)

### 7.c. No, prediction was 'Up' and the true Direction was 'Down'

### 7.d. Recurring logistic regression using for-loop

In [122]:
n = len(dfX)
n

1089

In [163]:
glmpred_df = pd.DataFrame()
X = dfX[['Lag1', 'Lag2']]
y = dfy
for j in range(1, n):
    X_train, y_train, X_test, y_test = X.drop([j]), y.drop([j]), np.asarray(X.iloc[j]).reshape(-1,2), np.asarray(y.iloc[j], dtype=object)
    glmfit = LogisticRegression(solver='liblinear').fit(X_train, y_train)
    glmpred = glmfit.predict(X_test)
    if glmpred == y_test:
        glmpred_df = glmpred_df.append([0])
    else:
        glmpred_df = glmpred_df.append([1])

In [169]:
glmpred_df.reset_index(drop=True,inplace=True)
glmpred_df.columns = ['Error']

In [172]:
glmpred_df.head()

Unnamed: 0,Error
0,1
1,0
2,1
3,0
4,1


In [173]:
glmpred_df.shape

(1088, 1)

In [178]:
LOOCV_estimate = glmpred_df.mean()

In [179]:
LOOCV_estimate

Error    0.444853
dtype: float64

### 7.e. LOOCV and logistic regression have the almost same error rates (0.444853 and 0.4444 respectively).