In [297]:
# import relevant statistical packages
import numpy as np
import pandas as pd
import statsmodels.api as sm
import pylab as pl
import sklearn.linear_model as skl
import sklearn.metrics as metrics

In [298]:
# import relevant data visualisation tools
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [299]:
# import data
url = "/Users/arpanganguli/Documents/Finance/ISLR/Datasets/Smarket.csv"
Smarket = pd.read_csv(url, index_col = 'SlNo')

In [300]:
# explore data
print(Smarket.head())

      Year   Lag1   Lag2   Lag3   Lag4   Lag5  Volume  Today Direction
SlNo                                                                  
1     2001  0.381 -0.192 -2.624 -1.055  5.010  1.1913  0.959        Up
2     2001  0.959  0.381 -0.192 -2.624 -1.055  1.2965  1.032        Up
3     2001  1.032  0.959  0.381 -0.192 -2.624  1.4112 -0.623      Down
4     2001 -0.623  1.032  0.959  0.381 -0.192  1.2760  0.614        Up
5     2001  0.614 -0.623  1.032  0.959  0.381  1.2057  0.213        Up


In [301]:
print(Smarket.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1250 entries, 1 to 1250
Data columns (total 9 columns):
Year         1250 non-null int64
Lag1         1250 non-null float64
Lag2         1250 non-null float64
Lag3         1250 non-null float64
Lag4         1250 non-null float64
Lag5         1250 non-null float64
Volume       1250 non-null float64
Today        1250 non-null float64
Direction    1250 non-null object
dtypes: float64(7), int64(1), object(1)
memory usage: 97.7+ KB
None


**Using Statsmodels Logit method**

In [302]:
X = np.array(Smarket[['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']]) # Statsmodels requires the variables to be arrays
X1 = sm.add_constant(X)

In [303]:
Y = pd.get_dummies(Smarket['Direction'], drop_first = True) # Direction is in string format. I am converting it into a dummy format
Y1 = np.array(Y)

In [309]:
model_1 = sm.Logit(Y1, X1).fit()

Optimization terminated successfully.
         Current function value: 2.197001
         Iterations 4


In [310]:
model_1.summary()

  return 1 - self.llf/self.llnull


0,1,2,3
Dep. Variable:,y,No. Observations:,1250.0
Model:,Logit,Df Residuals:,1243.0
Method:,MLE,Df Model:,6.0
Date:,"Mon, 07 Jan 2019",Pseudo R-squ.:,inf
Time:,20:53:17,Log-Likelihood:,-2746.3
converged:,True,LL-Null:,0.0
,,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1260,0.241,-0.523,0.601,-0.598,0.346
x1,-0.0731,0.050,-1.457,0.145,-0.171,0.025
x2,-0.0423,0.050,-0.845,0.398,-0.140,0.056
x3,0.0111,0.050,0.222,0.824,-0.087,0.109
x4,0.0094,0.050,0.187,0.851,-0.089,0.107
x5,0.0103,0.050,0.208,0.835,-0.087,0.107
x6,0.1354,0.158,0.855,0.392,-0.175,0.446


In [312]:
model_1.pred_table()

array([[145., 457.],
       [141., 507.]])

**Another method through Statsmodels (my personal favourite)**

In [313]:
X2 = Smarket[['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']]
Y2 = Smarket['Direction']

In [314]:
model_2 = sm.formula.glm(formula = "Y2~X2", data=Smarket, family=sm.families.Binomial()).fit()

In [315]:
model_2.summary()

0,1,2,3
Dep. Variable:,"['Y2[Down]', 'Y2[Up]']",No. Observations:,1250
Model:,GLM,Df Residuals:,1243
Model Family:,Binomial,Df Model:,6
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-863.79
Date:,"Mon, 07 Jan 2019",Deviance:,1727.6
Time:,20:53:38,Pearson chi2:,1.25e+03
No. Iterations:,4,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.1260,0.241,0.523,0.601,-0.346,0.598
X2[0],0.0731,0.050,1.457,0.145,-0.025,0.171
X2[1],0.0423,0.050,0.845,0.398,-0.056,0.140
X2[2],-0.0111,0.050,-0.222,0.824,-0.109,0.087
X2[3],-0.0094,0.050,-0.187,0.851,-0.107,0.089
X2[4],-0.0103,0.050,-0.208,0.835,-0.107,0.087
X2[5],-0.1354,0.158,-0.855,0.392,-0.446,0.175


In [316]:
print(model_2.params)

Intercept    0.126000
X2[0]        0.073074
X2[1]        0.042301
X2[2]       -0.011085
X2[3]       -0.009359
X2[4]       -0.010313
X2[5]       -0.135441
dtype: float64


In [317]:
y_pred_2 = model.predict(X3)

In [318]:
print(y_pred_2.head())

SlNo
1    0.492916
2    0.518532
3    0.518861
4    0.484778
5    0.489219
dtype: float64


In [321]:
for i in y_pred_2:
    if (i > 0.5):
        y_pred_2.replace(to_replace=i, value="Up", inplace=True)
    else:
        y_pred_2.replace(to_replace=i, value="Down", inplace=True)

In [322]:
print(y_pred_2.reset_index(drop = True).head())

0    Down
1      Up
2      Up
3    Down
4    Down
dtype: object


In [323]:
from sklearn.metrics import confusion_matrix

In [325]:
conf_mat_2 = confusion_matrix(Y2, y_pred_2)
print(conf_mat)

[[457 145]
 [507 141]]


In [326]:
accuracy_2 = (conf_mat_2[0,0] + conf_mat_2[1,1]) / conf_mat_2.sum()
print(accuracy_2)

0.4784


In [327]:
from sklearn.model_selection import train_test_split

In [328]:
X_train, X_test, y_train, y_test = train_test_split(X2, Y2, test_size=0.2016)

In [329]:
data_3 = pd.concat([pd.DataFrame(y_train), X_train], axis = 1)

In [330]:
print(data_3.head())

     Direction   Lag1   Lag2   Lag3   Lag4   Lag5  Volume
SlNo                                                     
409       Down -1.813 -1.385  0.754 -2.269  1.405  1.1466
21        Down -0.841 -0.151  0.359 -1.747  0.546  1.1583
177         Up -0.517  0.879  3.898 -1.903 -3.106  1.5191
1014        Up -0.353 -0.641 -0.778 -0.949  0.967  1.4946
287       Down  1.451 -0.667  1.949  2.263 -0.285  1.5413


In [331]:
model_3 = sm.formula.glm(formula = 'Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume', data = data_3,  family = sm.families.Binomial()).fit()

In [332]:
print(model_3.summary())

                          Generalized Linear Model Regression Results                           
Dep. Variable:     ['Direction[Down]', 'Direction[Up]']   No. Observations:                  998
Model:                                              GLM   Df Residuals:                      991
Model Family:                                  Binomial   Df Model:                            6
Link Function:                                    logit   Scale:                          1.0000
Method:                                            IRLS   Log-Likelihood:                -687.62
Date:                                  Mon, 07 Jan 2019   Deviance:                       1375.2
Time:                                          20:57:33   Pearson chi2:                     998.
No. Iterations:                                       4   Covariance Type:             nonrobust
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------

In [334]:
y_pred_3 = model_3.predict(X_test)

In [335]:
for j in y_pred_3:
    if (j > 0.5):
        y_pred_3.replace(to_replace=j, value="Up", inplace=True)
    else:
        y_pred_3.replace(to_replace=j, value="Down", inplace=True)

In [336]:
from sklearn.metrics import confusion_matrix

In [337]:
conf_mat_3 = confusion_matrix(y_test, y_pred_3)

In [338]:
print(conf_mat_3)

[[102  25]
 [ 91  34]]


In [339]:
accuracy_3 = (conf_mat_3[0, 0] + conf_mat_3[1, 1]) / conf_mat_3.sum()

In [341]:
print(accuracy_3)

0.5396825396825397


**There is an improvement in the accuracy of the model. Now, if we just take just Lag4 and Lag5 as the independent variables (since they 
have the lowest p-values) and perform logistic regression and check if the new model indeed improves the accuracy or not.**

In [345]:
X4 = Smarket[['Lag4', 'Lag5']]
Y4 = Smarket['Direction']

In [346]:
data_4 = pd.concat([ pd.DataFrame(Y4), X4], axis = 1)

In [347]:
print(data_4.head())

     Direction   Lag4   Lag5
SlNo                        
1           Up -1.055  5.010
2           Up -2.624 -1.055
3         Down -0.192 -2.624
4           Up  0.381 -0.192
5           Up  0.959  0.381


In [348]:
from sklearn.model_selection import train_test_split

In [349]:
X_train, X_test, y_train, y_test = train_test_split(data_4.drop(columns = ['Direction']), data_4.Direction, test_size=0.2016)

In [350]:
model_4 = sm.formula.glm(formula = "Direction~Lag4+Lag5", data = data_4, family = sm.families.Binomial()).fit()

In [351]:
print(model_4.summary())

                          Generalized Linear Model Regression Results                           
Dep. Variable:     ['Direction[Down]', 'Direction[Up]']   No. Observations:                 1250
Model:                                              GLM   Df Residuals:                     1247
Model Family:                                  Binomial   Df Model:                            2
Link Function:                                    logit   Scale:                          1.0000
Method:                                            IRLS   Log-Likelihood:                -865.56
Date:                                  Mon, 07 Jan 2019   Deviance:                       1731.1
Time:                                          21:00:54   Pearson chi2:                 1.25e+03
No. Iterations:                                       4   Covariance Type:             nonrobust
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------

In [352]:
y_pred_4 = model_4.predict(X_test)

In [353]:
print(y_pred_4.head())

SlNo
271    0.483735
790    0.479764
877    0.483938
389    0.479740
452    0.482910
dtype: float64


In [354]:
for k in y_pred_4:
    if (k > 0.5):
        y_pred_4.replace(to_replace=k, value="Up", inplace=True)
    else:
        y_pred_4.replace(to_replace=k, value="Down", inplace=True )

In [355]:
print(y_pred_4.head())

SlNo
271    Down
790    Down
877    Down
389    Down
452    Down
dtype: object


In [356]:
from sklearn.metrics import confusion_matrix

In [357]:
conf_int_4 = confusion_matrix(y_test, y_pred_4)

In [358]:
print(conf_int_4)

[[119   0]
 [133   0]]


In [359]:
accuracy_4 = (conf_int_4[0, 0] + conf_int_4[1, 1]) / conf_int_4.sum()

In [360]:
print(accuracy_4)

0.4722222222222222


**This is potentially worrying given the accuracy of the model decreases and the model is unable to assess any false positives (banks
usually prioritise the detection of false positives).**