In [1]:
# ISLR material, Original R to Python adaptation by Jordi Warmenhoven
# Subsequent Adaptation from http://www.science.smith.edu/~jcrouser/SDS293/
# Further simplifications and adaptations made on top of those.

In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

  from pandas.core import datetools


### Logistic Regression

In [3]:
# Load the Smarket data set; the first CSV column is used as the row index.
df = pd.read_csv(
    'Smarket.csv',
    parse_dates=True,
    index_col=0,
)
# Preview the first rows as the cell's displayed value.
df.head()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
1,2001,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
2,2001,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
3,2001,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
4,2001,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
5,2001,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


In [4]:
import statsmodels.formula.api as smf

In [5]:
# Model formula passed to smf.glm: Direction is the response, and the five
# lagged returns plus Volume are the predictors.
formula = 'Direction ~ ' + '+'.join(
    ['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']
)

In [6]:
# Fit a binomial GLM (logit link, i.e. logistic regression) of Direction on
# the predictors named in `formula`, using every row of df, then print the
# fitted-model summary table.
model = smf.glm(
    formula=formula,
    data=df,
    family=sm.families.Binomial(),
)
result = model.fit()
print(result.summary())

                          Generalized Linear Model Regression Results                           
Dep. Variable:     ['Direction[Down]', 'Direction[Up]']   No. Observations:                 1250
Model:                                              GLM   Df Residuals:                     1243
Model Family:                                  Binomial   Df Model:                            6
Link Function:                                    logit   Scale:                             1.0
Method:                                            IRLS   Log-Likelihood:                -863.79
Date:                                  Sat, 19 May 2018   Deviance:                       1727.6
Time:                                          11:47:25   Pearson chi2:                 1.25e+03
No. Iterations:                                       4                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------

In [7]:
# Report the pieces of the fitted model individually: the estimated
# coefficients, their p-values, and the names statsmodels gave to the
# response columns it built from Direction.
print("Coefficients")  # label spelling corrected (was "Coefficeients")
print(result.params)
print()
print("p-Values")
print(result.pvalues)
print()
print("Dependent variables")
print(result.model.endog_names)

Coefficeients
Intercept    0.126000
Lag1         0.073074
Lag2         0.042301
Lag3        -0.011085
Lag4        -0.009359
Lag5        -0.010313
Volume      -0.135441
dtype: float64

p-Values
Intercept    0.600700
Lag1         0.145232
Lag2         0.398352
Lag3         0.824334
Lag4         0.851445
Lag5         0.834998
Volume       0.392404
dtype: float64

Dependent variables
['Direction[Down]', 'Direction[Up]']


In [8]:
# Called without new data, predict() returns the fitted probabilities for the
# rows the model was estimated on; show the first ten.
predictions = result.predict()
print(predictions[:10])

[ 0.49291587  0.51853212  0.51886117  0.48477764  0.48921884  0.49304354
  0.50734913  0.49077084  0.48238647  0.51116222]


In [9]:
# Put each row's original Direction label next to the numeric response that
# statsmodels actually fitted, to see which class is encoded as 1.0.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# selecting the column and taking .values yields the same 1-D object array.
print(np.column_stack((df["Direction"].values,
                       result.model.endog)))

[['Up' 0.0]
 ['Up' 0.0]
 ['Down' 1.0]
 ..., 
 ['Up' 0.0]
 ['Down' 1.0]
 ['Down' 1.0]]


In [10]:
# Turn fitted probabilities into class labels with a 0.5 cut-off.
# statsmodels encoded 'Down' as the 1.0 outcome (see result.model.endog), so
# the fitted value is P(Down): below 0.5 means "Up", otherwise "Down".
predictions_nominal = [
    "Up" if prob < 0.5 else "Down"
    for prob in predictions
]

In [11]:
from sklearn.metrics import confusion_matrix, classification_report
# In-sample confusion matrix: rows are the true Direction labels, columns the
# predicted labels.
print(confusion_matrix(y_true=df["Direction"], y_pred=predictions_nominal))

[[145 457]
 [141 507]]


In [12]:
# Per-class precision / recall / F1 for the in-sample predictions, printed
# with three decimal places.
print(
    classification_report(
        y_true=df["Direction"],
        y_pred=predictions_nominal,
        digits=3,
    )
)

             precision    recall  f1-score   support

       Down      0.507     0.241     0.327       602
         Up      0.526     0.782     0.629       648

avg / total      0.517     0.522     0.483      1250



In [13]:
# Split by time: rows up to and including 2004 are used for training, rows
# from 2005 onwards for testing.
# x_train / x_test keep every column of df (Direction included) because they
# are later handed to the formula interface, which picks columns by name.
index_train = df['Year'] <= 2004
x_train = df.loc[index_train]
y_train = df.loc[index_train, 'Direction']

index_test = df['Year'] >= 2005
x_test = df.loc[index_test]
y_test = df.loc[index_test, 'Direction']

In [14]:
# Refit the same binomial GLM, this time on the training rows only.
# Note: this rebinds `model` and `result`, replacing the full-data fit.
model = smf.glm(formula=formula, data=x_train, family=sm.families.Binomial())
result = model.fit()

In [15]:
# Score the held-out rows with the model fitted on the training rows, label
# them with the same 0.5 cut-off used for the in-sample predictions (fitted
# value below 0.5 -> "Up", otherwise "Down"), and report per-class metrics
# against the true test labels.
predictions = result.predict(x_test)
predictions_nominal = [
    "Up" if prob < 0.5 else "Down"
    for prob in predictions
]
print(
    classification_report(
        y_true=y_test,
        y_pred=predictions_nominal,
        digits=3,
    )
)

             precision    recall  f1-score   support

       Down      0.443     0.694     0.540       111
         Up      0.564     0.312     0.402       141

avg / total      0.511     0.480     0.463       252

