### LDA, QDA, NB and KNN

In [42]:
# This cell appears at the top of every demonstration notebook.
# By default, running a cell displays only the value of its last expression.
# Setting ast_node_interactivity to "all" makes every expression in a cell
# display its output.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

1. Import libraries

In [43]:
import pandas as pd
import statsmodels.api as sm
# api submodule gives access to the most commonly used 
# classes and functions directly.

2. Import Smarket dataset

In [44]:
# Read the Smarket data from Smarket.csv (path is relative to the working directory)
Smarket = pd.read_csv('Smarket.csv')

In [45]:
# Summarise the columns: names, non-null counts and dtypes
Smarket.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250 entries, 0 to 1249
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Year       1250 non-null   int64  
 1   Lag1       1250 non-null   float64
 2   Lag2       1250 non-null   float64
 3   Lag3       1250 non-null   float64
 4   Lag4       1250 non-null   float64
 5   Lag5       1250 non-null   float64
 6   Volume     1250 non-null   float64
 7   Today      1250 non-null   float64
 8   Direction  1250 non-null   object 
dtypes: float64(7), int64(1), object(1)
memory usage: 88.0+ KB


In [46]:
# Preview the first five rows
Smarket.head()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,2001,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
1,2001,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
2,2001,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
3,2001,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
4,2001,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


In [47]:
# Encode Direction as a 0/1 indicator column: 1 for 'Up', 0 otherwise
Smarket['Direction_Up'] = Smarket['Direction'].eq('Up').astype(int)

In [48]:
import statsmodels.api as sm

# Models in statsmodels require design matrices, i.e. the data must be
# laid out as response and predictor matrices before model building.

# The patsy library builds design matrices from a formula and takes care of
# details such as creating dummy variables.
import patsy

# Direction is a string column, so patsy expands the response y into one
# indicator column per level (Direction[Down], Direction[Up]); X receives
# an Intercept column plus the six predictors named in the formula.
y, X = patsy.dmatrices('Direction ~ Lag1 + Lag2 +Lag3 +Lag4 +Lag5 + Volume',
                      data = Smarket,
                      return_type = 'dataframe')

In [49]:
# First rows of the response matrix: one indicator column per Direction level
y.head()

Unnamed: 0,Direction[Down],Direction[Up]
0,0.0,1.0
1,0.0,1.0
2,1.0,0.0
3,0.0,1.0
4,0.0,1.0


In [50]:
# First rows of the predictor matrix, including the Intercept column added by patsy
X.head()

Unnamed: 0,Intercept,Lag1,Lag2,Lag3,Lag4,Lag5,Volume
0,1.0,0.381,-0.192,-2.624,-1.055,5.01,1.1913
1,1.0,0.959,0.381,-0.192,-2.624,-1.055,1.2965
2,1.0,1.032,0.959,0.381,-0.192,-2.624,1.4112
3,1.0,-0.623,1.032,0.959,0.381,-0.192,1.276
4,1.0,0.614,-0.623,1.032,0.959,0.381,1.2057


In [51]:
# Fit the full logistic regression: Direction[Up] on every column of X
# (Intercept, Lag1-Lag5 and Volume)
logit_model_1 = sm.Logit(y['Direction[Up]'], X)
logit_res_1 =  logit_model_1.fit()
logit_res_1.summary()

Optimization terminated successfully.
         Current function value: 0.691034
         Iterations 4


0,1,2,3
Dep. Variable:,Direction[Up],No. Observations:,1250.0
Model:,Logit,Df Residuals:,1243.0
Method:,MLE,Df Model:,6.0
Date:,"Wed, 27 Mar 2024",Pseudo R-squ.:,0.002074
Time:,14:33:18,Log-Likelihood:,-863.79
converged:,True,LL-Null:,-865.59
Covariance Type:,nonrobust,LLR p-value:,0.7319

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.1260,0.241,-0.523,0.601,-0.598,0.346
Lag1,-0.0731,0.050,-1.457,0.145,-0.171,0.025
Lag2,-0.0423,0.050,-0.845,0.398,-0.140,0.056
Lag3,0.0111,0.050,0.222,0.824,-0.087,0.109
Lag4,0.0094,0.050,0.187,0.851,-0.089,0.107
Lag5,0.0103,0.050,0.208,0.835,-0.087,0.107
Volume,0.1354,0.158,0.855,0.392,-0.175,0.446


In [52]:
# Fit a reduced logistic regression: Direction[Up] on Lag1 and Lag2 only
# (plus the Intercept column)
logit_model_2 = sm.Logit(y['Direction[Up]'], X[['Intercept','Lag1','Lag2']])
logit_res_2 =  logit_model_2.fit()
logit_res_2.summary()

Optimization terminated successfully.
         Current function value: 0.691361
         Iterations 4


0,1,2,3
Dep. Variable:,Direction[Up],No. Observations:,1250.0
Model:,Logit,Df Residuals:,1247.0
Method:,MLE,Df Model:,2.0
Date:,"Wed, 27 Mar 2024",Pseudo R-squ.:,0.001601
Time:,14:33:18,Log-Likelihood:,-864.2
converged:,True,LL-Null:,-865.59
Covariance Type:,nonrobust,LLR p-value:,0.2502

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0742,0.057,1.310,0.190,-0.037,0.185
Lag1,-0.0715,0.050,-1.427,0.153,-0.170,0.027
Lag2,-0.0445,0.050,-0.890,0.374,-0.142,0.054


In [53]:
# Predicted values from the Lag1 + Lag2 model, computed for the same rows
# that were used to fit it
predicted_p = logit_res_2.predict(X.loc[:, ['Intercept', 'Lag1', 'Lag2']])

In [54]:
# Turn predictions into class labels: 1 when the predicted value is at least 0.5, else 0
predicted_classes = predicted_p.ge(0.5).astype(int)
predicted_classes.head()

0    1
1    0
2    0
3    1
4    1
dtype: int32

In [55]:
# Score the Lag1 + Lag2 logistic model.
# No train/test split is performed: the predictions are compared against the
# same observations the model was fitted on, so these are in-sample metrics.

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

logit_accuracy = accuracy_score(y_true=y['Direction[Up]'], y_pred=predicted_classes)
logit_conf_matrix = confusion_matrix(y_true=y['Direction[Up]'], y_pred=predicted_classes)
logit_classification_report_str = classification_report(
    y_true=y['Direction[Up]'],
    y_pred=predicted_classes,
)

In [56]:
# Show accuracy, confusion matrix and the per-class report, one per line
print(logit_accuracy, logit_conf_matrix, logit_classification_report_str, sep='\n')

0.528
[[114 488]
 [102 546]]
              precision    recall  f1-score   support

         0.0       0.53      0.19      0.28       602
         1.0       0.53      0.84      0.65       648

    accuracy                           0.53      1250
   macro avg       0.53      0.52      0.46      1250
weighted avg       0.53      0.53      0.47      1250



In [57]:
# statsmodels does not provide these classifiers (LDA, QDA, naive Bayes),
# so we use scikit-learn for them.

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB


# Create one estimator instance per method, bound to short names
lda = LinearDiscriminantAnalysis()
qda = QuadraticDiscriminantAnalysis()
gnb = GaussianNB()

# scikit-learn handles the intercept itself when fitting, so make sure the
# Intercept column of X is not passed as a feature.
# y should also be a one-dimensional pandas Series, so y is rebound here to
# the 0/1 Direction_Up column.
# NOTE: this replaces the two-column patsy DataFrame previously named y; the
# earlier cells that index y['Direction[Up]'] will no longer work if re-run
# after this cell.
y = Smarket['Direction_Up']

In [65]:
# Train LDA, QDA and Gaussian naive Bayes on the same two predictors (Lag1, Lag2)
lda_res = lda.fit(X=X[['Lag1', 'Lag2']], y=y)
qda_res = qda.fit(X=X[['Lag1', 'Lag2']], y=y)
gnb_res = gnb.fit(X=X[['Lag1', 'Lag2']], y=y)

In [66]:
# k-nearest-neighbours classifier
from sklearn.neighbors import KNeighborsClassifier
# Create the classifier with k = 3 neighbours
knn = KNeighborsClassifier(n_neighbors=3)
# Train it on the same two predictors as the other classifiers
knn_res = knn.fit(X=X[['Lag1', 'Lag2']], y=y)

In [67]:
# Predict with each fitted model; predict() returns class labels by default.
# The predictions are made for the same observations used for fitting.
lda_pred = lda_res.predict(X=X[['Lag1', 'Lag2']])
qda_pred = qda_res.predict(X=X[['Lag1', 'Lag2']])
gnb_pred = gnb_res.predict(X=X[['Lag1', 'Lag2']])
knn_pred = knn_res.predict(X=X[['Lag1', 'Lag2']])

In [68]:
# LDA: accuracy and confusion matrix, scored on the data used for fitting
lda_accuracy = accuracy_score(y_true=y, y_pred=lda_pred)
lda_conf_matrix = confusion_matrix(y_true=y, y_pred=lda_pred)
print(lda_accuracy, lda_conf_matrix)

0.528 [[114 488]
 [102 546]]


In [69]:
# QDA: accuracy and confusion matrix, scored on the data used for fitting
qda_accuracy = accuracy_score(y_true=y, y_pred=qda_pred)
qda_conf_matrix = confusion_matrix(y_true=y, y_pred=qda_pred)
print(qda_accuracy, qda_conf_matrix)

0.5304 [[109 493]
 [ 94 554]]


In [70]:
# Gaussian naive Bayes: accuracy and confusion matrix, scored on the data used for fitting
gnb_accuracy = accuracy_score(y_true=y, y_pred=gnb_pred)
gnb_conf_matrix = confusion_matrix(y_true=y, y_pred=gnb_pred)
print(gnb_accuracy, gnb_conf_matrix)

0.528 [[106 496]
 [ 94 554]]


In [71]:
# KNN (k = 3): accuracy and confusion matrix.
# These are in-sample figures: knn_pred was produced for the same rows the
# model was fitted on, so they should not be read as out-of-sample performance.
knn_accuracy = accuracy_score(y_true=y, y_pred=knn_pred)
knn_conf_matrix = confusion_matrix(y_true=y, y_pred=knn_pred)
print(knn_accuracy, knn_conf_matrix)

0.752 [[428 174]
 [136 512]]
