# 4.7 Lab: Classification Methods

## 4.7.1 The Stock Market Data

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import math
from patsy import dmatrices
import statsmodels.discrete.discrete_model as sm
import statsmodels.formula.api as smf
import statsmodels.api as sma
from statsmodels.graphics.regressionplots import *
from sklearn import datasets, linear_model
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn import preprocessing


In [None]:
# Load the stock-market data from the local data directory; header=0 takes the column names from the first row.
Smarket = pd.read_csv('data/Smarket.csv', header=0)

In [None]:
# Preview the first rows to check that the file was parsed as expected.
Smarket.head()

In [None]:
# List the column names available for modelling.
Smarket.columns

In [None]:
# (number of rows, number of columns) of the data set.
Smarket.shape

In [None]:
# DataFrame.corr computes the pairwise correlation between numerical variables.
# The frame also holds the qualitative Direction column, and recent pandas releases
# raise an error instead of silently skipping non-numeric columns, so restrict the
# frame to its numeric columns before computing the correlation matrix.
Smarket.select_dtypes(include=[np.number]).corr()
# as one would expect, the correlations between the lag variables and today's returns are close to zero.


In [None]:
# Take a look at the Volume column, plotted against row order.
# The column is selected by name rather than by position so the plot stays correct
# even if the column layout of the CSV changes (e.g. an extra index column).
plt.plot(Smarket['Volume'])
plt.xlabel('Index')
plt.ylabel('Volume')
plt.show()

## 4.7.2 Logistic Regression
There are some known complications in scikit-learn around parameter regularization: its logistic regression applies a penalty by default. This can be avoided by setting the tuning parameter 'C' to a large number. Here, to be consistent with the R output, I decided to use Statsmodels.

In [None]:
# Build the response and design matrices from a formula: Direction is modelled
# on the five lagged returns plus trading volume.
full_formula = 'Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume'
y, X = dmatrices(full_formula, data=Smarket, return_type='dataframe')
print(y)

In [None]:
# The response matrix has one indicator column per Direction level; the second
# column is used as the response so the model predicts an upward move.
up_indicator = y.iloc[:, 1]
logit = sm.Logit(up_indicator, X)
logit.fit().summary()

In [None]:
# Pull the estimated coefficients straight from the fitted results object.
fitted_full = logit.fit()
fitted_full.params

In [None]:
# In-sample predicted probabilities of the market going up; show the first 10 instances.
train_probabilities = logit.fit().predict()
train_probabilities[:10]

In [None]:
# To predict whether the market goes up or down on a particular day, the predicted
# probabilities are converted into class labels: Up (1) or Down (0).
# A day is labelled Up when its probability exceeds the predefined threshold.
threshold = 0.5
# Fit once and reuse the in-sample probabilities.
train_probs = logit.fit().predict()
# The label frame is sized from the predictions themselves instead of a hard-coded
# row count, so this cell keeps working if the data set changes length.
predict_label = pd.DataFrame(
    {'label': np.asarray(train_probs > threshold, dtype=float)}
)

In [None]:
# Evaluate the TRAINING result with a confusion matrix (rows: actual, columns: predicted).
actual_up = y.iloc[:, 1]
predicted_up = predict_label.iloc[:, 0]
confusion_matrix(actual_up, predicted_up)

In [None]:
# Diagonal entries of the confusion matrix count correct predictions; off-diagonal
# entries count mistakes. The original notes report that logistic regression
# predicted the market movement correctly 52.2% of the time on the training data.
actual_up = y.iloc[:, 1]
predicted_up = predict_label.iloc[:, 0]
print((actual_up == predicted_up).mean())
# The same accuracy derived from the confusion matrix: correct count / total count.
train_cm = confusion_matrix(actual_up, predicted_up)
print(train_cm.diagonal().sum() * 1.0 / train_cm.sum())

### Train-Validation Split

In [None]:
# To assess the logistic regression model more realistically, fit it on part of the
# data and check how well it predicts the held-out part. What matters in practice is
# performance on future days whose market movements are unknown, not on the days
# used for fitting. Rows from 2005 onwards are held out; earlier rows are for training.
is_holdout = Smarket['Year'] >= 2005
Smarket_2005 = Smarket[is_holdout]
Smarket_train = Smarket[Smarket['Year'] < 2005]

In [None]:
# Build response/design matrices for the training rows (used to fit the model)
# and for the held-out rows (used only for evaluation) from the same formula.
full_formula = 'Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume'
y_train, X_train = dmatrices(full_formula, data=Smarket_train, return_type='dataframe')
y_test, X_test = dmatrices(full_formula, data=Smarket_2005, return_type='dataframe')

In [None]:
# Fit the logistic regression on the training rows only and print its summary.
train_up = y_train.iloc[:, 1]
logit = sm.Logit(train_up, X_train)
train_fit = logit.fit()
print(train_fit.summary())

In [None]:
# Predict the probability of an upward move for the held-out rows and turn the
# probabilities into 0/1 labels with the chosen threshold.
threshold = 0.5
preds = logit.fit().predict(X_test)
# Reset the index so the boolean mask lines up with the fresh 0..n-1 index of the label frame.
mark = (preds > threshold).reset_index(drop=True)
predict_label = pd.DataFrame({'label': np.zeros(X_test.shape[0])})
predict_label.loc[mark] = 1
confusion_matrix(y_test.iloc[:, 1], predict_label.iloc[:, 0])

In [None]:
# Accuracy on the held-out rows: the share of days whose predicted label matches
# the actual direction. Both series are re-indexed so they compare position by position.
actual_holdout = y_test.iloc[:, 1].reset_index(drop=True)
predicted_holdout = predict_label.iloc[:, 0].reset_index(drop=True)
(actual_holdout == predicted_holdout).mean()

# note: the model was trained and tested on two completely separate data sets:
# training used only the dates before 2005 and testing used only the dates in 2005.
# The predictions for 2005 are compared with the actual market movements over that
# period. The original notes report a disappointing result: a test error rate of
# 1 - 48% = 52%, which is worse than random guessing on balanced data. This is not
# all that surprising, since one would not generally expect previous days' returns
# to predict future market performance.

In [None]:
# Refit the model with only Lag1 and Lag2 as predictors; the steps mirror the
# full-model workflow (design matrices -> fit -> predict -> threshold -> evaluate).
lag_formula = 'Direction~Lag1+Lag2'
y_train, X_train = dmatrices(lag_formula, Smarket_train, return_type='dataframe')
y_test, X_test = dmatrices(lag_formula, Smarket_2005, return_type='dataframe')
logit = sm.Logit(y_train.iloc[:, 1], X_train)
preds = logit.fit().predict(X_test)

threshold = 0.5
predict_label = pd.DataFrame(np.zeros(shape=(X_test.shape[0], 1)), columns=['label'])
# FIX: the threshold was never applied here, so every day stayed labelled Down (0)
# and the metrics below described an all-Down prediction rather than the model.
# The mask index is reset so it lines up with the 0..n-1 index of the label frame.
predict_label.loc[(preds > threshold).reset_index(drop=True)] = 1

# Print the confusion matrix explicitly: only the last expression of a cell is displayed.
print(confusion_matrix(y_test.iloc[:, 1], predict_label.iloc[:, 0]))
# accuracy on the validation set
np.mean(y_test.iloc[:, 1].reset_index(drop=True) == predict_label.iloc[:, 0].reset_index(drop=True))

In [None]:
# Another knob for logistic regression is the decision threshold itself:
# instead of 0.5, label a day as Up once its predicted probability exceeds 0.45.
threshold = 0.45
preds = logit.fit().predict(X_test)
predict_label = pd.DataFrame(np.zeros(shape=(X_test.shape[0], 1)), columns=['label'])
# The mask index is reset so it lines up with the 0..n-1 index of the label frame.
predict_label.loc[(preds > threshold).reset_index(drop=True)] = 1

# Print the confusion matrix explicitly; as a bare mid-cell expression it was
# computed and then discarded without being shown.
print(confusion_matrix(y_test.iloc[:, 1], predict_label.iloc[:, 0]))

# Accuracy on the validation set. The original notes report an improvement of the
# accuracy from 0.48 to 0.56 here -- re-run to confirm the figures on your data.
np.mean(y_test.iloc[:, 1].reset_index(drop=True) == predict_label.iloc[:, 0].reset_index(drop=True))

## 4.7.3 Linear Discriminant Analysis

In [None]:
# we will use sklearn's implementation of LDA
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [None]:
# Check which distinct values the response column takes before fitting the classifiers.
y_train.iloc[:,1].unique()

In [None]:
# Training step. Columns 1 and 2 of the design matrix are the predictors; column 0
# is skipped (presumably the intercept added by the formula interface).
train_features = X_train.iloc[:, 1:3]
train_target = y_train.iloc[:, 1]

sklearn_lda = LDA(n_components=1)  # LDA estimator keeping one discriminant component
lda = sklearn_lda.fit(train_features, train_target)  # fit() returns the fitted estimator
X_lda = lda.transform(train_features)  # training data projected onto the discriminant axis
X_labels = lda.predict(train_features)  # predicted class for each training sample
X_prob = lda.predict_proba(train_features)  # per-class probabilities for each training sample

In [None]:
# Testing step: apply the fitted LDA model to the held-out predictors.
test_features = X_test.iloc[:, 1:3]
X_test_labels = lda.predict(test_features)
X_test_prob = lda.predict_proba(test_features)
# Show the class probabilities for the first five held-out rows.
print(X_test_prob[:5, :])

In [None]:
# Test-set accuracy with the classifier's default decision rule.
lda_hits = y_test.iloc[:, 1] == X_test_labels
lda_hits.mean()

In [None]:
# Try adjusting the threshold to see whether accuracy improves.
# The 2nd column of X_test_prob is the probability of belonging to the Up group.
# Start from 0.5 (the default) as a sanity check.
threshold = 0.5
predicted_up = X_test_prob[:, 1] >= threshold
(y_test.iloc[:, 1] == predicted_up).mean()

In [None]:
# Same comparison with a slightly lower cut-off for labelling a day as Up.
threshold = 0.48
predicted_up = X_test_prob[:, 1] >= threshold
(y_test.iloc[:, 1] == predicted_up).mean()

## 4.7.4 Quadratic Discriminant Analysis

In [None]:
# it is a little bit of annoying that QDA and LDA have minor difference in their parameter 
# set-up and function names. 
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

In [None]:
# Fit QDA on the two predictor columns; store_covariance=True keeps the per-class
# covariance estimates on the fitted estimator so they can be inspected later.
train_features = X_train.iloc[:, 1:3]
test_features = X_test.iloc[:, 1:3]

sklearn_qda = QDA(priors=None, store_covariance=True)
qda = sklearn_qda.fit(train_features, y_train.iloc[:, 1])  # fit() returns the fitted estimator
X_labels = qda.predict(train_features)  # predicted class for each training sample
X_prob = qda.predict_proba(train_features)  # per-class probabilities for each training sample

# Score the held-out rows and report the share of correct predictions.
X_test_labels = qda.predict(test_features)
X_test_prob = qda.predict_proba(test_features)

print((y_test.iloc[:, 1] == X_test_labels).mean())

In [None]:
# dir() lists every attribute and method on the fitted estimator; use it to
# explore what information the lda and qda objects store.
dir(qda)

In [None]:
# Per-class means of the predictors estimated by QDA.
print(qda.means_)
# Per-class covariance estimates; kept on the estimator because it was created with store_covariance=True.
print(qda.covariance_)

## 4.7.5 Naive Bayes

In [None]:
# from sklearn.naive_bayes import GaussianNB as NB

In [None]:
# Gaussian naive Bayes on the same two predictor columns used for LDA and QDA.
train_features = X_train.iloc[:, 1:3]
test_features = X_test.iloc[:, 1:3]

NB_class = NB()
NB_class.fit(train_features, y_train.iloc[:, 1])
X_test_labels = NB_class.predict(test_features)
X_test_prob = NB_class.predict_proba(test_features)
# Share of held-out days classified correctly.
print((y_test.iloc[:, 1] == X_test_labels).mean())

dir(NB_class)  # list what the naive Bayes classifier object exposes

## 4.7.6 K-Nearest Neighbors

In [None]:
# from sklearn.neighbors import KNeighborsClassifier as KNN

In [None]:
# K-nearest neighbours on the two predictor columns; n_neighbors is the knob to tune.
train_features = X_train.iloc[:, 1:3]
test_features = X_test.iloc[:, 1:3]

neigh = KNN(n_neighbors=4)
KNN_fit = neigh.fit(train_features, y_train.iloc[:, 1])  # fit() returns the fitted estimator
X_test_labels = KNN_fit.predict(test_features)
X_test_prob = KNN_fit.predict_proba(test_features)
# Share of held-out days classified correctly.
print((y_test.iloc[:, 1] == X_test_labels).mean())

dir(neigh)  # list what the KNN classifier object exposes

## 4.7.7 Poisson Regression

In [None]:
# Load the bike-sharing data from the local data directory; header=0 takes the column names from the first row.
Bikeshare = pd.read_csv('data/Bikeshare.csv', header=0)

In [None]:
# Preview the first rows and report (rows, columns) to confirm the file was parsed as expected.
print(Bikeshare.head())
print(Bikeshare.shape)

In [None]:
# First build a linear regression model: bikers is regressed on month, hour,
# working-day flag, temperature and weather situation via the formula interface.
# NOTE(review): the formula interface dummy-codes only columns it sees as categorical
# (e.g. strings). If `hr` is stored as integers in the CSV it enters as a single numeric
# term; wrap it as C(hr) if per-hour effects are wanted -- TODO confirm against the data.
lm_bikeshare = smf.ols('bikers ~ mnth + hr + workingday + temp + weathersit', data = Bikeshare).fit()

In [None]:
# Check the summary of the model; the coefficients may differ from the R output.
# The difference comes from how the baseline level is chosen for the categorical variables.
# Here Python used April as the baseline month -- probably because the levels are
# ordered alphabetically by month name.
lm_bikeshare.summary()

In [None]:
# With the model fitted there is plenty more to explore (plots, other statistics, ...).
# Here: the training RMSE, i.e. the square root of the mean squared difference
# between fitted and observed rider counts.
ols_errors = lm_bikeshare.fittedvalues - Bikeshare.bikers
ols_sse = (ols_errors ** 2).sum()
np.sqrt(ols_sse / len(Bikeshare.bikers))

In [None]:
# Now build a Poisson regression model on the same formula.
# Instead of .ols() we use .glm() and select the Poisson family.
glm_bikeshare = smf.glm('bikers ~ mnth + hr + workingday + temp + weathersit', data = Bikeshare, family=sma.families.Poisson()).fit()

In [None]:
# Coefficient table and fit statistics for the Poisson model.
glm_bikeshare.summary()

In [None]:
# Another quick look at the training RMSE, this time for the Poisson model.
# To judge which model is really better one would split into training/validation
# sets and compare performance on the validation set.
glm_errors = glm_bikeshare.fittedvalues - Bikeshare.bikers
glm_sse = (glm_errors ** 2).sum()
np.sqrt(glm_sse / len(Bikeshare.bikers))

## 4.7.8 An Application to Caravan Insurance Data 
This section was removed from the 2nd edition of the book, but it is kept here as a reference.

In [None]:
# Load the Caravan insurance data from the local data directory; header=0 takes the column names from the first row.
Caravan = pd.read_csv('data/Caravan.csv', header=0)

In [None]:
# (number of rows, number of columns) of the data set.
Caravan.shape

In [None]:
# Preview the first rows to check that the file was parsed as expected.
Caravan.head()

In [None]:
# Summary statistics per column; useful for spotting variables on very different scales.
Caravan.describe()

In [None]:
"""
Scale of the variables matters in KNN ! The core question in KNN is how to define proper distance metric. 
Because the KNN classifier predicts the class of a given test observation by identifying the observations 
that are nearest to it, the scale of the variables matters. Any variables that are on a large scale will 
have a much larger effect on the distance between the observations, and hence on the KNN classifier, 
than variables that are on a small scale. For instance, imagine a data set that contains two variables, 
salary and age (measured in dollars and years, respectively). As far as KNN is concerned, 
a difference of 1,000 in salary is enormous compared to a difference of 50 years in age. 
Consequently, salary will drive the KNN classification results, and age will have almost no effect. 
This is contrary to our intuition that a salary difference of 1, 000 is quite small compared to an age difference of 50 years. 
Furthermore, the importance of scale to the KNN classifier leads to another issue: if we measured salary in Japanese yen, 
or if we measured age in minutes, then we’d get quite different classification results from what we get 
if these two variables are measured in dollars and years. 

A good (debatable) way to handle this problem is to standardize the data so that all standardized 
variables are given a mean of zero and a standard deviation of one. Then all variables will be on a comparable scale.
The scale() function does just this. In standardizing the data, we exclude column 86, 
because that is the qualitative Purchase variable.
"""

In [None]:
# Encode the qualitative Purchase column as a numeric label (1.0 for 'Yes', 0.0 otherwise)
# and keep the remaining columns as the predictor table.
purchased = Caravan['Purchase'] == 'Yes'
predict_label = pd.DataFrame({'label': purchased.astype(float)})
Caravan_drop = Caravan.drop(columns='Purchase')

In [None]:
"""
I took a slightly different approach from the book. 
The training and testing data were split by index. 
The normalization was fitted on the training set. 
Afterwards, the same normalization was applied to the validation set. 
The code might seem wordy, but it helps make the logical flow clear. 
"""

In [None]:
# A slightly different approach from the book: the data are split by position.
# NOTE: despite its name, `train_size` is the number of leading rows held out for
# VALIDATION; every row after that is used for training.
train_size = 1000
train_index = range(0, train_size)  # kept for any cell that still refers to it
X_validate = Caravan_drop.iloc[:train_size]
Y_validate = predict_label.iloc[:train_size]
X_train = Caravan_drop.iloc[train_size:]
Y_train = predict_label.iloc[train_size:]

# Learn the standardization (per-column mean and standard deviation) from the
# training rows only, then apply that one fitted scaler to both sets. Using a single
# scaler guarantees training and validation data are transformed with identical
# parameters and avoids computing the statistics twice.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_validate_scaled = scaler.transform(X_validate)

In [None]:
# Fit KNN with a single neighbour on the standardized training data and evaluate
# it on the standardized validation rows.
n_neighbors = 1
neigh = KNN(n_neighbors=n_neighbors)  # n_neighbors is the knob to tune
KNN_fit = neigh.fit(X_train_scaled, Y_train.iloc[:, 0])  # fit() returns the fitted estimator
X_validate_labels = KNN_fit.predict(X_validate_scaled)
X_validate_prob = KNN_fit.predict_proba(X_validate_scaled)

validate_truth = Y_validate.iloc[:, 0]
print((validate_truth == X_validate_labels).mean())  # validation accuracy
print(confusion_matrix(validate_truth, X_validate_labels))

# The rest of this exercise is about the trade-off between false positives and false negatives.
# Accuracy is NOT always the gold-standard metric for classification problems:
# precision and recall, sensitivity and specificity, F1 score... are all reasonable metrics to consider.
# The concepts of training, validation and test sets are discussed in more detail later.

In [None]:
# End of Chapter 4