# 4. Classification

In [32]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession.builder.appName('classification').getOrCreate()
sc = SparkContext.getOrCreate()

import pyspark.sql.functions as F
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.feature import VectorAssembler, VectorIndexer, StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.mllib.evaluation import MulticlassMetrics

from utils import prepare_data, estimate_correlation_matrix, get_GLM_modelSummary, predict_binary_category_based_on_threshold

import pandas as pd

In [36]:
%load_ext autoreload

## *4.6.1 The Stock Market Data*

In [33]:
# -> Load Smarket dataset:

Smarket = spark.read.csv('data/Smarket.csv',header=True,inferSchema=True)

print('Smarket dataset:'); Smarket.show(5)
print('Data types:'); Smarket.printSchema()

Smarket dataset:
+---+----+------+------+------+------+------+------+------+---------+
|_c0|Year|  Lag1|  Lag2|  Lag3|  Lag4|  Lag5|Volume| Today|Direction|
+---+----+------+------+------+------+------+------+------+---------+
|  1|2001| 0.381|-0.192|-2.624|-1.055|  5.01|1.1913| 0.959|       Up|
|  2|2001| 0.959| 0.381|-0.192|-2.624|-1.055|1.2965| 1.032|       Up|
|  3|2001| 1.032| 0.959| 0.381|-0.192|-2.624|1.4112|-0.623|     Down|
|  4|2001|-0.623| 1.032| 0.959| 0.381|-0.192| 1.276| 0.614|       Up|
|  5|2001| 0.614|-0.623| 1.032| 0.959| 0.381|1.2057| 0.213|       Up|
+---+----+------+------+------+------+------+------+------+---------+
only showing top 5 rows

Data types:
root
 |-- _c0: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Lag1: double (nullable = true)
 |-- Lag2: double (nullable = true)
 |-- Lag3: double (nullable = true)
 |-- Lag4: double (nullable = true)
 |-- Lag5: double (nullable = true)
 |-- Volume: double (nullable = true)
 |-- Today: double (

## *4.6.2 Logistic Regression*

In [34]:
# -> Estimate Pearson's correlation matrix:

cols = ['Year', 'Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume', 'Today']
estimate_correlation_matrix(Smarket, cols, method='pearson', round_decimals=2)

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today
Year,1.0,0.03,0.03,0.03,0.04,0.03,0.54,0.03
Lag1,0.03,1.0,-0.03,-0.01,-0.0,-0.01,0.04,-0.03
Lag2,0.03,-0.03,1.0,-0.03,-0.01,-0.0,-0.04,-0.01
Lag3,0.03,-0.01,-0.03,1.0,-0.02,-0.02,-0.04,-0.0
Lag4,0.04,-0.0,-0.01,-0.02,1.0,-0.03,-0.05,-0.01
Lag5,0.03,-0.01,-0.0,-0.02,-0.03,1.0,-0.02,-0.03
Volume,0.54,0.04,-0.04,-0.04,-0.05,-0.02,1.0,0.01
Today,0.03,-0.03,-0.01,-0.0,-0.01,-0.03,0.01,1.0


## 4.6.2. *Logistic Regression*

In [39]:
# -> Prepare the dataset

data = prepare_data(df = Smarket,
                    labelCol = 'Direction',
                    label_is_categorical = True,
                    categoricalCols = [],
                    continuousCols = ['Year',  'Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume', 'Today']
                   )
print('Data:'); data.show(5)

# -> Define the model:

glr = GeneralizedLinearRegression(family="binomial", 
                                  link="Logit", 
                                  featuresCol='features', 
                                  labelCol='label')

# -> Fit the model:

model = glr.fit(data)

# -> Model's summary:

get_GLM_modelSummary(model)

# -> Predictions

predictions = predict_binary_category_based_on_threshold(data, model, threshold=0.5)
print('Predictions:'); predictions.show(5)

# -> Confusion matrix:

metrics = MulticlassMetrics(predictions.select('label','predicted_category').rdd)

print('Confusion Matrix:')
confusion_matrix = pd.DataFrame(metrics.confusionMatrix().toArray())
print(confusion_matrix)
print('Accuracy = {}'.format(metrics.accuracy))

Data:
+---+----+------+------+------+------+------+------+------+---------+--------------------+-----+
|_c0|Year|  Lag1|  Lag2|  Lag3|  Lag4|  Lag5|Volume| Today|Direction|            features|label|
+---+----+------+------+------+------+------+------+------+---------+--------------------+-----+
|  1|2001| 0.381|-0.192|-2.624|-1.055|  5.01|1.1913| 0.959|       Up|[2001.0,0.381,-0....|  0.0|
|  2|2001| 0.959| 0.381|-0.192|-2.624|-1.055|1.2965| 1.032|       Up|[2001.0,0.959,0.3...|  0.0|
|  3|2001| 1.032| 0.959| 0.381|-0.192|-2.624|1.4112|-0.623|     Down|[2001.0,1.032,0.9...|  1.0|
|  4|2001|-0.623| 1.032| 0.959| 0.381|-0.192| 1.276| 0.614|       Up|[2001.0,-0.623,1....|  0.0|
|  5|2001| 0.614|-0.623| 1.032| 0.959| 0.381|1.2057| 0.213|       Up|[2001.0,0.614,-0....|  0.0|
+---+----+------+------+------+------+------+------+------+---------+--------------------+-----+
only showing top 5 rows

Results
## -------------------------------------------------
##  Estimate | Std.Error | t Values

In [45]:
## -> Example with train and test samples

# -> Split into train and test samples:

train = data.filter(F.col('Year')<2005)
Smarket_2005 = data.filter(F.col('Year')>=2005)

# -> Define the model:

model = GeneralizedLinearRegression(family="binomial", link="Logit", featuresCol='features', labelCol='label')

# alternative way to define a logistic regression model:
# model = LogisticRegression(featuresCol='features', labelCol='label')


# -> Fit the model
model_fit = model.fit(train)

# -> Predictions 

train_predictions = predict_binary_category_based_on_threshold(train, model_fit, threshold=0.5)
print('Train sample predictions:'); train_predictions.show(5)

test_predictions = predict_binary_category_based_on_threshold(Smarket_2005, model_fit, threshold=0.5)
print('\nTrain sample predictions:'); train_predictions.show(5)


# -> Confusion matrices and estimated accuracy values for train and test samples:

print('Train sample confusion matrix:')
metrics = MulticlassMetrics(train_predictions.select('label','predicted_category').rdd)
train_confusion_matrix = pd.DataFrame(metrics.confusionMatrix().toArray())

print(train_confusion_matrix)
print('\nTrain sample accuracy = {:.3f}'.format(metrics.accuracy))

print('\nTrain sample confusion matrix:')
metrics = MulticlassMetrics(test_predictions.select('label','predicted_category').rdd)
test_confusion_matrix = pd.DataFrame(metrics.confusionMatrix().toArray())

print(test_confusion_matrix)
print('\nTest sample accuracy = {:.3f}'.format(metrics.accuracy))

Train sample predictions:
+---+----+------+------+------+------+------+------+------+---------+--------------------+-----+------------------+------------------+
|_c0|Year|  Lag1|  Lag2|  Lag3|  Lag4|  Lag5|Volume| Today|Direction|            features|label|        prediction|predicted_category|
+---+----+------+------+------+------+------+------+------+---------+--------------------+-----+------------------+------------------+
|  1|2001| 0.381|-0.192|-2.624|-1.055|  5.01|1.1913| 0.959|       Up|[2001.0,0.381,-0....|  0.0|           1.0E-16|               0.0|
|  2|2001| 0.959| 0.381|-0.192|-2.624|-1.055|1.2965| 1.032|       Up|[2001.0,0.959,0.3...|  0.0|           1.0E-16|               0.0|
|  3|2001| 1.032| 0.959| 0.381|-0.192|-2.624|1.4112|-0.623|     Down|[2001.0,1.032,0.9...|  1.0|0.9999999999999999|               1.0|
|  4|2001|-0.623| 1.032| 0.959| 0.381|-0.192| 1.276| 0.614|       Up|[2001.0,-0.623,1....|  0.0|           1.0E-16|               0.0|
|  5|2001| 0.614|-0.623| 1.03

## *4.6.3 Linear Discriminant Analysis*

## *4.6.4 Quadratic Discriminant Analysis*

## *4.6.5 K-Nearest Neighbors*