In [17]:
# Import libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [18]:
# Column names in file order: the class label first, then the 13 features.
col_names = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

# Load the data. header=None tells pandas the file has no header row, so the
# names above are assigned to the columns.
# NOTE(review): this is an absolute, machine-specific path - adjust it to where
# wine_info.csv lives before running on another machine.
wine_csv_path = r'C:/Users/User/Desktop/wine_info.csv'
df = pd.read_csv(wine_csv_path, header=None, names=col_names)
df.head()

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [19]:
# Multiclass classification strategies
# Logistic regression is not the only algorithm that was originally designed for binary classification (perceptrons and support vector machines are others), but being limited to two classes is a known weakness of the algorithm.
# While there are ways to extend logistic regression itself to multiclass (more than two classes) problems, this section covers general ways to extend binary classification algorithms to multiclass.
# The two approaches covered here are the following:
# • The One-vs-Rest (One-vs-All) strategy splits a multi-class classification into one binary classification problem per class.
# • The One-vs-One strategy splits a multi-class classification into one binary classification problem per each pair of classes.
# One versus all theoretical example
# For example, consider a multi-class classification problem with examples for each of the classes 'setosa', 'versicolor', and 'virginica'. It can be divided into three binary classification datasets as follows:
# • Binary Classification Problem 1: setosa vs [versicolor, virginica]
# • Binary Classification Problem 2: versicolor vs [setosa, virginica]
# • Binary Classification Problem 3: virginica vs [setosa, versicolor]
# This approach requires that each model predicts a class membership probability or a probability-like score. The argmax of these scores (class index with the largest score) is then used to predict a class.
# This approach is commonly used for algorithms that naturally predict numerical class membership probability or score, such as logistic regression.
# As such, scikit-learn's implementations of these algorithms use the OvR strategy by default when they are applied to multi-class classification. The scikit-learn library also
# provides a separate OneVsRestClassifier class that allows the one-vs-rest strategy to be used with any classifier.

In [20]:
# Show which distinct class labels occur in the data.
class_column = df['Class label']
print('Class labels', np.unique(class_column))

# Count the samples per class (dropna=False would also count missing labels).
# The counts differ between classes, so the classes aren't balanced.
class_column.value_counts(dropna=False)

Class labels [1 2 3]


2    71
1    59
3    48
Name: Class label, dtype: int64

In [21]:
# Features matrix: every column except the class label.
X = df.drop(columns='Class label')

# Target vector: the class label as a NumPy array.
y = df['Class label'].to_numpy()

# Stratification divides a population into homogeneous subgroups (strata)
# before sampling, so that each subgroup is sampled independently. This is
# useful when the subpopulations differ in size, as the classes do here.
#
# Split into training (70%) and test (30%) sets. Passing the label array y to
# `stratify` keeps the class proportions of the original dataset in both the
# training and the test set; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

# Number of training samples per class, as a {label: count} mapping.
unique, counts = np.unique(y_train, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{1: 41, 2: 50, 3: 33}

In [22]:
# Standardize the features so every column has zero mean and unit variance.
scaler = StandardScaler()
# Fit on the training set only, so no test-set statistics leak into the scaler.
scaler.fit(X_train)
# Apply the training-set transform to both the training set and the test set.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# L1-regularized logistic regression fitted one-vs-rest (one binary model per
# class). random_state pins the solver's data shuffling so the fitted
# coefficients are reproducible across runs; 0 matches the seed used for
# train_test_split.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument - confirm against the installed version before upgrading.
log_reg = LogisticRegression(
    penalty='l1',
    C=1.0,
    solver='liblinear',
    multi_class='ovr',
    random_state=0,
)

log_reg.fit(X_train, y_train)
print('Training accuracy:', log_reg.score(X_train, y_train))
print('Test accuracy:', log_reg.score(X_test, y_test))


Training accuracy: 1.0
Test accuracy: 1.0


In [23]:
# Both the training and test accuracies (both 100 percent) indicate that our model does a perfect job on both datasets.
# The intercept terms are available via the `log_reg.intercept_` attribute, which holds three values.
# A notebook cell only displays its last expression, so the intercepts are printed explicitly here.
print('Intercepts:', log_reg.intercept_)

# Since we fit the LogisticRegression object on a multiclass dataset via the OvR approach, the first intercept belongs to the model that fits class 1 versus classes 2 and 3, the second value is the intercept of
# the model that fits class 2 versus classes 1 and 3, and the third value is the intercept of the model that fits class 3 versus classes 1 and 2.
# `coef_` follows the same layout: one row of feature weights per one-vs-rest model.
log_reg.coef_

array([[ 1.24596879,  0.18051402,  0.74618495, -1.16384232,  0.        ,
         0.        ,  1.1610691 ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.55688114,  2.50902341],
       [-1.53736051, -0.38713904, -0.99556154,  0.36528274, -0.05971364,
         0.        ,  0.66852252,  0.        ,  0.        , -1.93442361,
         1.23339835,  0.        , -2.23107213],
       [ 0.13559719,  0.16841227,  0.35726656,  0.        ,  0.        ,
         0.        , -2.43776807,  0.        ,  0.        ,  1.56357362,
        -0.81895198, -0.49261971,  0.        ]])

In [24]:
# How predictions work
# Scikit-learn outputs one probability per class from the predict_proba method, and the class with the highest probability becomes the prediction.
# With three classes predict_proba returns three probabilities per sample; with four classes it would return four.
# In this case we have three classes, so each row of the result holds three probabilities.

# Class probabilities for the first test sample. A notebook cell only displays its last expression, so they are printed explicitly.
first_sample = X_test[0:1]
print('Class probabilities:', log_reg.predict_proba(first_sample))

# predict returns the label of the most probable class directly. For this sample that is the first class (index 0), whose label is 1.
log_reg.predict(first_sample)

array([1], dtype=int64)

In [25]:
# One versus One theoretical example
# This approach was used for support vector machines (SVMs) and related kernel-based algorithms, which aren't covered in this course. This is because the performance of kernel methods does not scale in
# proportion to the size of the training dataset, and using subsets of the training data may counter this effect.
# One-vs-One (OvO for short) is another heuristic method for using binary classification algorithms for multi-class classification.
# Like one-vs-rest, one-vs-one splits a multi-class classification dataset into binary classification problems. Unlike one-vs-rest, which splits it into one binary dataset for each class, the one-vs-one approach splits
# the dataset into one binary dataset for each pair of classes.

# For example, consider a multi-class classification problem with examples for each of the classes 'setosa', 'versicolor', and 'virginica'. It can be divided into three binary classification datasets as follows:
# • Binary Classification Problem 1: setosa vs versicolor
# • Binary Classification Problem 2: versicolor vs virginica
# • Binary Classification Problem 3: setosa vs virginica

# All of this may not seem very different from One-vs-Rest (One-vs-All), but this method has O(n_classes^2) complexity, which means that it is slower to perform. When you have n_classes classes, you will need to fit
# n_classes * (n_classes - 1) / 2 classifiers.

# Here is an example to show how out of hand this process can grow. Imagine you have 10 classes which are the digits 0-9. This means we will have to train 45 separate classifiers. Training one model can
# take time (depending on the model and how much data you have), but training 45 separate ones is time consuming.
# (NumClasses * (NumClasses - 1)) / 2
# (10 * (10 - 1)) / 2
# (10 * 9) / 2
# 90 / 2
# 45