In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [15]:
# Load 'Wine.csv' (path is relative to the notebook's working directory)
# into a DataFrame.
wine = pd.read_csv(filepath_or_buffer='Wine.csv')
# Preview the first five rows as the cell output.
wine.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [16]:
# Positional split of the frame into NumPy arrays:
#   X -> columns at positions 0..10 (the 11 feature columns)
#   y -> column at position 11 (the target)
X, y = wine.iloc[:, :11].to_numpy(), wine.iloc[:, 11].to_numpy()
# Quick visual check of both arrays.
print(X, y)

[[ 7.4    0.7    0.    ...  3.51   0.56   9.4  ]
 [ 7.8    0.88   0.    ...  3.2    0.68   9.8  ]
 [ 7.8    0.76   0.04  ...  3.26   0.65   9.8  ]
 ...
 [ 6.3    0.51   0.13  ...  3.42   0.75  11.   ]
 [ 5.9    0.645  0.12  ...  3.57   0.71  10.2  ]
 [ 6.     0.31   0.47  ...  3.39   0.66  11.   ]] [5 5 5 ... 6 5 6]


###  Creation of train and test sets from the data.

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The seed is fixed so the same
# rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

###  Scaling with Standardization


In [18]:

from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training split only; the fitted scaler is
# then applied unchanged to both splits, so the test split never
# influences the scaling parameters.
sc = StandardScaler().fit(X_train)
# Both transforms are evaluated before either name is rebound.
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

### PCA

In [19]:
from sklearn.decomposition import PCA

# Reduce the 11 standardized feature columns to 2 principal components.
pca = PCA(n_components=2)
# fit_transform: learn the projection from the training split and apply it.
X_train2 = pca.fit_transform(X=X_train)
# transform only: reuse the projection learned above on the test split.
X_test2 = pca.transform(X=X_test)


### Logistic regression without PCA (standardized original features)

In [21]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression on the standardized features with
# no dimensionality reduction. The same constructor arguments
# (random_state=0) are reused for the PCA and LDA models so that all
# classifiers are configured identically.
# NOTE(review): per the scikit-learn docs, random_state is only consulted
# by the 'sag', 'saga' and 'liblinear' solvers; with the default solver it
# presumably has no effect -- confirm for the installed version.
classifier = LogisticRegression(random_state=0)
# Last expression of the cell, so the fitted estimator is displayed.
classifier.fit(X=X_train, y=y_train)

LogisticRegression(random_state=0)

### Logistic regression after PCA transformation


In [22]:
# Identically configured logistic regression, trained on the
# 2-component PCA projection of the training split.
classifier2 = LogisticRegression(random_state=0)
# Last expression of the cell, so the fitted estimator is displayed.
classifier2.fit(X=X_train2, y=y_train)

LogisticRegression(random_state=0)

### Predictions


In [23]:

# Predict the test split with each model, feeding every model the test
# features in the same representation it was trained on.

# Baseline model: standardized features, no PCA.
y_pred = classifier.predict(X=X_test)

# PCA model: 2-component projection of the test split.
y_pred2 = classifier2.predict(X=X_test2)

In [24]:
# Evaluation via confusion matrices.
# In every matrix the rows follow the first argument and the columns the
# second argument of confusion_matrix.

from sklearn.metrics import confusion_matrix

# rows: actual labels, columns: predictions without PCA
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
# rows: actual labels, columns: predictions with PCA
cm2 = confusion_matrix(y_true=y_test, y_pred=y_pred2)
# rows: predictions without PCA, columns: predictions with PCA.
# This matrix only spans the labels that occur in the two prediction
# vectors, so it can be smaller than cm / cm2 (3x3 versus 6x6 in the
# recorded run).
cm3 = confusion_matrix(y_true=y_pred, y_pred=y_pred2)

# Each report is "title, matrix"; the first two are followed by a
# whitespace-only separator line.
print('actual / without PCA', cm, '          ', sep='\n')
print('actual / with PCA', cm2, '          ', sep='\n')
print('without PCA / with PCA', cm3, sep='\n')

actual / without PCA
[[  0   0   2   0   0   0]
 [  0   0   6   4   1   0]
 [  0   0 104  30   1   0]
 [  0   0  37  90  15   0]
 [  0   0   2  16   9   0]
 [  0   0   0   1   2   0]]
          
actual / with PCA
[[ 0  0  0  2  0  0]
 [ 0  0  4  7  0  0]
 [ 0  0 89 45  1  0]
 [ 0  0 55 81  6  0]
 [ 0  0  4 21  2  0]
 [ 0  0  0  2  1  0]]
          
without PCA / with PCA
[[108  43   0]
 [ 40  99   2]
 [  4  16   8]]


# LDA (Linear Discriminant Analysis)

In [25]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Reduce the standardized features to 2 discriminant components.
# LDA is fitted with the class labels (y_train) in addition to the
# features, and only on the training split.
lda = LDA(n_components=2).fit(X_train, y_train)
# Apply the fitted projection to both splits.
X_train_lda, X_test_lda = lda.transform(X_train), lda.transform(X_test)

### Logistic regression after LDA transformation

In [26]:
# Identically configured logistic regression, trained on the
# 2-component LDA projection of the training split.
classifier_lda = LogisticRegression(random_state=0)
# Last expression of the cell, so the fitted estimator is displayed.
classifier_lda.fit(X=X_train_lda, y=y_train)

LogisticRegression(random_state=0)

### Predictions on the LDA-transformed test data

In [27]:
# Predict the test split from its LDA projection.
y_pred_lda = classifier_lda.predict(X=X_test_lda)

### Evaluation with Confusion matrix

In [28]:


# Confusion matrices for the LDA pipeline (confusion_matrix was imported
# in the earlier evaluation cell). Rows follow the first argument,
# columns the second.

# rows: baseline predictions (no reduction), columns: predictions with LDA
cm4 = confusion_matrix(y_true=y_pred, y_pred=y_pred_lda)
# rows: actual labels, columns: baseline predictions (no reduction)
cm5 = confusion_matrix(y_true=y_test, y_pred=y_pred)
# rows: actual labels, columns: predictions with LDA
cm6 = confusion_matrix(y_true=y_test, y_pred=y_pred_lda)

# Each report is "title, matrix"; the first two are followed by a
# whitespace-only separator line.
print('Original & LDA', cm4, '          ', sep='\n')
print('actual / without LDA', cm5, '          ', sep='\n')
print('actual / with LDA', cm6, sep='\n')

Original & LDA
[[149   2   0]
 [ 11 128   2]
 [  0   6  22]]
          
actual / without LDA
[[  0   0   2   0   0   0]
 [  0   0   6   4   1   0]
 [  0   0 104  30   1   0]
 [  0   0  37  90  15   0]
 [  0   0   2  16   9   0]
 [  0   0   0   1   2   0]]
          
actual / with LDA
[[  0   0   2   0   0   0]
 [  0   0   6   5   0   0]
 [  0   0 106  29   0   0]
 [  0   0  45  85  12   0]
 [  0   0   1  16  10   0]
 [  0   0   0   1   2   0]]
