## PCA 

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('wine_chemical_compositions.csv')

# Slice the variables: the first 13 columns are the features, column 13 is the target
X = dataset.iloc[:, 0:13].values
y = dataset.iloc[:, 13].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling (fit on the training set only, then reuse the same scaling on the test set)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Exploratory PCA: keep every component to inspect how much variance each one explains.
# Only fit here -- X_train / X_test are deliberately NOT overwritten, so they remain the
# scaled features and any later PCA is fit on them rather than on an already-projected
# matrix. This also keeps the cell idempotent when re-run.
from sklearn.decomposition import PCA
pca = PCA(n_components = None)
pca.fit(X_train)
chem_composition_var = pca.explained_variance_ratio_

In [2]:
# Tabulate the explained-variance ratio of each principal component (one row per component)
ev = pd.DataFrame(data = chem_composition_var)
ev

Unnamed: 0,0
0,0.368841
1,0.193184
2,0.107529
3,0.07422
4,0.062459
5,0.04909
6,0.041173
7,0.02496
8,0.023089
9,0.018641


###### The first principal component explains 36.9% (0.368841) of the variance in the wine's chemical composition, and the top two components together explain 56.20% of the variance.

In [3]:
# Re-apply PCA, this time keeping only the two leading components (n_components = 2 instead of None)
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
# Fit on the training data first, then project the test data with the same fitted components
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)
chem_composition_var = pca.explained_variance_ratio_

In [6]:
# Train a logistic-regression classifier on the training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0).fit(X_train, y_train)

# Class predictions for the held-out test set
y_pred = classifier.predict(X_test)

# Confusion matrix of true labels versus predicted labels
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(cm)

# #remove the warning
# from warnings import simplefilter
# simplefilter(action='ignore', category=FutureWarning)

[[14  0  0]
 [ 1 15  0]
 [ 0  0  6]]


##### ACCURACY (PCA)

In [10]:
# Confusion matrix and overall accuracy of the PCA-based classifier on the test set.
# All metric functions used by the following cells are imported here.
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
cm = confusion_matrix(y_test, y_pred)
accuracy_score(y_test, y_pred)

0.9722222222222222

In [9]:
# Micro-averaged precision: TP / (TP + FP), with counts pooled over all classes
precision_score(y_true = y_test, y_pred = y_pred, average = 'micro')

0.9722222222222222

In [11]:
# Micro-averaged recall: TP / (TP + FN), with counts pooled over all classes
recall_score(y_true = y_test, y_pred = y_pred, average = 'micro')

0.9722222222222222

In [12]:
# Micro-averaged F1 score (harmonic mean of the micro-averaged precision and recall)
f1_score(y_true = y_test, y_pred = y_pred, average = 'micro')

0.9722222222222222

## RFE

In [13]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('wine_chemical_compositions.csv')

# Separate the predictors from the target column
X = dataset.drop(columns = ['Customer_Segment'])
y = dataset['Customer_Segment']


# Splitting the dataset into the Training set and Test set
# (reuse X and y instead of slicing the dataset a second time)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.2,
                                                    random_state = 0)


# Feature Scaling (place all variables on the same scale)
# The scaler returns a bare array, so the original column labels and row index are
# re-attached: the frames then stay aligned with y_train / y_test, and the features
# kept by a later selection step can be identified by name.
from sklearn.preprocessing import StandardScaler
stdScaler = StandardScaler()
X_train = pd.DataFrame(stdScaler.fit_transform(X_train.astype(float)),
                       columns = X_train.columns,
                       index = X_train.index)
X_test = pd.DataFrame(stdScaler.transform(X_test.astype(float)),
                      columns = X_test.columns,
                      index = X_test.index)

In [14]:
# Feature Selection | Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)

# Select Best X Features
# n_features_to_select is passed by keyword: recent scikit-learn releases accept only
# the estimator positionally, so RFE(classifier, 2) raises a TypeError there.
rfe = RFE(estimator = classifier, n_features_to_select = 2)
rfe = rfe.fit(X_train, y_train)

#### CHECK HOW RFE WILL RANK THE FEATURES

In [15]:
# Show which features RFE kept (boolean mask) followed by the rank it assigned to every feature
print(rfe.support_, rfe.ranking_, sep = "\n")

[False False False False False False  True False False  True False False
 False]
[ 3  8  6  7 11 12  1 10  9  1  4  5  2]


In [16]:
# Fit a fresh logistic regression using only the columns RFE kept
from sklearn.linear_model import LogisticRegression
selected_columns = X_train.columns[rfe.support_]
classifier = LogisticRegression()
classifier.fit(X_train[selected_columns], y_train)

# Predict on the test set, restricted to the same selected columns
y_pred = classifier.predict(X_test[selected_columns])

##### ACCURACY (RFE)

In [21]:
# Confusion matrix and overall accuracy of the RFE-based classifier on the test set
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
accuracy_score(y_true = y_test, y_pred = y_pred)

0.8333333333333334

In [22]:
# Micro-averaged precision: TP / (TP + FP), with counts pooled over all classes
precision_score(y_true = y_test, y_pred = y_pred, average = 'micro')

0.8333333333333334

In [23]:
# Micro-averaged recall: TP / (TP + FN), with counts pooled over all classes
recall_score(y_true = y_test, y_pred = y_pred, average = 'micro')

0.8333333333333334

In [24]:
# Micro-averaged F1 score (harmonic mean of the micro-averaged precision and recall)
f1_score(y_true = y_test, y_pred = y_pred, average = 'micro')

0.8333333333333334