## Demo for kNN Classification and PCA on the Wine Dataset


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


## 1. Loading data

In [None]:
# Column headers for the header-less CSV: the class label comes first,
# followed by the thirteen measured attributes.
names = [
    'class',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'diluted wines',
    'Proline',
]
# Remote location of the data; read_csv fetches it directly from the URL.
wine_csv_url = 'https://raw.githubusercontent.com/sekhargullapalli/exploring-data-science/master/data/wine.csv'
df = pd.read_csv(wine_csv_url, names=names, header=None)
df.head()

## 2. Separate data into labels and features & z-score normalization

In [None]:
# Pull the label column out of df (in place) and keep it as a one-column
# DataFrame; what remains in df is the feature matrix only.
dfCat = df.pop('class').to_frame()
# z-score every feature column. The mean and standard deviation are taken
# over all rows of df, i.e. over the complete dataset.
df = df.sub(df.mean()).div(df.std())

## 3. Split into train and test sets and kNN classification with k=5

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df, dfCat, test_size=0.25, random_state=0
)
classifier = KNeighborsClassifier(n_neighbors=5)
# dfCat is a single-column DataFrame, so y_train arrives as a column vector.
# Flatten it to 1-D before fitting: scikit-learn expects a 1-D target here and
# otherwise emits a DataConversionWarning. y_train/y_test themselves are left
# untouched as DataFrames.
classifier.fit(X_train, y_train.values.ravel())
y_pred = classifier.predict(X_test)

## 4. Plotting the confusion matrix

In [None]:
# Use one explicit, sorted label list for BOTH the matrix and the tick labels.
# unique() returns labels in order of first appearance, whereas
# confusion_matrix orders rows/columns by sorted label value; passing the same
# list to both keeps every row and column correctly named, and keeps the
# matrix square even if a class is absent from the test split.
lab = sorted(dfCat["class"].unique().tolist())
cm = confusion_matrix(y_test, y_pred, labels=lab)

# plt.subplots() always starts a fresh figure for the heatmap.
fig, ax = plt.subplots()
# fmt="d" prints the integer counts as-is (the default format would render
# large counts in scientific notation).
sns.heatmap(cm, annot=True, fmt="d", xticklabels=lab, yticklabels=lab, ax=ax)
ax.set_xlabel('Predicted class')
ax.set_ylabel('Actual class')
ax.set_title('Confusion Matrix')

## Section 2: Dimensionality reduction using PCA
=========================================================

## 5. Calculating 3 principal components and building the reduced-order dataframe

In [None]:
# Project the feature frame onto its first three principal components.
pca = PCA(n_components=3)
principal_components = pca.fit_transform(df)

# Wrap the projected array in a DataFrame with one named column per component.
pcDf = pd.DataFrame(principal_components, columns=['PC1', 'PC2', 'PC3'])

# Put the class labels next to the components, aligned on the row index.
loworderDf = pd.concat([pcDf, dfCat], axis=1)
loworderDf.head()

## 6. Plotting the wine class distribution in new space

In [None]:
# 3-D scatter of the three principal components, one colour per wine class.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

lab = dfCat["class"].unique().tolist()
col = ['r', 'g', 'b']

# Draw one scatter series per class so each class gets its own legend entry.
for wine_class, colour in zip(lab, col):
    members = loworderDf['class'] == wine_class
    pc1 = loworderDf.loc[members, 'PC1']
    pc2 = loworderDf.loc[members, 'PC2']
    pc3 = loworderDf.loc[members, 'PC3']
    ax.scatter(pc1, pc2, pc3, c=colour, s=30)

# Legend entries follow the order in which the series were drawn above.
ax.legend(lab)
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
ax.set_title('Wine class distribution in reduced dimensions')
ax.grid()

## 7. Performing kNN classification on reduced-order data and plotting the confusion matrix

In [None]:
# Separate the labels from the principal-component columns (removes 'class'
# from loworderDf in place).
dfCat = loworderDf[['class']]
del loworderDf['class']
# z-score the component columns so each contributes on the same scale to the
# distance computation.
loworderDf = (loworderDf - loworderDf.mean()) / loworderDf.std()

# Same protocol as for the full feature set: 25% hold-out, fixed seed, k=5.
X_train, X_test, y_train, y_test = train_test_split(
    loworderDf, dfCat, test_size=0.25, random_state=0
)
classifier = KNeighborsClassifier(n_neighbors=5)
# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn does
# not emit a DataConversionWarning about a column-vector target.
classifier.fit(X_train, y_train.values.ravel())
y_pred = classifier.predict(X_test)

# One sorted label list drives both the matrix layout and the tick labels, so
# rows/columns are always named correctly (unique() alone returns labels in
# order of appearance, which need not match confusion_matrix's sorted order).
lab = sorted(dfCat["class"].unique().tolist())
cm = confusion_matrix(y_test, y_pred, labels=lab)

# plt.subplots() opens a new figure instead of drawing into whichever figure
# happens to be current (e.g. the 3-D scatter plot).
fig, ax = plt.subplots()
# fmt="d" prints the integer counts without scientific notation.
sns.heatmap(cm, annot=True, fmt="d", xticklabels=lab, yticklabels=lab, ax=ax)
ax.set_xlabel('Predicted class')
ax.set_ylabel('Actual class')
ax.set_title('Confusion Matrix')