# Load the required libraries including seaborn and PCA

In [None]:
import pandas as pd
from sklearn import preprocessing
from matplotlib import pyplot as plt
from sklearn import covariance
import seaborn as sns
import numpy as np
from numpy import linalg as LA
from sklearn.decomposition import PCA

# Get the data from Wine.csv file in the input folder

In [None]:
# Read the wine dataset into a DataFrame; the path is relative to the notebook's working directory
Input_Data = pd.read_csv("../input/wine-customer-segmentation/Wine.csv")

# Do a basic analysis of data

In [None]:
# Summary statistics of the loaded data, shown as the cell output
Input_Data.describe()

In [None]:
# Preview the first rows to check the column names and raw values
Input_Data.head()

# Normalize all the features

In [None]:
# Scale every column on its own with scikit-learn's 'max' norm, collecting one
# normalized row per column, then transpose so that rows are samples and
# columns are features again.
DataColumnNames = Input_Data.columns
Input_Data_List_Temp = [
    preprocessing.normalize([Input_Data[ColumnName].to_list()], norm='max')[0]
    for ColumnName in DataColumnNames
]
Input_Data_List = np.array(Input_Data_List_Temp).T

# Plot the correlation matrix of the normalized data

In [None]:
# Heatmap of the pairwise correlation coefficients between the normalized
# features. The values come from DataFrame.corr(), so the figure is titled as
# a correlation matrix (the previous title called it a covariance matrix).
plt.figure(figsize=(10,10))
sns.set(font_scale=1.5)
# The normalized array has lost its column labels, so the original column
# names are supplied explicitly as tick labels on both axes.
hm = sns.heatmap(pd.DataFrame(Input_Data_List).corr(),annot=True,annot_kws={"size":8},xticklabels=DataColumnNames, yticklabels=DataColumnNames)
plt.title('Correlation matrix of the normalized features')
plt.tight_layout()
plt.show()

# Let us take just the two features from this dataset that show the highest correlation (Flavanoids and Total_Phenols) - 0.86 correlation

In [None]:
# Pull the two chosen columns out as plain Python lists.
Flavanoids = Input_Data['Flavanoids'].tolist()
Total_Phenols = Input_Data['Total_Phenols'].tolist()
# Max-normalize both features in a single call; normalize() works row by row,
# so each feature is still scaled independently of the other.
Flavanoids_Normalized, Total_Phenols_Normalized = preprocessing.normalize(
    [Flavanoids, Total_Phenols], norm='max'
)
# Means of the normalized features, used later to centre the data.
FlavanoidsMean = np.mean(Flavanoids_Normalized)
Total_PhenolsMean = np.mean(Total_Phenols_Normalized)

# Plot the two selected features in a plane. We can easily observe that they are positively related

In [None]:
# Scatter of the two normalized features: Flavanoids on x, Total_Phenols on y
plt.scatter(Flavanoids_Normalized,Total_Phenols_Normalized,marker='x')

# Append both normalized features to form a single matrix

In [None]:
# Feature-major layout: a list holding one row per feature.
SelectedFeaturesTransposed = [Flavanoids_Normalized, Total_Phenols_Normalized]
# Sample-major layout: one row per data point, one column per feature.
SelectedFeatures = np.array(SelectedFeaturesTransposed).T

# Create a matrix by subtracting the means of features from each data point. By doing this we bring the means of the features to the origin. And all data points shift towards the origin

## We do this to make the computation of the covariance matrix easier

In [None]:
# Centre each feature by removing its own mean, so both features end up with
# mean zero.
FlavanoidsSubtracted = SelectedFeaturesTransposed[0] - FlavanoidsMean
Total_PhenolsSubtracted = SelectedFeaturesTransposed[1] - Total_PhenolsMean

# The plot below shows that the means of the features shifted to origin

In [None]:
# Same scatter as before, but using the mean-centred features
plt.scatter(FlavanoidsSubtracted,Total_PhenolsSubtracted,marker='x')

# Combine the shifted features to a single matrix

In [None]:
# Stack the centred features as rows: the result has one row per feature and
# one column per sample.
SelectedFeaturesSubtracted = np.vstack((FlavanoidsSubtracted, Total_PhenolsSubtracted))

# Compute the covariance matrix

In [None]:
# Covariance of the mean-centred features: (X . X^T) / N, where X holds one
# row per feature and one column per sample. N is read from the data instead
# of being hard-coded, so the result stays correct if the number of rows in
# the dataset changes. The divisor is N (not N - 1), as in the original cell.
S = SelectedFeaturesSubtracted.dot(SelectedFeaturesSubtracted.T) / SelectedFeaturesSubtracted.shape[1]

In [None]:
# Display the 2x2 covariance matrix of the two centred features
S

# Find the Eigen value and Eigen vector of the covariance matrix

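### The Eigen vector with the highest Eigen value becomes PC1. `LA.eig` does not guarantee any ordering, so check the printed Eigen values rather than assuming the first one is the largest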

In [None]:
# Eigen-decomposition of the covariance matrix S. Column i of EigenVector is
# the eigenvector that belongs to EigenValue[i].
EigenValue,EigenVector = LA.eig(S)
# numpy.linalg.eig makes no promise about the order of the eigenvalues, so
# sort them in descending order explicitly. After this, column 0 is always the
# direction of largest variance (PC1) and column 1 is PC2.
DescendingOrder = np.argsort(EigenValue)[::-1]
EigenValue = EigenValue[DescendingOrder]
EigenVector = EigenVector[:, DescendingOrder]
print(EigenVector)
print(EigenValue)

# Compute projections of datapoints on PC1 and print the variance of projections

In [None]:
# Column 0 of EigenVector is the first principal axis. Project every sample of
# the normalized (un-centred) features onto it and print the variance of the
# projections; the missing centring only shifts the values, not their variance.
FirstAxis = EigenVector[:, 0]
PC1 = np.dot(FirstAxis, SelectedFeaturesTransposed)
print(PC1.var())

# Compute projections of datapoints on PC2 and print the variance of projections

In [None]:
# Column 1 of EigenVector is the second principal axis. Project every sample of
# the normalized (un-centred) features onto it and print the variance of the
# projections.
SecondAxis = EigenVector[:, 1]
PC2 = np.dot(SecondAxis, SelectedFeaturesTransposed)
print(PC2.var())

# Plot Principal components in the actual feature space

In [None]:
# Draw the centred data and overlay both eigenvectors as arrows from the origin.
plt.scatter(FlavanoidsSubtracted,Total_PhenolsSubtracted,marker='x')
# The eigenvectors are the columns of EigenVector, so row 0 holds the
# x-components and row 1 the y-components of the two arrows.
plt.quiver([0, 0], [0, 0], EigenVector[0], EigenVector[1], scale=3, color=['r','g'], label=['PC1','PC2'])
# NOTE(review): the label positions are hard-coded; confirm they still sit next
# to the matching arrows if the eigenvector signs or order change.
plt.text(0.2,0.3,'PC1')
plt.text(-0.2,0.3,'PC2')

# Plot the data point on the principal components PC1 and PC2

In [None]:
# Scatter of the samples in the principal-component coordinates computed above
plt.scatter(PC1,PC2,marker='x')
plt.title("PCA Plot")
plt.xlabel("PC1")
plt.ylabel("PC2")

# SciKit provides PCA class. This is how to compute PCA with SciKit

In [None]:
# Provide the number of principal components required - 2 for plotting points on a plane
# This only configures the estimator; nothing is computed until fit() is called
pca = PCA(n_components=2)

# Fit the data and get the principal components

In [None]:
# Fit on the sample-major matrix (one row per data point, one column per feature)
pca.fit(SelectedFeatures)
# print the Principal component vectors - Eigen vectors
# Each row of components_ is indexed below as one principal axis
print(pca.components_)
# Print the covariance matrix
# NOTE(review): scikit-learn's estimate may differ slightly from S computed
# earlier (presumably an N - 1 divisor instead of N) - confirm against the docs
print(pca.get_covariance())

# Compute the projection of data points on the principal components

In [None]:
# Project the normalized (un-centred) features onto the two axes found by
# scikit-learn, then print the variance of the projections along each axis in
# order (PC1 first, PC2 second).
PC1 = np.dot(pca.components_[0], SelectedFeaturesTransposed)
PC2 = np.dot(pca.components_[1], SelectedFeaturesTransposed)
for Projection in (PC1, PC2):
    print(Projection.var())

# Plot the PCA - We get same plot as above

In [None]:
# Scatter of the samples in the coordinates given by the scikit-learn components
plt.scatter(PC1,PC2,marker='x')
plt.title("PCA Plot")
plt.xlabel("PC1")
plt.ylabel("PC2")