# Kmeans with PCA 

# Importing all the libraries

In [None]:
# Step1 - Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Loading the food_agro file and handling the null values in it

In [None]:
# Load the dataset. thousands=',' makes pandas parse values written like
# "1,234" as numbers, and index_col=0 uses the first CSV column as row labels.
food_agro = pd.read_csv("Project_Data_1.csv",thousands=',',index_col=0)

# First line: True if any cell is missing. Second: missing count per column.
print(food_agro.isnull().values.any())
print(food_agro.isnull().sum())

# dropna() returns a new DataFrame and leaves the original untouched, so the
# result has to be assigned back for the null handling to take effect.
# NOTE(review): the notebook author reported no nulls in this file, in which
# case this is a no-op; it guards the later PCA step if the data ever changes.
food_agro = food_agro.dropna()
print(food_agro)

# Applying the principal component analysis on the file

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA with no component limit so that the variance contribution of
# every component can be inspected before choosing how many to keep.
# fit() returns the fitted estimator, so construction and fitting are chained.
# NOTE(review): the features are passed in unscaled — confirm they share a
# comparable scale, otherwise large-valued columns will dominate.
pca = PCA().fit(food_agro)


In [None]:
# Share of the total variance captured by each component, to two decimals.
print(np.round(pca.explained_variance_ratio_, 2))

# Plotting the cumulative explained variance in a graph in order to identify the ideal number of principal components. As per the graph, the cumulative explained variance exceeds 95% once 3 components are included.

In [None]:
# Running total of the explained-variance share, one value per component.
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# Derive the x positions from the fitted model instead of hard-coding
# range(1,19): matplotlib raises when x and y differ in length, so the fixed
# range only worked when PCA produced exactly 18 components.
component_counts = range(1, len(cumulative_variance) + 1)

plt.figure(figsize=(10,8))
plt.plot(component_counts,cumulative_variance,marker='o',linestyle="--")
plt.title("Cumulative explained variance")
plt.xlabel("Number of components")
plt.ylabel("Cumulative explained variance ratio")

# Fitting 3 as the number of components for PCA

In [None]:
# Refit the PCA keeping only the first 3 components; fit() returns the
# fitted estimator, so it can be bound to `pca` in a single statement.
pca = PCA(n_components=3).fit(food_agro)

# Applying the PCA on the data, and adopting the elbow method to identify the optimal number of clusters for the data

In [None]:
# Project every row of food_agro onto the components of the fitted PCA;
# the clustering steps below operate on these scores.
scores_pca=pca.transform(food_agro)

In [None]:
from sklearn.cluster import KMeans

# Candidate cluster counts, defined once so the fitting loop and the x-axis
# of the plot cannot drift out of sync.
cluster_counts = range(1,10)

# Within-cluster sum of squared distances (inertia) for each candidate K.
wcd = []
for i in cluster_counts:
    # A fixed random_state makes the centroid initialisation, and therefore
    # the elbow curve, reproducible from one run to the next.
    kmeans_pca = KMeans(n_clusters=i,init='k-means++',random_state=42)
    kmeans_pca.fit(scores_pca)
    wcd.append(kmeans_pca.inertia_)

plt.plot(cluster_counts,wcd)
plt.title("Elbow curve")
plt.xlabel("No of Cluster = K")
plt.ylabel("Within Cluster distance")

# Applying the K-Means algorithm on the PCA applied data with 3 as the number of clusters

In [None]:
# Final model: 3 clusters on the PCA scores. random_state is fixed so the
# cluster numbering and assignments are the same on every run.
kmeans_pca = KMeans(n_clusters=3,init='k-means++',random_state=42)

# fit() returns the estimator itself, so assigning its result to y_kmeans
# stored (and printed) the model rather than the cluster assignments.
# fit_predict() fits the model and returns the cluster index of each row;
# kmeans_pca.labels_ is still populated for the cells that follow.
y_kmeans = kmeans_pca.fit_predict(scores_pca)
print(y_kmeans)

# Merging the data with its PCA values and the cluster number 

In [None]:
# Name the PCA score columns when the frame is built. Assigning into
# `columns.values[-3:]` writes into the Index's underlying array in place;
# pandas Index objects are meant to be immutable, so that rename can leave
# label lookups stale or fail outright depending on the pandas version.
component_names=['Component1','Component2','Component3']
pca_scores=pd.DataFrame(scores_pca,columns=component_names)

# reset_index(drop=True) gives the original data the same 0..n-1 row labels
# as the scores frame, so the column-wise concat lines rows up by position.
food_agro_pca_kmeans=pd.concat([food_agro.reset_index(drop=True),pca_scores],axis=1)

# Cluster index assigned to each row by the fitted K-Means model.
food_agro_pca_kmeans['Segment K-Means']=kmeans_pca.labels_
print(food_agro_pca_kmeans.head())

# Plotting the clusters, based on the PCA components, in a graph

In [None]:
# Plot the rows on the first two principal components, coloured by cluster.
x_axis=food_agro_pca_kmeans['Component1']
y_axis=food_agro_pca_kmeans['Component2']
plt.figure(figsize=(10,10))

# x and y are passed by keyword: current seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.scatterplot(x=x_axis,y=y_axis,hue=food_agro_pca_kmeans['Segment K-Means'],palette=['r','g','b'])

# Overlay the cluster centres on the same two components. The third centre
# coordinate is deliberately not passed: given positionally it landed in the
# `hue` slot of older seaborn signatures and clashed with the fixed colour.
# `color=` is used for the single fixed colour of the markers.
centres=kmeans_pca.cluster_centers_
sns.scatterplot(x=centres[:,0],y=centres[:,1],s=100,color='pink',label='Centres')
plt.show()