# K-Means Clustering

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Load the cleaned direct-marketing CSV (path is relative to the notebook's working directory).
dataset = pd.read_csv('Data/DirectMarketing_Cln_Classification.csv')

print(dataset)

# Feature matrix for clustering: the columns at positions 3 and 4, as a NumPy array.
X = dataset.iloc[:, [3, 4]].to_numpy()

In [None]:
# Show the two-column feature array (dataset columns at positions 3 and 4) used for clustering.
print(X)

In [None]:
# Scatter the two clustering features (dataset columns at positions 3 and 4)
# against each other before any model is fitted.
# The points carry a label so that plt.legend() has an entry to draw; calling it
# with no labelled artist only produces a Matplotlib warning and no legend.
plt.scatter(dataset.iloc[:, 3].values, dataset.iloc[:, 4].values, label='Customers')
plt.title('Income vs. Spending score')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

## Using the elbow method to find the optimal number of clusters

In [None]:
from sklearn.cluster import KMeans
# Fit K-Means for k = 1..10 and record each fitted model's inertia_ (the WCSS).
wcss = []
for i in range(1, 11):
    # fit() returns the estimator itself, so the fitted model can be bound directly.
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42).fit(X)
    wcss.append(kmeans.inertia_)

# Plot WCSS against the number of clusters; the x values are 1..len(wcss).
plt.plot(range(1, len(wcss) + 1), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

## Training the K-Means model on the dataset

In [None]:
# Fit the final five-cluster model; labels_ holds the cluster index that the
# fit assigned to each row of X.
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42).fit(X)
y_kmeans = kmeans.labels_

In [None]:
# First-feature values of the rows assigned to cluster label 0.
X[:, 0][y_kmeans == 0]

## Visualising the clusters

In [None]:
# One scatter layer per cluster label (0..4), each with its own colour and a
# 1-based legend entry.
cluster_colours = ['red', 'blue', 'green', 'cyan', 'magenta']
for label_idx, colour in enumerate(cluster_colours):
    in_cluster = y_kmeans == label_idx
    plt.scatter(
        X[in_cluster, 0],
        X[in_cluster, 1],
        s=100,
        c=colour,
        label=f'Cluster {label_idx + 1}',
    )

# Overlay the fitted cluster centres as larger yellow markers.
plt.scatter(
    kmeans.cluster_centers_[:, 0],
    kmeans.cluster_centers_[:, 1],
    s=300,
    c='yellow',
    label='Centroids',
)

plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

In [None]:
# Rows of X assigned to cluster label 0, as a two-column frame.
cluster_1 = pd.DataFrame(X[y_kmeans == 0], columns=['AnnualIncome', 'SpendingScore'])
print(cluster_1)
# Standard deviation of the first feature within this cluster.
print(cluster_1['AnnualIncome'].std())

In [None]:
# Rows of X assigned to cluster label 1, as a two-column frame, with summary statistics.
cluster_2 = pd.DataFrame(X[y_kmeans == 1], columns=['AnnualIncome', 'SpendingScore'])
cluster_2.describe()

In [None]:
# Rows of X assigned to cluster label 2, as a two-column frame, with summary statistics.
cluster_3 = pd.DataFrame(X[y_kmeans == 2], columns=['AnnualIncome', 'SpendingScore'])
cluster_3.describe()

In [None]:
# Rows of X assigned to cluster label 3, as a two-column frame, with summary statistics.
cluster_4 = pd.DataFrame(X[y_kmeans == 3], columns=['AnnualIncome', 'SpendingScore'])
cluster_4.describe()


In [None]:
# Rows of X assigned to cluster label 4, as a two-column frame, with summary statistics.
cluster_5 = pd.DataFrame(X[y_kmeans == 4], columns=['AnnualIncome', 'SpendingScore'])
cluster_5.describe()

## Model Interpretation

- Number of records in each cluster

In [None]:
# Per-variable summary (count, mean, standard deviation) for cluster 1:
# one row per column of cluster_1.
cl_1 = pd.DataFrame({
    'Variablename': ['AnnualIncome', 'SpendingScore'],
    'Cluster': [1, 1],
    'Count': [cluster_1[col].count() for col in ('AnnualIncome', 'SpendingScore')],
    'Mean': [cluster_1[col].mean() for col in ('AnnualIncome', 'SpendingScore')],
    'Std': [cluster_1[col].std() for col in ('AnnualIncome', 'SpendingScore')],
})
cl_1


In [None]:
# Per-variable summary (count, mean, standard deviation) for cluster 2:
# one row per column of cluster_2.
cl_2 = pd.DataFrame({
    'Variablename': ['AnnualIncome', 'SpendingScore'],
    'Cluster': [2, 2],
    'Count': [cluster_2[col].count() for col in ('AnnualIncome', 'SpendingScore')],
    'Mean': [cluster_2[col].mean() for col in ('AnnualIncome', 'SpendingScore')],
    'Std': [cluster_2[col].std() for col in ('AnnualIncome', 'SpendingScore')],
})
cl_2



In [None]:
# Per-variable summary (count, mean, standard deviation) for cluster 3:
# one row per column of cluster_3.
cl_3 = pd.DataFrame({
    'Variablename': ['AnnualIncome', 'SpendingScore'],
    'Cluster': [3, 3],
    'Count': [cluster_3[col].count() for col in ('AnnualIncome', 'SpendingScore')],
    'Mean': [cluster_3[col].mean() for col in ('AnnualIncome', 'SpendingScore')],
    'Std': [cluster_3[col].std() for col in ('AnnualIncome', 'SpendingScore')],
})
cl_3




In [None]:
# Per-variable summary (count, mean, standard deviation) for cluster 4:
# one row per column of cluster_4.
cl_4 = pd.DataFrame({
    'Variablename': ['AnnualIncome', 'SpendingScore'],
    'Cluster': [4, 4],
    'Count': [cluster_4[col].count() for col in ('AnnualIncome', 'SpendingScore')],
    'Mean': [cluster_4[col].mean() for col in ('AnnualIncome', 'SpendingScore')],
    'Std': [cluster_4[col].std() for col in ('AnnualIncome', 'SpendingScore')],
})
cl_4

In [None]:
# The model was fitted with five clusters, so the combined summary needs a
# cluster-5 block too. It is built here from cluster_5 in the same shape as
# cl_1..cl_4: one row per variable with count, mean and standard deviation.
cl_5 = pd.DataFrame({
    'Variablename': ['AnnualIncome', 'SpendingScore'],
    'Cluster': [5, 5],
    'Count': [cluster_5['AnnualIncome'].count(), cluster_5['SpendingScore'].count()],
    'Mean': [cluster_5['AnnualIncome'].mean(), cluster_5['SpendingScore'].mean()],
    'Std': [cluster_5['AnnualIncome'].std(), cluster_5['SpendingScore'].std()],
})

# Stack the per-cluster summaries into a single table. Each piece keeps its own
# 0/1 row labels, so the labels repeat once per cluster.
cl = pd.concat([cl_1, cl_2, cl_3, cl_4, cl_5])
cl

In [None]:
# Boolean mask marking the summary rows whose variable is 'AnnualIncome',
# then the matching rows of the combined table.
annulaIncome = cl['Variablename'].eq('AnnualIncome')
cl.loc[annulaIncome]