# Import Libraries

In [None]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Import Data & Rename Columns

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Location of the raw customer CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'

# Load the raw data, report its (rows, columns) shape and preview the first rows.
df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()

In [None]:
# Shorten the two verbose column names so they are easier to reference later on.
column_aliases = {
    'Annual Income (k$)': 'Income',
    'Spending Score (1-100)': 'Spending_Score',
}
df = df.rename(columns=column_aliases)
df.head()

# Checking Descriptive Statistics

In [None]:
# Summary statistics for the columns of df.
df.describe()

In [None]:
# Pairwise plots to inspect how Age, Income and Spending Score relate to each other.
pairplot_data = df[['Age', 'Income', 'Spending_Score']]
sns.pairplot(pairplot_data)

# Elbow Method to Identify the optimal number of Clusters


In [None]:
# Keep only the two features used for clustering.
cluster_features = ['Income', 'Spending_Score']
df_short = df[cluster_features]
df_short

In [None]:
import sklearn.cluster as cluster

# Candidate cluster counts to try: 1 through 11.
K = range(1, 12)

# wcss collects the fitted model's inertia_ for each candidate k, in the same order as K.
wcss = []
for k in K:
    kmeans = cluster.KMeans(n_clusters=k, init="k-means++", random_state=42).fit(df_short)
    wcss.append(kmeans.inertia_)

In [None]:
# Tabulate the WCSS value obtained for each candidate number of clusters.
mycenters = pd.DataFrame(list(zip(K, wcss)), columns=['Clusters', 'WCSS'])
mycenters

In [None]:
# Elbow curve: WCSS as a function of the number of clusters.
fig, ax = plt.subplots()
ax.plot(mycenters["Clusters"], mycenters["WCSS"])
ax.set_xlabel("Clusters")
ax.set_ylabel("WCSS")

### 5 Clusters are identified as per elbow method

# Silhouette Method to Identify Clusters

In [None]:
import sklearn.metrics as metrics

# Fit a model for every cluster count from 3 through 12 and print its silhouette score.
for n_clusters in range(3, 13):
    model = cluster.KMeans(n_clusters=n_clusters, init="k-means++", random_state=200)
    labels = model.fit(df_short).labels_
    score = metrics.silhouette_score(
        df_short, labels, metric="euclidean", sample_size=1000, random_state=200
    )
    print(f"Silhouette score for k(clusters) = {n_clusters} is {score!s}")

# Perform K-Means Clustering with 5 Clusters

In [None]:
# Fit the final 5-cluster model on two variables. The column order used here
# (Spending_Score, Income) is also the column order of kmeans.cluster_centers_.
# random_state is pinned, as in the other KMeans cells, so that the cluster
# label numbering (and therefore the exported CSV and the plot colours) is
# reproducible from one run to the next.
kmeans = cluster.KMeans(n_clusters=5, init="k-means++", random_state=42)
kmeans = kmeans.fit(df[['Spending_Score','Income']])

In [None]:
# Centres of the fitted clusters; columns follow the fit order (Spending_Score, Income).
kmeans.cluster_centers_

# Attach Clusters to the Original Data 

In [None]:
# Add the fitted model's label for each row as a 'Clusters' column.
df = df.assign(Clusters=kmeans.labels_)

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Count how many rows carry each cluster label.
df['Clusters'].value_counts()

# Export Data with Clusters

In [None]:
# Write df to mallClusters.csv in the working directory, omitting the index.
df.to_csv('mallClusters.csv', index = False)

# Visualizing Clusters 

In [None]:
# Scatter every customer, coloured by its assigned cluster.
g = sns.scatterplot(x="Spending_Score", y="Income", hue='Clusters', data=df,
                    palette=['green', 'orange', 'brown', 'dodgerblue', 'red'])

# Overlay the cluster centres on the same axes. Column 0 is Spending_Score (x)
# and column 1 is Income (y), matching the column order the model was fitted on.
g.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
          s=300, c='black', label='Centroid', marker="*")

# Build the legend last: it only picks up artists that already exist, so
# calling it before the centroid scatter would leave 'Centroid' out.
g.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1);

# END