In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory, then print each one.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing and reading the dataset** **(EDA)**

In [None]:
# Load the mall-customers CSV from the Kaggle input directory into a pandas DataFrame.

data = pd.read_csv('../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv')

In [None]:
data.head() # preview the first 5 rows of the data as a table

In [None]:
# Print a summary of the columns: names, non-null counts and dtypes.
data.info()

# Observation: every column is fully populated, and the Gender column is stored as object dtype.

In [None]:
# Use describe() to get summary statistics for the numerical columns.

data.describe()
# Observation: we might need to scale the data later.

**Visualization of dataset**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt #libraries for visualization

In [None]:
# Box plots give quick information about the median, the upper and lower quartiles,
# and any outliers in each numerical column.
# A fresh figure is created for every column: a single figure created before the loop
# is consumed by the first plt.show(), so the remaining plots would fall back to the
# default figure size.
for col in data.select_dtypes(np.number).columns:
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.boxplot(x=data[col], ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.show()

In [None]:
# Count plot of the object-typed Gender column, to make sure the data is not heavily
# biased towards one gender.
# The column is passed by keyword: recent seaborn releases treat the first positional
# argument as `data`, not as the variable to count.
sns.countplot(x='Gender', data=data)

In [None]:
# Heatmap of the pairwise correlation between the numerical features.
# Only the numeric columns are passed to corr(): the string-valued Gender column cannot
# be correlated and makes DataFrame.corr() raise on pandas >= 2.0.
sns.heatmap(data.select_dtypes(np.number).corr(), annot=True)

# Observation: no clear dependence/correlation of one column on another.

In [None]:

# Scatter plot of annual income against spending score, coloured by gender.
plt.figure(figsize=(10, 7))
sns.scatterplot(
    data=data,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Gender',
)


**Preprocessing**

In [None]:
# Standardise the numerical columns so that every feature is on the same scale.
from sklearn.preprocessing import StandardScaler

df = data.copy()  # work on a copy so the original dataset stays untouched
df_num = df.select_dtypes(np.number)  # keep only the numerical columns

scaler = StandardScaler()
# fit_transform() combines fit() and transform() in a single call and returns a plain
# array, so the result is wrapped back into a DataFrame with the original column names.
scaled_df = pd.DataFrame(scaler.fit_transform(df_num), columns=df_num.columns)
scaled_df

In [None]:
# Convert the Gender column to integer category codes so it can be used as a numerical feature.
# NOTE(review): codes follow the sorted category order — presumably Female -> 0, Male -> 1; verify against the data.

df['Gender'] = df['Gender'].astype('category').cat.codes

In [None]:
processed_df = pd.concat([scaled_df, df['Gender']], axis=1) # join the scaled numerical columns and the encoded Gender column side by side
processed_df

In [None]:
# Drop the ID column, as it is of no use to the model.
# `columns=` makes pandas look for the label among the columns rather than the rows, and
# errors='ignore' keeps this cell safe to re-run once the column has already been removed
# (otherwise a second run raises KeyError).
processed_df = processed_df.drop(columns='CustomerID', errors='ignore')

**Creating clusters with Kmeans**

In [None]:
from sklearn.cluster import KMeans

# Fit one KMeans model for every cluster count from 1 to 10 and keep, for each,
# the cluster count, the resulting inertia and the fitted model itself.
km_list = []

for clust in range(1, 11):
    km = KMeans(n_clusters=clust, random_state=42).fit(processed_df)
    record = {'clusters': clust, 'inertia': km.inertia_, 'model': km}
    km_list.append(pd.Series(record))

In [None]:
# Turn the per-model records into a table (one row per model) and keep the inertia
# indexed by cluster count for the elbow plot.
elbow_table = pd.concat(km_list, axis=1).T
plot_data = elbow_table[['clusters', 'inertia']].set_index('clusters')

ax = plot_data.plot(marker='o', ls='-')
ax.set_xticks(range(0, 11, 2))
ax.set_xlim(0, 11)  # limits of the x-axis
ax.set(xlabel='Cluster', ylabel='Inertia');

**Dendrogram**

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram #import the dendogram and linkage from scipy library


plt.figure(figsize= (15,7))
# Build the hierarchical-clustering linkage matrix: Ward's method as the linkage
# criterion, with Euclidean distance as the metric.
merg = linkage(processed_df, method = "ward", metric='euclidean')
# NOTE(review): truncate_mode='lastp' condenses the tree to the last merged clusters, so the
# x-axis entries are presumably condensed clusters rather than individual data points — confirm.
dendrogram(merg, truncate_mode='lastp')
plt.xlabel("Data Point")
plt.ylabel("Euclidean Distance")

In [None]:
# Feature matrix for the agglomerative model below: the processed data without the Gender column.
X = processed_df.drop('Gender', axis = 1)

**Agglomerative clustering**

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Ward linkage only supports Euclidean distance, which is already the default, so the
# distance argument is omitted: the `affinity` keyword was deprecated in scikit-learn 1.2
# and removed in 1.4 (it was renamed to `metric`).
hc = AgglomerativeClustering(n_clusters=6, linkage="ward")

# Fit on the feature columns only. Dropping a previously added 'Label' column keeps this
# cell idempotent: on a re-run the model must not cluster on the labels of the last run.
features = X.drop(columns="Label", errors="ignore")
cluster = hc.fit_predict(features)  # fit the agglomerative model and get one label per row

X["Label"] = cluster

# Data and column names are passed by keyword; recent seaborn releases reject positional x/y.
sns.scatterplot(data=X, x='Annual Income (k$)', y='Spending Score (1-100)', hue='Label')

**Confirming optimum number of clusters**

![](https://cdn-images-1.medium.com/max/800/1*9J7Wnh5L0eIcHXBeWlzvNA.png)

In [None]:
def calculate_wcss(X, k_min=2, k_max=20, random_state=42):
    """Return the within-cluster sum of squares (inertia) for a range of cluster counts.

    One KMeans model is fitted for every cluster count from ``k_min`` to ``k_max``
    (both inclusive) and its ``inertia_`` is collected.

    Parameters
    ----------
    X : data accepted by ``KMeans.fit``
        The samples to cluster.
    k_min : int, default 2
        Smallest number of clusters to try.
    k_max : int, default 20
        Largest number of clusters to try (inclusive).
    random_state : int or None, default 42
        Seed passed to every KMeans model so that repeated calls give the same
        curve. Pass ``None`` to get unseeded (non-deterministic) runs.

    Returns
    -------
    list
        Inertia values, ordered by cluster count; element ``i`` belongs to
        ``k_min + i`` clusters.
    """
    wcss = []
    for n in range(k_min, k_max + 1):
        kmeans = KMeans(n_clusters=n, random_state=random_state)
        kmeans.fit(X=X)
        wcss.append(kmeans.inertia_)

    return wcss


In [None]:
def optimal_number_of_clusters(wcss, first_k=2):
    """Locate the elbow of a WCSS curve.

    The curve is the set of points ``(k, wcss)`` where element ``i`` of ``wcss``
    belongs to ``first_k + i`` clusters. A straight line is drawn between the
    first and the last point, and the cluster count whose point lies farthest
    from that line (perpendicular distance) is returned.

    Parameters
    ----------
    wcss : sequence of float
        Within-cluster sum of squares for consecutive cluster counts.
    first_k : int, default 2
        Cluster count that the first element of ``wcss`` corresponds to.

    Returns
    -------
    int
        The cluster count at the elbow. On ties the smallest count wins.

    Raises
    ------
    ValueError
        If ``wcss`` is empty or not one-dimensional.
    """
    inertia = np.asarray(wcss, dtype=float)
    if inertia.ndim != 1 or inertia.size == 0:
        raise ValueError("wcss must be a non-empty 1-D sequence of inertia values")

    # Cluster counts matching each inertia value. The last one is derived from the
    # length of the input instead of being hard-coded, so curves of any length work.
    ks = np.arange(first_k, first_k + inertia.size)
    x1, y1 = ks[0], inertia[0]
    x2, y2 = ks[-1], inertia[-1]

    # Length of the line between the two end points; zero only when there is a single point.
    chord_length = np.hypot(y2 - y1, x2 - x1)
    if chord_length == 0:
        return int(first_k)

    # Perpendicular distance of every point (k, wcss) to the line through the end points.
    distances = np.abs((y2 - y1) * ks - (x2 - x1) * inertia + x2 * y1 - y2 * x1) / chord_length

    # argmax returns the first maximum, i.e. the smallest cluster count on ties.
    return int(ks[np.argmax(distances)])

In [None]:
# Calculate the within-cluster sum of squares for each candidate number of clusters.
# processed_df is used because it is the feature set the final KMeans model is fitted on;
# scaled_df still contains the CustomerID column and lacks the Gender column.
sum_of_squares = calculate_wcss(processed_df)

# Calculate the optimal number of clusters (the elbow of the WCSS curve).
n = optimal_number_of_clusters(sum_of_squares)
n

**Modelling the data with the optimum number of clusters (6)**

In [None]:
# Final model: KMeans with k=6 clusters, at most 100 iterations and a fixed random seed.
kmeans = KMeans(n_clusters = 6, max_iter = 100, random_state = 42)

kmeans.fit(processed_df) # fit the 6-cluster model on the processed data

In [None]:
# Store the cluster label assigned to each row by the fitted model in a new 'cluster_id' column.
df['cluster_id'] = kmeans.labels_
df.head()

In [None]:
print(df['cluster_id'].value_counts()) # print the number of samples that belong to each cluster

# Visualise the cluster sizes. The column is passed by keyword: recent seaborn releases
# treat the first positional argument as `data`, not as the variable to count.
sns.countplot(x='cluster_id', data=df)

In [None]:
# Distribution of age within each cluster.
ax = sns.boxplot(data=df, x='cluster_id', y='Age')
ax.set_title('Age')
plt.show()

In [None]:
# Distribution of annual income within each cluster.
ax = sns.boxplot(data=df, x='cluster_id', y='Annual Income (k$)')
ax.set_title('Annual Income (k$)')
plt.show()

In [None]:
# Distribution of spending score within each cluster.
ax = sns.boxplot(data=df, x='cluster_id', y='Spending Score (1-100)')
ax.set_title('Spending Score (1-100)')
plt.show()

In [None]:
# Gender split within each cluster. df['Gender'] holds integer category codes at this
# point, so the original string labels from `data` are swapped back in for plotting
# (df was created as a copy of data, so the rows are index-aligned). This keeps the
# legend readable without modifying df.
sns.countplot(data=df.assign(Gender=data['Gender']), hue='Gender', x='cluster_id')

In [None]:
# Interactive scatter plot: spending score vs age, coloured by cluster.
import plotly.express as px

fig = px.scatter(df, x ="Spending Score (1-100)", y = "Age", color = "cluster_id") # spending score vs age for each cluster
fig.show()

In [None]:
# Interactive scatter plot: spending score vs annual income, coloured by cluster.

fig = px.scatter(df, x ="Spending Score (1-100)", y ="Annual Income (k$)", color ="cluster_id")
fig.show()

In [None]:
# Interactive scatter plot: annual income vs age, coloured by cluster.

fig = px.scatter(df, x = "Annual Income (k$)", y ="Age", color ="cluster_id") # annual income vs age for each cluster
fig.show()

In [None]:
grouped = df.groupby('cluster_id') # group the dataframe by cluster id

# Mean of each feature per cluster, sorted in ascending order by age, then income, then
# spending score. The columns are selected with a list: indexing a GroupBy with a bare
# tuple of column names was deprecated and raises on pandas >= 2.0.
feature_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
grouped[feature_cols].mean().sort_values(by=feature_cols, ascending=True)

## **CONCLUSION**

In [None]:
# Cluster 5: customers with the lowest average age (young) and the lowest average income, but the highest average spending score.

# Cluster 0: mostly older customers with an average income and an average spending score.

# Cluster 1: middle-aged customers with a high income and a high spending score.

# Cluster 4: middle-aged customers with a high income and a low spending score.

# Cluster 3: older customers with a low income and a low spending score.

# Cluster 2: young customers with a high income and a medium spending score.
