In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the Data

In [None]:
# Read the mall-customer CSV from the Kaggle input directory and preview the first rows.
csv_path = '/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

# Exploratory Data Analysis

In [None]:
# Build the automated profiling report for df.
# NOTE(review): `profile_report` is presumably the DataFrame method registered by
# the `import pandas_profiling` in the first cell - confirm it is still available
# in the target environment.
df.profile_report()

In [None]:
# Encode Gender explicitly as a single 0/1 column (1 = Female, 0 = Male).
# pd.get_dummies() returns one indicator column per category, so assigning its
# result to a single column is ambiguous: current pandas raises a ValueError,
# while older releases appear to have silently kept only the first ('Female')
# indicator. Assumes the raw labels are exactly 'Female' / 'Male' - TODO confirm.
# The guard keeps the cell safe to re-run once the column is already numeric.
if not pd.api.types.is_numeric_dtype(df['Gender']):
    df['Gender'] = (df['Gender'] == 'Female').astype(int)

In [None]:
# Preview the first 10 rows to check the re-encoded Gender column.
df.head(10)

In [None]:
# Draw one histogram per numeric column of df on a 10x10-inch figure.
df.hist(figsize=(10,10))

In [None]:
# Age against annual income, coloured by the Gender column.
sns.scatterplot(data=df, x='Annual Income (k$)', y='Age', hue='Gender')

In [None]:
# Spending score against annual income, coloured by the Gender column.
sns.scatterplot(data=df, x='Annual Income (k$)', y='Spending Score (1-100)', hue='Gender')

In [None]:
# Spending score against annual income, coloured by Age.
sns.scatterplot(data=df, x='Annual Income (k$)', y='Spending Score (1-100)', hue='Age')

# Observations:
* Female footfall is higher than male footfall in the mall
* The 20-25 and 30-40 age groups are the most frequently seen in the mall
* Most shoppers have an annual income of 60-80k
* Most shoppers with a spending score of 40-60 have an annual income of 40-60k
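* People with lower income tend to have spending score (**TODO:** the comparative is missing here - state whether the score is higher, lower or more varied, based on the income vs. spending-score plot)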
* The 20-30 age group has the highest spending scores

In [None]:
from scipy.cluster.hierarchy import dendrogram, fcluster, linkage
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, normalize


# Preprocessing the Data

In [None]:
# Numeric features used for clustering.
num_cols = ['Annual Income (k$)', 'Spending Score (1-100)', 'Age']

# Standardise each feature (column) to zero mean and unit variance so that no
# single feature dominates the Euclidean distances used by the clustering below.
# The previous call, sklearn.preprocessing.normalize(), defaults to axis=1 and
# therefore rescaled every *row* (customer) to unit L2 norm, which discards the
# magnitude of income / score / age instead of putting the features on one scale.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[num_cols]),
    columns=num_cols,
    index=df.index,  # keep row labels aligned with df for later column assignments
)
df_scaled.head()
             

In [None]:
# Carry the Gender column over from df (aligned on the index) next to the scaled features.
df_scaled = df_scaled.assign(Gender=df['Gender'])

In [None]:
# Preview only the first rows instead of dumping the whole frame into the output.
df_scaled.head()

# Hierarchical Clustering

In [None]:
# Agglomerative clustering on the three numeric features of df_scaled (Gender is
# not included), using complete linkage with Euclidean distances.
# `dmatrix` holds the linkage matrix returned by scipy, which the dendrogram cell consumes.
hier_features = df_scaled[['Age',  'Spending Score (1-100)','Annual Income (k$)' ]]
dmatrix = linkage(hier_features, method='complete', metric='euclidean')


In [None]:
# Draw the dendrogram for the linkage matrix on a wide figure so the leaf labels stay readable.
fig, ax = plt.subplots(figsize=(25, 10))
ax.set_title('Hierarchical Clustering Dendrogram')
ax.set_xlabel('sample index')
ax.set_ylabel('distance')
dendrogram(dmatrix, leaf_font_size=8, ax=ax)
plt.show()

# K-Means Clustering

In [None]:
# Elbow method: fit K-Means for k = 1..29 on the numeric features and record the
# inertia (sum of squared distances of samples to their closest centroid) per k.
k_values = list(range(1, 30))
distortions = []
for k in k_values:
    # `algorithm` is left at the library default: the former 'full' value was
    # deprecated in scikit-learn 1.1 and removed in 1.3, where passing it raises.
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=300,
                    tol=0.0001, random_state=100)
    kmeans.fit(df_scaled[num_cols])
    distortions.append(kmeans.inertia_)

# A single line-with-markers series; look for the k where the curve flattens.
plt.plot(k_values, distortions, 'o-', alpha=0.5)
plt.xlabel('number of clusters (k)')
plt.ylabel('inertia')
plt.title('K-Means elbow curve')
plt.show()


In [None]:
# Preview only the first rows instead of dumping the whole frame into the output.
df_scaled.head()

In [None]:
# Final K-Means model with 5 clusters, fitted on the numeric features only.
# NOTE(review): k=5 is presumably read off the elbow curve - confirm.
# `algorithm` is left at the library default: the former 'full' value was
# deprecated in scikit-learn 1.1 and removed in 1.3, where passing it raises.
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=30, max_iter=300,
                tol=0.0001, random_state=100)
kmeans.fit(df_scaled[num_cols])

# Store each row's cluster assignment next to its features.
df_scaled['cluster_labels'] = kmeans.labels_

In [None]:
# Preview the first rows, now including the cluster_labels column, instead of dumping the whole frame.
df_scaled.head()

In [None]:
# Income vs. spending score from df_scaled, coloured by the K-Means cluster label.
sns.scatterplot(
    data=df_scaled,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='cluster_labels',
    palette='bright',
)