In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the mall-customers CSV into `df` and preview five randomly sampled rows.
DATA_PATH = '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
df = pd.read_csv(DATA_PATH)
df.sample(5)

**Data Exploration**

In [None]:
# Quick structural overview: dimensions, column dtypes and missing values per column.
print(f'DF Shape:{df.shape}')
print('Data Types', df.dtypes, sep='\n')
print('Null values Count', df.isnull().sum(), sep='\n')

In [None]:
# Display the summary statistics produced by DataFrame.describe() for `df`.
df.describe()

In [None]:
# Count the non-null CustomerID entries within each Gender group.
df.groupby('Gender')['CustomerID'].count()

In [None]:
# Remove the CustomerID column from `df`.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it no longer raises KeyError because the
# column has already been removed.
df = df.drop(columns=['CustomerID'], errors='ignore')
df.sample(5)

In [None]:
# Names of the numeric feature columns (everything except Gender).
# Restricting to numeric dtypes keeps this list stable if the cell is re-run
# after a non-numeric helper column (e.g. a categorical produced by pd.cut)
# has been added to `df`.
numeric_cols = (
    df.drop(columns=['Gender'])
    .select_dtypes(include='number')
    .columns
    .tolist()
)
numeric_cols

In [None]:
# Draw one histogram (20 bins) per numeric column.
# sns.displot is a figure-level function that creates its own figure, so no
# plt.subplots_adjust call is made beforehand: on a fresh cell that call would
# only create an extra, empty figure.
for col in numeric_cols:
    facet = sns.displot(data=df, x=col, bins=20)
    # Title the axes of the figure that displot just created.
    facet.set(title='Displot of {}'.format(col))

plt.show()

In [None]:
# Horizontal bar chart with the number of rows per Gender value.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(y='Gender', data=df, ax=ax)
plt.show()

In [None]:
# Violin plot of every numeric column, split by Gender, one panel per column.
# The seaborn style is set once, before any axes exist, so that all panels are
# created with the same style (setting it inside the loop left the first panel
# styled differently from the rest).
sns.set(style='whitegrid')

# One panel per numeric column instead of a hard-coded count of 3;
# squeeze=False keeps `axes` two-dimensional even for a single column.
fig, axes = plt.subplots(1, len(numeric_cols), figsize=(15, 6), squeeze=False)
fig.subplots_adjust(hspace=0.5, wspace=0.5)

for position, (ax, col) in enumerate(zip(axes[0], numeric_cols)):
    sns.violinplot(x=col, y='Gender', data=df, ax=ax)
    # Only the left-most panel carries the shared y-axis label.
    ax.set_ylabel('Gender' if position == 0 else '')
    ax.set_title('Violin Plot')

plt.show()

In [None]:
# Scatter each numeric feature against the spending score.
# The target column is excluded by name rather than by assuming it is the last
# entry of numeric_cols, and columns are passed by name since `data=df` is given.
target_col = 'Spending Score (1-100)'
for feature in numeric_cols:
    if feature == target_col:
        continue
    sns.relplot(x=feature, y=target_col, data=df)

In [None]:
def num_categorization(df, column, cuts, labels, xlabel, title):
    """Bin a numeric column and draw a bar chart of the row count per bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. Must contain ``column`` and a ``'Gender'`` column (the
        non-null ``'Gender'`` entries are what is counted per bin).
        The frame is NOT modified.
    column : str
        Name of the numeric column to bin.
    cuts : sequence of numbers
        Bin edges passed to ``pd.cut``; the lowest edge is inclusive.
    labels : sequence of str
        One label per bin (``len(cuts) - 1`` entries).
    xlabel : str
        Label for the x axis.
    title : str
        Title of the plot.

    Returns
    -------
    None
        The chart is shown with ``plt.show()``.
    """
    # Keep the binned values in a local Series instead of writing a
    # 'category_name' column into the caller's frame on every call.
    categories = pd.cut(df[column], cuts, labels=labels, include_lowest=True)

    # observed=False is spelled out so that empty bins are reported with a
    # count of 0 regardless of the pandas version's default.
    counts = (
        df['Gender']
        .groupby(categories, observed=False)
        .count()
        .rename_axis('category')
        .reset_index(name='Counts')
    )

    sns.barplot(x=counts['category'], y=counts['Counts'], palette='rocket')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.show()

In [None]:
# Spending score bucketed into five bands of 20 points each.
ss_cuts = [1, 20, 40, 60, 80, 100]
ss_labels = ['1-20', '21-40', '41-60', '61-80', '81-100']
num_categorization(
    df,
    'Spending Score (1-100)',
    ss_cuts,
    ss_labels,
    'Spending Score Category',
    'Spending Score Distribution',  # fixed typo: was 'Spendin Score Distribution'
)

In [None]:
# Age bucketed into bands; the last band (65 to 100) is labelled '>65'.
age_cuts = [18, 25, 35, 45, 55, 65, 100]
age_labels = ['18-25', '26-35', '36-45', '46-55', '56-65', '>65']

num_categorization(
    df,
    column='Age',
    cuts=age_cuts,
    labels=age_labels,
    xlabel='Age Category',
    title='Age Distribution',
)

In [None]:
# Annual income (k$) bucketed into bands of 30; the last band (120 to 500) is labelled '>120 K$'.
ai_cuts = [0, 30, 60, 90, 120, 500]
ai_labels = ['0-30 K$', '31-60 K$', '61-90 K$', '91-120 K$', '>120 K$']

num_categorization(
    df,
    column='Annual Income (k$)',
    cuts=ai_cuts,
    labels=ai_labels,
    xlabel='Annual Income Category',
    title='Annual Income Distribution',
)

**K Means Clustering**

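Choosing the number of clusters with the elbow method: fit K-Means for K = 1 to 10, plot the within-cluster sum of squares (inertia) against K, and look for the point where the curve bends.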

In [None]:
# Fit K-Means for K = 1..10 and record the inertia (within-cluster sum of
# squares) of each fit for the elbow plot.
# .copy() makes df_to_cluster an independent frame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
df_to_cluster = df[numeric_cols].copy()
wsc = []
for c in range(1, 11):
    # random_state makes the curve reproducible across runs; n_init is pinned
    # explicitly because its default differs between scikit-learn versions.
    Kmeans = KMeans(n_clusters=c, init='k-means++', n_init=10, random_state=42)
    Kmeans.fit(df_to_cluster)
    wsc.append(Kmeans.inertia_)

In [None]:
# Elbow plot: inertia against the number of clusters.
fig, ax = plt.subplots()
# The x values are derived from the number of recorded inertias so the plot
# stays in sync with the range of K values that was fitted.
ax.plot(range(1, len(wsc) + 1), wsc, linewidth=2, color='blue', marker='8')
# grid(True) switches the grid on explicitly; an argument-less grid() call
# toggles it, which would hide the grid when the active style already shows one.
ax.grid(True)
ax.set_xlabel('K value')
ax.set_ylabel('WSC')
plt.show()

> The elbow appears at around 5 or 6 clusters, so either value is a reasonable choice for K.

In [None]:
# Final model: 6 clusters.
# random_state / n_init are pinned so the cluster assignment is reproducible.
Kmeans = KMeans(n_clusters=6, init='k-means++', n_init=10, random_state=42)

# Fit on the feature columns only. If this cell is re-run, the label column
# added below is dropped first so it is never fed back in as a feature.
cluster_features = df_to_cluster.drop(columns=['cluster_labels'], errors='ignore')
labels = Kmeans.fit_predict(cluster_features)
centroids = Kmeans.cluster_centers_

# Rebind to a new frame carrying the labels rather than assigning a column
# into a frame that may be a slice of `df`.
df_to_cluster = cluster_features.assign(cluster_labels=labels)

**Results Visualization**

In [None]:
# 3D scatter of the clustered customers: Age x Annual Income x Spending Score,
# one colour per cluster.
# Every label present in `cluster_labels` is drawn; a fixed list of labels 0-4
# would silently omit any additional cluster.
cluster_colors = ['blue', 'red', 'green', 'orange', 'purple', 'brown']

fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection='3d')

for cluster_id in sorted(df_to_cluster['cluster_labels'].unique()):
    members = df_to_cluster[df_to_cluster['cluster_labels'] == cluster_id]
    ax.scatter(
        members['Age'],
        members['Annual Income (k$)'],
        members['Spending Score (1-100)'],
        # Cycle through the palette if there are more clusters than colours.
        c=cluster_colors[cluster_id % len(cluster_colors)],
        s=60,
        label='Cluster {}'.format(cluster_id),
    )

ax.view_init(30, 185)
ax.set_xlabel('Age')
ax.set_ylabel('Annual Income (k$)')
ax.set_zlabel('Spending Score (1-100)')
ax.legend()
plt.show()