In [None]:
# Import Library
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas_profiling as pp
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly as py
import plotly.express as px
import plotly.graph_objs as go
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from scipy.cluster import hierarchy
from sklearn.decomposition import PCA
import warnings
import os
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
# Suppress all warnings for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")
# Initialise plotly's offline notebook mode so plotly figures can be shown inside the notebook.
py.offline.init_notebook_mode(connected = True)

In [None]:
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Load the mall-customers CSV, report its (rows, columns) shape and preview the first rows.
DATA_PATH = '/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()

In [None]:
# Basic Exploratory Analysis
# Schema check: column dtypes first, then the number of missing values per column,
# separated by a dashed line.
dtype_summary = df.dtypes
missing_per_column = df.isnull().sum()
print(dtype_summary, '---------------------------', missing_per_column, sep='\n')

In [None]:
# Basic Exploratory Data Analysis
# Summary statistics of the frame; as the last expression of the cell the table is displayed.
df.describe()

In [None]:
# In Depth Exploratory Data Analysis
# Build a pandas-profiling report over the whole frame; as the last expression of the
# cell the report object is what gets displayed.
pp.ProfileReport(df)

Insights:

1. There is no significant correlation between the variables.
2. Annual incomes above 100 k$ are found only in the 30-50 age range.
3. The annual income vs. spending score scatter plot suggests that the data may form 5 clusters.

In [None]:
# Show the column names of the frame as a plain Python list.
df.columns.tolist()

In [None]:
# Print the names of the matplotlib style sheets available in this environment
# ('ggplot' is the one applied in the distribution-plot cell).
print(plt.style.available)

In [None]:
# Basic Exploratory Analysis : Data Visualization
# Understand the distribution of Age, Income, Spending score.
#
# The three features are selected by name instead of by position (df.columns[2:]).
# A positional slice picks different columns once CustomerID has been dropped, and
# yields four columns once the 'label3' cluster column has been added, which makes
# a 1x3 subplot grid fail when the cell is re-run.
plt.style.use('ggplot')
dist_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
dist_fig, dist_axes = plt.subplots(1, len(dist_cols), figsize=(15, 6), num=1, clear=True)
# The spacing does not depend on the column, so it is set once for the whole figure.
dist_fig.subplots_adjust(hspace=0.5, wspace=0.5)
for ax, col in zip(dist_axes, dist_cols):
    # NOTE(review): distplot is deprecated in recent seaborn releases; it is kept here
    # so the rendered plot (histogram + density curve) stays the same as before.
    sns.distplot(df[col], bins=10, ax=ax)
    ax.set_title('Distplot of {}'.format(col))
plt.show()

In [None]:
# Bar chart with the number of rows per Gender value, drawn on a wide 15x4 figure.
plt.figure(num=1, figsize=(15, 4))
sns.countplot(data=df, x='Gender')
plt.show()

In [None]:
# Pairwise relationships between the variables, coloured by Gender.
# CustomerID is an identifier rather than a feature, so it is left out of the grid.
# errors='ignore' keeps the cell re-runnable once the column has already been
# removed from df by the preprocessing cell.
sns.pairplot(df.drop(columns=['CustomerID'], errors='ignore'), hue='Gender')

In [None]:
# Remove Customer ID column.
# errors='ignore' makes this cell safe to re-run: without it, a second execution
# raises KeyError because the column is already gone.
df = df.drop(columns=['CustomerID'], errors='ignore')

# Alternatives that are currently disabled (the data below stays un-encoded via
# dummies and un-standardised):
# Dummy Variable Creation
# df = pd.concat([df, pd.get_dummies(df['Gender'])], axis=1).drop(columns=['Gender'], axis=1)
# Standardization of data
# ss = StandardScaler()
# df[df.columns] = ss.fit_transform(df[df.columns])

# Changing the gender column from categorical to numerical (Male -> 1, Female -> 0).
# On a re-run the column already holds integers, both masks are all-False and the
# assignments are no-ops.
df.loc[df['Gender'] == 'Male', 'Gender'] = 1
df.loc[df['Gender'] == 'Female', 'Gender'] = 0

# The column is cast to int after the assignments; the cast fails loudly if any
# value other than Male/Female (or an already-encoded integer) is present.
df['Gender'] = df['Gender'].astype(int)

In [None]:
# Elbow analysis: fit K-Means over a range of cluster counts and plot the elbow curve.
#
# The diagnostic is computed on the same three features that the final K-Means model
# is fitted on. Fitting on the whole frame would also feed in Gender and, when this
# cell is re-run after the final fit, the previously assigned 'label3' cluster ids.
elbow_features = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]
# Fixed seed (same value as the final model) so the curve is reproducible between runs.
model = KMeans(random_state=111)
# k is range of number of clusters.
visualizer = KElbowVisualizer(model, k=(2,20), timings= True)
visualizer.fit(elbow_features)        # Fit the data to the visualizer
visualizer.show()

In [None]:
# Silhouette analysis: one silhouette plot per candidate cluster count n = 2..9.
#
# Uses the same three features as the final K-Means model, so the diagnostics describe
# the model that is actually fitted and a re-run cannot pick up the 'label3' column.
silhouette_features = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]
for n in range(2,10):
    # Fixed seed (same value as the final model) so each plot is reproducible.
    model = KMeans(n_clusters = n, random_state=111)
    visualizer = SilhouetteVisualizer(model)
    visualizer.fit(silhouette_features)        # Fit the data to the visualizer
    visualizer.show()        # Finalize and render the figure

In [None]:
# Final model: K-Means with 6 clusters on Age, Annual Income and Spending Score.
X3 = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].values
kmeans_params = dict(
    n_clusters=6,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    random_state=111,   # fixed seed for a reproducible fit
    algorithm='elkan',
)
algorithm = KMeans(**kmeans_params).fit(X3)   # fit() returns the fitted estimator
# Keep the per-row cluster ids and the cluster centres, and store the ids on the frame.
labels3, centroids3 = algorithm.labels_, algorithm.cluster_centers_
df['label3'] = labels3

In [None]:
# Interactive 3-D view of the clusters over Age / Spending Score / Annual Income.
#
# The cluster id is converted to a string so that each cluster gets its own discrete
# colour and legend entry; as an integer column it is drawn on a continuous colour
# scale, which suggests an ordering between clusters that does not exist.
# Columns are passed by name so plotly express looks them up in the frame it is given.
cluster_view = df.assign(Cluster=df['label3'].astype(str))
fig = px.scatter_3d(cluster_view, x='Age', y='Spending Score (1-100)', z='Annual Income (k$)',
              color='Cluster',
              category_orders={'Cluster': sorted(cluster_view['Cluster'].unique())})
fig.show()