In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Importing Libraries

In [None]:
# for visualizations
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.rcParams['figure.figsize'] = (18, 8)

# for interactive visualizations
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
init_notebook_mode(connected = True)
import plotly.figure_factory as ff

# for Model Building
from sklearn.cluster import KMeans

import warnings 
warnings.filterwarnings('ignore')

### Load the Data

In [None]:
# Read the mall-customers CSV into a DataFrame and show its first rows
# as an interactive plotly table.
csv_path = '/kaggle/input/mall-customers/Mall_Customers.csv'
df = pd.read_csv(csv_path)
data = ff.create_table(df.head())
py.iplot(data)

In [None]:
# Rename the 'Genre' column to 'Gender' (rebinding `df` instead of mutating in place).
df = df.rename(columns={'Genre': 'Gender'})

In [None]:
# Checking the shape of the dataframe: a (number of rows, number of columns) tuple
df.shape

**As we can see there are 200 observations and 5 features.**

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the dataframe's columns
df.describe()

In [None]:
# Check for missing values: number of null entries in each column of the dataset
df.isnull().sum()

### Exploratory Data Analysis (EDA)

#### Correlation Plot

In [None]:
# Correlate only the numeric columns. 'Gender' is not encoded to 0/1 until the
# preprocessing section further down, and pandas >= 2.0 raises on non-numeric
# columns in DataFrame.corr() instead of silently dropping them.
numeric_features = df.select_dtypes(include='number')
sns.heatmap(numeric_features.corr(), cmap = 'plasma', annot = True)
plt.title('Correlation between numeric features')
plt.show()

In [None]:
# Kernel density estimate of the customers' ages.
age = df['Age']
sns.kdeplot(age, palette='OrRd')
plt.title('Distribution of Age', fontsize=20)
plt.show()

In [None]:
# Number of customers per gender.
plt.figure(figsize=(10,6))
# Name the column through the keyword-only `x` argument: in recent seaborn
# releases the first positional parameter of countplot is `data`, so a bare
# positional Series is no longer interpreted as the x variable.
sns.countplot(x='Gender', data=df)
plt.title('Count Plot For Gender')
plt.show()

In [None]:
# Number of customers for each distinct spending-score value (wide figure,
# because there is one bar per distinct score).
plt.figure(figsize=(24,8))
# Name the column through the keyword-only `x` argument: in recent seaborn
# releases the first positional parameter of countplot is `data`, so a bare
# positional Series is no longer interpreted as the x variable.
sns.countplot(x='Spending Score (1-100)', data=df, palette = 'tab10')
plt.title('count plot of Spending Score', fontsize = 15)
plt.show()

In [None]:
# Histogram of the annual income column.
plt.figure(figsize=(10, 6))
annual_income = df['Annual Income (k$)']
sns.histplot(annual_income, palette='tab10')
plt.title('Distribution of Annual Income', fontsize=20)
plt.show()

### Data Preprocessing

In [None]:
# Encode the Gender column as integers: Female -> 0, Male -> 1.
# Gotcha: Series.map turns every value that is not a key of the mapping into NaN,
# so re-running this cell on the already-encoded column would wipe it out.
gender_codes = {'Female': 0, 'Male': 1}
df['Gender'] = df['Gender'].map(gender_codes)

In [None]:
# Preview the first rows to check the result of the Gender encoding above
df.head()

In [None]:
# Feature matrix for the first clustering: annual income and spending score,
# extracted as a 2-column NumPy array (column 0 = income, column 1 = score).
income_spending_cols = ['Annual Income (k$)', 'Spending Score (1-100)']
X = df[income_spending_cols].values

### Clustering 

**Building the clustering model and calculating the values of Inertia:**

In [None]:
# Inertia: the sum of squared distances of samples to their closest cluster center.
# Fit one KMeans model for every k from 1 to 14 (the upper bound of range is
# exclusive) and record its inertia for the elbow plot.
inertia = []
for i in range(1, 15):
    # Building and fitting the model. random_state is pinned (same seed as the
    # Age / Spending Score models further down) so the curve is reproducible.
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 50)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)

**Visualizing the result**

In [None]:
# The Elbow Method: inertia as a function of the number of clusters.
# 'bx-' draws blue x markers joined by a solid line.
k_values = range(1, 15)
plt.plot(k_values, inertia, 'bx-')
plt.title('The Elbow Method using Inertia')
plt.xlabel('Number of clusters')
plt.ylabel('inertia')
plt.show()

**To determine the optimal number of clusters, we select the value of k at the “elbow”, i.e. the point after which the inertia starts decreasing in a roughly linear fashion. Thus, for the given data, we conclude that the optimal number of clusters is 5.**

In [None]:
# Taking number of clusters = 5.
# random_state is pinned so that the integer label assigned to each cluster --
# and therefore its colour in the scatter plot below -- is the same on every run.
kmeans = KMeans(n_clusters = 5, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 50)
y_kmeans = kmeans.fit_predict(X)

In [None]:
# Plotting the clusters: one scatter call per cluster index, each with its own
# colour and legend entry (x = column 0 of X, y = column 1 of X).
cluster_styles = [
    ('yellow', 'Cluster-A'),
    ('red', 'Cluster-B'),
    ('green', 'Cluster-C'),
    ('blue', 'Cluster-D'),
    ('pink', 'Cluster-E'),
]
for cluster_id, (colour, name) in enumerate(cluster_styles):
    members = X[y_kmeans == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s = 100, c = colour, label = name)
plt.title('Clusters of Customers')
plt.xlabel('Annual income(k$)')
plt.ylabel('spending score')
plt.legend()
plt.show()

#### Taking only Age and Spending Score as features

In [None]:
# Feature matrix for the second clustering: age and spending score.
# Columns are selected by name (as for the other feature matrices in this
# notebook) rather than by position, so the selection does not depend on the
# column order of the CSV. Column 0 = Age, column 1 = Spending Score.
y = df.loc[:, ['Age', 'Spending Score (1-100)']].values

In [None]:
# Elbow curve for the Age / Spending Score features: within-cluster sum of
# squares (KMeans inertia) for every k from 1 to 14.
wss = []
for i in range(1, 15):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 50)
    kmeans.fit(y)
    wss.append(kmeans.inertia_)

plt.rcParams['figure.figsize'] = (13, 5)
plt.plot(range(1, 15), wss)
plt.title('K-Means Clustering(The Elbow Method)', fontsize = 20)
# The x axis holds the candidate number of clusters and the y axis the inertia
# of the corresponding model, so label them as such.
plt.xlabel('Number of clusters')
plt.ylabel('WSS (inertia)')
plt.grid()
plt.show()

**The optimal number of clusters is 4.**

In [None]:
# Fit the final 4-cluster model on the Age / Spending Score matrix `y`
# and get the cluster index of every customer.
kmeans = KMeans(n_clusters = 4, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 50)
ymeans = kmeans.fit_predict(y)

plt.rcParams['figure.figsize'] = (30, 10)

# Apply the dark theme through a context manager, and before the figure is
# created. A global plt.style.use() issued after the scatter calls only restyles
# the artists created afterwards (legend, grid) and leaks into every later plot,
# so the first run and a re-run of the cell would render differently.
with plt.style.context('dark_background'):
    plt.title('Cluster of Ages', fontsize = 30)

    # NOTE(review): the segment names are tied to raw cluster indices 0-3; which
    # index KMeans gives to which group is arbitrary -- confirm the mapping
    # against the plotted clusters.
    plt.scatter(y[ymeans == 0, 0], y[ymeans == 0, 1], s = 100, c = 'pink', label = 'Usual Customers' )
    plt.scatter(y[ymeans == 1, 0], y[ymeans == 1, 1], s = 100, c = 'orange', label = 'Priority Customers')
    plt.scatter(y[ymeans == 2, 0], y[ymeans == 2, 1], s = 100, c = 'lightgreen', label = 'Target Customers(Young)')
    plt.scatter(y[ymeans == 3, 0], y[ymeans == 3, 1], s = 100, c = 'red', label = 'Target Customers(Old)')
    # Centroids are drawn in white so they remain visible on the dark background.
    plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 50, c = 'white', label = 'Centroids')

    plt.xlabel('Age')
    plt.ylabel('Spending Score (1-100)')
    plt.legend()
    plt.grid()
    plt.show()


#### Taking Age, Spending Score and Annual Income

In [None]:
# Cluster on three features at once: age, spending score and annual income.
feature_cols = ['Age', 'Spending Score (1-100)', 'Annual Income (k$)']
x = df[feature_cols].values

# fit() returns the fitted estimator, so the model can be built and fitted in one step.
km = KMeans(n_clusters = 5, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 50).fit(x)

# Cluster index of every row, and the coordinates of the five cluster centers.
centroids = km.cluster_centers_
labels = km.labels_

In [None]:
# Attach the 3-feature cluster assignment to every customer row.
df['labels'] =  labels

# One 3D marker per customer, coloured by its cluster label.
trace1 = go.Scatter3d(
    x= df['Age'],
    y= df['Spending Score (1-100)'],
    z= df['Annual Income (k$)'],
    mode='markers',
     marker=dict(
        color = df['labels'], 
        size= 10,
        line=dict(
            color= df['labels'],
            width= 12
        ),
        opacity=0.8
     )
)
# Keep the trace list under its own name: rebinding `df` to this list would
# discard the DataFrame and break any re-run of this cell or an earlier one.
traces = [trace1]

layout = go.Layout(
    # Title describes the three plotted axes.
    title = 'Age vs Spending Score vs Annual Income',
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0  
    ),
    scene = dict(
            xaxis = dict(title  = 'Age'),
            yaxis = dict(title  = 'Spending Score'),
            zaxis = dict(title  = 'Annual Income')
        
        )
)

fig = go.Figure(data = traces, layout = layout)
py.iplot(fig)