# Mall Customer Segmentation_Vineet

## Mall Customer Segmentation using Gaussian Mixture Model

### i)    Read dataset and rename columns appropriately
### ii)   Drop customerid column and also transform Gender column to [0,1]
### iii)  Use seaborn to understand each feature and relationships among features.
### iv)  Use sklearn's StandardScaler() to scale dataset
### v)   Perform clustering using Gaussian Mixture Modeling.
### vi)  Use AIC and BIC measures to draw a scree plot and discover the ideal number of clusters
### vii) Look up anomalous customers and try to understand their behavior.

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
from sklearn.manifold import TSNE

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import silhouette_score
from yellowbrick.cluster import SilhouetteVisualizer

import re
import warnings
warnings.filterwarnings("ignore")
import os

In [None]:
# Make the notebook display the value of every bare expression in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Work from the Kaggle input folder so the CSV can be opened by bare file name.
os.chdir('/kaggle/input/customer-segmentation-tutorial-in-python/')

In [None]:
# Show what files are available in the current working directory.
os.listdir()

In [None]:
# Load the mall customers CSV into a DataFrame.
df=pd.read_csv("Mall_Customers.csv")

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df.describe()

In [None]:
# Number of rows/columns and the dtype pandas inferred for each column.
df.shape
df.dtypes

In [None]:
# Give the columns identifier-friendly names (no spaces, units or parentheses).
column_renames = {
    'CustomerID': 'Customer_ID',
    'Annual Income (k$)': 'Annual_Income',
    'Spending Score (1-100)': 'Spending_Score',
}
df.rename(columns=column_renames, inplace=True)

In [None]:
# Confirm the new column names.
df.columns

In [None]:
# The customer identifier carries no behavioural information; remove it in place.
df.drop(columns=['Customer_ID'], inplace=True)

In [None]:
# Check the frame after dropping the identifier column.
df.shape
df.columns

In [None]:
# How many customers fall under each Gender value.
df.Gender.value_counts()

In [None]:
# Encode Gender numerically: Male=1, Female=0.
# A single vectorised map replaces the chained assignment
# `df.Gender[mask] = value`, which writes through a possibly temporary object
# and has no effect once pandas copy-on-write is enabled.
# The column is deliberately kept as object dtype, exactly as the chained
# assignment left it, so the later dtype-based column selection still picks
# the same set of numeric columns.
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0}).astype(object)
df.head()
df.describe()

In [None]:
# Bucket Age into three right-closed bands, (0, 35], (35, 50] and (50, 80],
# labelled "y", "m" and "s".
age_edges = [0, 35, 50, 80]
age_labels = ["y", "m", "s"]
df["Age_cat"] = pd.cut(df['Age'], bins=age_edges, labels=age_labels)

In [None]:
# Bucket Annual_Income into three right-closed bands, (0, 40], (40, 80] and
# (80, 150], labelled "l", "m" and "h".
income_edges = [0, 40, 80, 150]
income_labels = ["l", "m", "h"]
df["Annual_Income_cat"] = pd.cut(df['Annual_Income'], bins=income_edges, labels=income_labels)

In [None]:
# Split Spending_Score into three equal-width bins over its observed range,
# labelled "Ls", "Ms" and "Hs".
spending_labels = ["Ls", "Ms", "Hs"]
df["Spending_Score_cat"] = pd.cut(df['Spending_Score'], bins=3, labels=spending_labels)

In [None]:
# Inspect ten randomly chosen rows, including the new category columns.
df.sample(n=10)

In [None]:
# Draw the distribution of each listed feature on a 2x2 grid of subplots.
columns = ['Gender', 'Age', 'Annual_Income', 'Spending_Score']
fig = plt.figure(figsize=(10, 10))
for position, column in enumerate(columns, start=1):
    plt.subplot(2, 2, position)
    sns.distplot(df[column])

In [None]:
# Mean spending score per gender, with one bar per age band;
# error bars show a 68% confidence interval.
fig = plt.figure(figsize=(10, 8))
sns.barplot(
    data=df,
    x='Gender',
    y='Spending_Score',
    hue='Age_cat',
    estimator=np.mean,
    ci=68,
)


In [None]:
# Spread of spending score for every distinct age value.
sns.boxplot(data=df, x='Age', y='Spending_Score')


In [None]:
# Spread of age for every distinct annual-income value.
sns.boxplot(data=df, x='Annual_Income', y='Age')

In [None]:
# Joint KDE of age and spending score.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts the two
# vectors as positional arguments.
sns.jointplot(x='Age', y='Spending_Score', data=df, kind="kde")

In [None]:
# Hexbin joint plot of age and annual income.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts the two
# vectors as positional arguments.
sns.jointplot(x='Age', y='Annual_Income', data=df, kind="hex")

In [None]:
# Mean spending score for every distinct annual-income value,
# with 95% confidence-interval error bars.
sns.barplot(
    data=df,
    x='Annual_Income',
    y='Spending_Score',
    estimator=np.mean,
    ci=95,
)


In [None]:
# Reminder of the columns available for grouping below.
df.columns

In [None]:
# Total spending score for every (gender, age band) pair: rows are genders,
# columns are age bands. Shown both as a table and as a heatmap.
grouped = df.groupby(by=['Gender', 'Age_cat'])
spending_totals = grouped['Spending_Score'].sum()
df_wh = spending_totals.unstack()
df_wh

sns.heatmap(df_wh)

In [None]:
# Total annual income for every (gender, age band) pair: rows are genders,
# columns are age bands. Shown both as a table and as a heatmap.
grouped = df.groupby(by=['Gender', 'Age_cat'])
income_totals = grouped['Annual_Income'].sum()
df_wh = income_totals.unstack()
df_wh

sns.heatmap(df_wh)

In [None]:
# Total annual income per (age band, spending band) pair, drawn with the
# Spectral colour map.
grouped = df.groupby(by=['Age_cat', 'Spending_Score_cat'])
income_by_band = grouped['Annual_Income'].sum()
df_wq = income_by_band.unstack()
sns.heatmap(df_wq, cmap=plt.cm.Spectral)

In [None]:
# Box plots of age against spending score, faceted into a grid with one row
# per spending band and one column per age band.
# The former `estimator=np.sum` argument is dropped: catplot only forwards an
# estimator to the aggregating kinds (bar/point), so it had no meaning for
# kind='box'.
sns.catplot(
    data=df,
    x='Spending_Score',
    y='Age',
    row='Spending_Score_cat',
    col='Age_cat',
    kind='box',
)

In [None]:
# Line plot of spending score against annual income, one panel per age band;
# rows sharing the same income are aggregated by summing their scores.
sns.relplot(
    data=df,
    x='Annual_Income',
    y='Spending_Score',
    col='Age_cat',
    kind='line',
    estimator=np.sum,
)

### Select numeric features and scale the dataset

In [None]:
# Check column dtypes and frame size before selecting the features to scale.
df.dtypes
df.shape

In [None]:
# Spending score as a NumPy array.
# NOTE(review): `y` is not referenced again in the visible part of the notebook.
y=df['Spending_Score'].values

In [None]:
# Independent copy of the int64 columns only.
# NOTE(review): the three column names used for the anomaly frames further
# down suggest this yields Age, Annual_Income and Spending_Score (Gender
# being object dtype after encoding) - confirm with num1.head().
num1=df.select_dtypes('int64').copy()

In [None]:
# Verify which columns were selected for scaling.
num1.shape
num1.head()

In [None]:
# Scaler that standardises each feature to zero mean and unit variance.
ss=StandardScaler()

In [None]:
# Learn the per-column mean and standard deviation from the selected features.
ss.fit(num1)

In [None]:
# Standardised feature matrix (NumPy array) used by all models below.
X=ss.transform(num1)

In [None]:
# First five scaled rows.
X[:5,]

### Perform Clustering using GaussianMixtureModeling

In [None]:
# Three-component Gaussian mixture: EM is restarted from 10 initialisations
# (the best run is kept) and each run is capped at 100 iterations.
gm = GaussianMixture(n_components=3, n_init=10, max_iter=100)

In [None]:
# Estimate the mixture parameters from the scaled features.
gm.fit(X)

In [None]:
# Component centres, one row per component, in scaled feature space.
gm.means_

In [None]:
# Whether EM converged within max_iter for the best initialisation.
gm.converged_

In [None]:
# Number of EM steps the best run needed.
gm.n_iter_

In [None]:
# Hard cluster label (most probable component) for every customer.
gm.predict(X)

In [None]:
# Mixing weight of each component.
gm.weights_

In [None]:
# Fraction of customers hard-assigned to each component, for comparison
# with gm.weights_.
cluster_labels, cluster_sizes = np.unique(gm.predict(X), return_counts=True)
cluster_sizes / len(X)

In [None]:
# Draw one random point (and its component label) from the fitted mixture.
gm.sample()

In [None]:
# Open a new, empty matplotlib figure.
# NOTE(review): the scatter calls live in separate cells; with an inline
# notebook backend they may not end up on this figure - confirm.
fig=plt.figure()

In [None]:
# First two scaled features, coloured by the cluster label from gm.
plt.scatter(X[:,0],X[:,1],c=gm.predict(X),s=2)

In [None]:
# Mark the component means (first two scaled features) with red triangles,
# then render the figure.
plt.scatter(
    gm.means_[:, 0],
    gm.means_[:, 1],
    color='red',
    marker='v',
    s=5,            # marker size
    linewidths=5,   # linewidth of marker edges
)
plt.show()

In [None]:
# Log-likelihood of every customer under the fitted mixture; low values mark
# points the model finds unlikely.
densities=gm.score_samples(X)

In [None]:
# Inspect the per-customer scores.
densities

In [None]:
# Cut-off at the 4th percentile: customers scoring below it are treated as
# anomalous further down.
density_threshold=np.percentile(densities,4)

In [None]:
# Show the chosen cut-off value.
density_threshold

### Discover ideal number of Clusters - Using aic and bic measures and Scree plot

In [None]:
# Accumulators for the information criteria, one entry per candidate model.
# Re-run this cell before re-running the fitting loop, otherwise the lists
# keep growing.
bic = []
aic = []

In [None]:
# Fit mixtures with 1..8 components and record BIC and AIC for each.
# The loop uses its own variable so that `gm` keeps referring to the
# 3-component model fitted earlier: `densities` and `density_threshold` were
# computed from that model, and the later plots colour points with
# gm.predict(X). Rebinding `gm` here would silently switch those plots to
# the 8-component model.
for n_components in range(1, 9):
    candidate = GaussianMixture(
        n_components=n_components,
        n_init=10,
        max_iter=100,
    )
    candidate.fit(X)
    bic.append(candidate.bic(X))
    aic.append(candidate.aic(X))

In [None]:
# Scree plot of AIC and BIC against the number of mixture components.
# The x positions are derived from the score lists instead of being
# hard-coded, and the curves are labelled so the two criteria can be told
# apart.
n_components_range = range(1, len(aic) + 1)
fig = plt.figure()
plt.plot(n_components_range, aic, label='AIC')
plt.plot(n_components_range, bic, label='BIC')
plt.xlabel('Number of components')
plt.ylabel('Information criterion')
plt.legend()
plt.show()

In [None]:
# Project the scaled features to two dimensions with t-SNE and colour each
# point by the label predicted by `gm`.
tsne = TSNE(n_components=2)
tsne_out = tsne.fit_transform(X)
plt.scatter(
    tsne_out[:, 0],
    tsne_out[:, 1],
    c=gm.predict(X),   # colour as per GMM label
    marker='x',
    s=50,              # marker size
    linewidths=5,      # linewidth of marker edges
)

### Observing the behaviour of anomalous customers

In [None]:
# Scaled rows whose score falls strictly below the density cut-off.
anomalies=X[densities<density_threshold]

In [None]:
# Inspect the anomalous rows (scaled values).
anomalies

In [None]:
# All customers on the first two scaled features, coloured by the label
# predicted by `gm`.
plt.scatter(X[:, 0], X[:, 1], c = gm.predict(X))

In [None]:
# Highlight the anomalous customers (first two scaled features) with red
# crosses, then render the figure.
plt.scatter(
    anomalies[:, 0],
    anomalies[:, 1],
    color='red',
    marker='x',
    s=50,           # marker size
    linewidths=5,   # linewidth of marker edges
)
plt.show()

In [None]:
# Complement of `anomalies`: rows at or above the density cut-off.
unanomalies = X[densities >= density_threshold]

In [None]:
# How many customers are considered normal, and with how many features.
unanomalies.shape

In [None]:
# Wrap both groups in DataFrames for plotting. 'w', 'x', 'y' name the three
# scaled feature columns in their order in X (construction fails if X does
# not have exactly three columns); 'z' tags each row with its group.
df_anomalies = pd.DataFrame(anomalies, columns = ['w','x', 'y'])
df_anomalies['z'] = 'anomalous'   # Add a fourth, constant column holding the group label
df_normal = pd.DataFrame(unanomalies, columns = ['w','x','y'])
df_normal['z'] = 'unanomalous'

In [None]:
# Overlay the distribution of the first scaled feature ('w') for the
# anomalous and the normal customers.
sns.distplot(df_anomalies['w'])
sns.distplot(df_normal['w'])

In [None]:
# Stack both groups into one long frame for side-by-side comparison.
# ignore_index=True renumbers the rows: both inputs start their index at 0,
# so a plain concat would leave duplicate index labels.
# Note that this rebinds `df`; the original customer table is no longer
# reachable under that name after this cell.
df = pd.concat([df_anomalies, df_normal], ignore_index=True)

In [None]:
# Compare anomalous and normal customers on the second ('x') and first ('w')
# scaled features. Each box plot gets its own axes: issuing both calls on the
# same implicit axes draws the second plot on top of the first.
fig, (ax_x, ax_w) = plt.subplots(1, 2, figsize=(12, 5))
sns.boxplot(x=df['z'], y=df['x'], ax=ax_x)
sns.boxplot(x=df['z'], y=df['w'], ax=ax_w)

## - Thanks...