<a href="https://colab.research.google.com/github/sandestiny/E-commerce-Customer-Segmentation/blob/main/DW20_Project_work_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **E-Commerce Customer Segmentation** by Sham Hiruthik



### **Data Preparation**

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn import metrics

In [None]:
# Folder expected to hold `cust_data.xlsx`. The recorded output of this cell is a
# FileNotFoundError, so only switch into the folder when it actually exists;
# otherwise stay in the current working directory and report it instead of
# aborting the run at this point.
DATA_DIR = r"/gdrive/MyDrive/Colab Notebooks/Problem Statement"
if os.path.isdir(DATA_DIR):
    os.chdir(DATA_DIR)
else:
    print(f"Data folder not found: {DATA_DIR!r}; staying in {os.getcwd()!r}")

FileNotFoundError: ignored

In [None]:
# List the entries of the current working directory.
os.listdir()

In [None]:
# Load the customer data from the Excel workbook (path is relative to the working directory).
cust = pd.read_excel('cust_data.xlsx')
# Preview the first rows.
cust.head()

In [None]:
# Per-column overview of the raw frame.
cust.info()

In [None]:
# Number of columns per dtype.
cust.dtypes.value_counts()

In [None]:
# (rows, columns) of the raw frame.
cust.shape

In [None]:
# Remove the Cust_ID column (presumably a row identifier) before encoding/clustering.
# Re-assigning instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
cust = cust.drop(columns=['Cust_ID'], errors='ignore')

### **Checking missing values**

In [None]:
def missingValues(data):
    """Summarise the columns of ``data`` that contain missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, indexed by column name,
        with ``Missing`` (null count) and ``Percentage`` (share of rows, 0-100),
        ordered by ``Percentage`` from highest to lowest.
    """
    null_counts = data.isnull().sum()
    null_share = null_counts / data.shape[0] * 100
    summary = pd.concat(
        [null_counts, null_share],
        axis=1,
        keys=["Missing", "Percentage"],
    )
    ranked = summary.sort_values(by="Percentage", ascending=False)
    # Keep only the columns that actually have something missing.
    return ranked[ranked["Percentage"] > 0]

In [None]:
# Columns of cust with at least one missing value, with counts and percentages.
missingValues(cust)

### **Missing values Treatment**

In [None]:
# Distinct values present in the Gender column.
cust['Gender'].unique()

In [None]:
# Frequency of each Gender value.
cust['Gender'].value_counts()

In [None]:
# Label missing genders as their own 'Other' category.
# Assign the result back to the column: calling fillna(..., inplace=True) on
# cust['Gender'] is chained assignment, which warns in pandas 2.x and leaves
# `cust` unchanged when Copy-on-Write is active.
cust['Gender'] = cust['Gender'].fillna('Other')

### **Dummy Creation**

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
# dtype=int keeps the dummy columns numeric: newer pandas returns bool dummies by
# default, but the later steps (quantile capping, describe()-based overall means)
# treat every column of the frame as a number.
cust1 = pd.get_dummies(cust, drop_first=True, dtype=int)

In [None]:
# Column names after dummy encoding.
cust1.columns

### **Checking Outliers**

In [None]:
# Summary statistics with extra low (1-5%) and high (95-99%) percentiles so that
# extreme values stand out; transposed so each column of cust1 becomes a row.
cust1.describe(percentiles = [0.01,0.02,0.03,0.04,0.05,0.25,0.5,0.75,0.95,0.96,0.97,0.98,0.99]).T

### **Outlier Treatment**

In [None]:
def outlier_capping(x, upper_quantile=0.99):
    """Cap the upper tail of a numeric Series at one of its own quantiles.

    Parameters
    ----------
    x : pandas.Series
        Column to treat. Boolean and non-numeric Series are returned as an
        unchanged copy, because a percentile cap is not meaningful for flags or
        labels and ``quantile`` may raise on those dtypes.
    upper_quantile : float, default 0.99
        Quantile (between 0 and 1) used as the ceiling. The default reproduces
        the original fixed 99th-percentile cap.

    Returns
    -------
    pandas.Series
        New Series in which values above the chosen quantile are replaced by
        that quantile. Lower values are left as they are.
    """
    if pd.api.types.is_bool_dtype(x) or not pd.api.types.is_numeric_dtype(x):
        return x.copy()
    ceiling = x.quantile(upper_quantile)
    return x.clip(upper=ceiling)

In [None]:
# Cap every column of the encoded frame at its own upper quantile.
cust2 = cust1.apply(outlier_capping)

In [None]:
# (rows, columns) of the capped frame.
cust2.shape

In [None]:
# Column names of the capped frame.
cust2.columns

In [None]:
# Brand columns to keep in a brand-only view of the capped data.
# NOTE(review): cust3 is not referenced by any later cell shown here - the
# clustering below is fitted on cust2. Confirm which frame was intended.
x = [
    'Jordan', 'Gatorade', 'Samsung', 'Asus', 'Udis',
    'Mondelez International', 'Wrangler', 'Vans', 'Fila', 'Brooks',
    'H&M', 'Dairy Queen', 'Fendi', 'Hewlett Packard', 'Pladis',
    'Asics', 'Siemens', 'J.M. Smucker', 'Pop Chips', 'Juniper',
    'Huawei', 'Compaq', 'IBM', 'Burberry', 'Mi',
    'LG', 'Dior', 'Scabal', 'Tommy Hilfiger', 'Hollister',
    'Forever 21', 'Colavita', 'Microsoft', 'Jiffy mix', 'Kraft',
]
cust3 = cust2.loc[:, x]

In [None]:
def myKmeans(data, k, gap, verbose=True, grp=True, random_state=42, n_init=10):
    """Fit K-Means for a range of cluster counts and collect each fit's WCSS.

    Parameters
    ----------
    data : array-like
        Feature matrix passed straight to ``KMeans.fit``.
    k : int
        Largest number of clusters to try (inclusive).
    gap : int
        Step between successive cluster counts, starting from 1.
    verbose : bool, default True
        Print the WCSS of every fit.
    grp : bool, default True
        Draw the elbow curve (WCSS against number of clusters) on the current
        matplotlib figure and show it.
    random_state : int or None, default 42
        Seed handed to ``KMeans`` so repeated runs give the same curve.
        Pass ``None`` for unseeded behaviour.
    n_init : int, default 10
        Number of centroid initialisations per fit, set explicitly so the
        result does not depend on the installed scikit-learn's default.

    Returns
    -------
    tuple of (list, list)
        ``wcss`` - the inertia of each fit, and ``cluster`` - the matching
        cluster counts, in the order they were tried.
    """
    wcss = []
    cluster = []
    for i in range(1, k + 1, gap):
        km = KMeans(n_clusters=i, n_init=n_init, random_state=random_state)
        intr = km.fit(data).inertia_
        cluster.append(i)
        wcss.append(intr)
        if verbose:
            print(f"for cluster {i} ; WCSS = {round(intr, 2)}")
    if grp:
        plt.plot(cluster, wcss, label="within cluster sum of squares (wcss)")
        plt.xlabel("Clusters")
        plt.ylabel("Wcss")
        plt.legend()
        plt.show()
    return wcss, cluster

In [None]:
plt.figure(figsize = [10,7])
# Keep the returned lists instead of letting the tuple echo as the cell output
# (the same numbers are already printed because verbose=True).
wcss_by_k, k_values = myKmeans(data=cust2, k=10, gap=1, verbose=True)


**n_clusters = 2**

In [None]:
# Seeded 2-cluster fit: without random_state the labels (and therefore the
# cluster profile, silhouette score and exported workbook) change between runs.
# Settings mirror the KMeans used in the silhouette-analysis cell.
km = KMeans(n_clusters=2, init='k-means++', n_init=10, random_state=42)
km_result = km.fit(cust2)

In [None]:
# Cluster label assigned to each row by the 2-cluster fit.
km_result.labels_

In [None]:
# Preview only the first rows rather than rendering the whole frame.
cust2.head()

In [None]:
# Work on a copy so the cluster label column is not added to cust2 itself.
cust4=cust2.copy()

In [None]:
# Attach the 2-cluster labels from the most recent fit as a new column.
cust4["cluster2"]=km_result.labels_
# Preview the labelled frame.
cust4.head()

In [None]:
# Number of rows assigned to each of the two clusters.
cust4["cluster2"].value_counts()

In [None]:
# Per-cluster mean of every feature, transposed so features are rows and
# clusters are columns; the feature names end up in an "index" column.
cluster2_means = cust4.groupby("cluster2").mean()
clust2 = cluster2_means.T
clust2.columns = ["clust1", "clust2"]
clust2 = clust2.reset_index()
clust2

In [None]:
# Overall (un-clustered) mean of each feature, as a two-column frame keyed by
# feature name so it can be merged with the per-cluster means.
overall = cust2.describe().T[["mean"]].reset_index()
overall.columns = ["index", "Overall_mean"]

In [None]:
# Put the overall mean next to the two cluster means, one row per feature.
final2 = pd.merge(overall, clust2, on="index", how="inner")
final2

**Silhouette score 2**

In [None]:
# Silhouette score of the current fit on cust2.
# NOTE: km_result is re-bound by the later 3- and 4-cluster cells, so this only
# reflects the 2-cluster model when run right after the 2-cluster fit.
score1 = metrics.silhouette_score(cust2, km_result.labels_)
print('Silhouette Score : % 0.3f' % score1)

In [None]:
# Column names of the labelled frame (features plus cluster2).
cust4.columns

In [None]:
# Export the 2-cluster comparison table to an Excel workbook (relative path).
final2.to_excel("Cluster_2.xlsx" )

### **Silhouette Analysis**

In [None]:
! pip install yellowbrick

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer

# One silhouette plot per candidate number of clusters, on a 2x2 grid.
fig, ax = plt.subplots(2, 2, figsize=(15,8))
for i in [2, 3, 4, 5]:
    # Fresh, seeded KMeans estimator for this number of clusters.
    km = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=100, random_state=42)
    # divmod maps i = 2, 3, 4, 5 onto grid cells (0,0), (0,1), (1,0), (1,1).
    q, mod = divmod(i, 2)
    # Bind a SilhouetteVisualizer to the matching subplot and fit it on the data.
    visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=ax[q-1][mod])
    visualizer.fit(cust2)
    # Add the title, axis labels and legend to this subplot; show() is not used
    # per visualizer because all four plots share one figure.
    visualizer.finalize()
fig.tight_layout()
plt.show()

**n_clusters = 3**

In [None]:
# Seeded 3-cluster fit: without random_state the labels (and therefore the
# cluster profile, silhouette score and exported workbook) change between runs.
# Settings mirror the KMeans used in the silhouette-analysis cell.
km = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42)
km_result = km.fit(cust2)

In [None]:
# Cluster label assigned to each row by the 3-cluster fit.
km_result.labels_

In [None]:
# Work on a copy so the cluster label column is not added to cust2 itself.
cust5=cust2.copy()

In [None]:
# Attach the 3-cluster labels from the most recent fit as a new column.
cust5["cluster3"]=km_result.labels_

In [None]:
# Number of rows assigned to each of the three clusters.
cust5["cluster3"].value_counts()

In [None]:
# Per-cluster mean of every feature, transposed so features are rows and
# clusters are columns; the feature names end up in an "index" column.
cluster3_means = cust5.groupby("cluster3").mean()
clust3 = cluster3_means.T
clust3.columns = ["clust1", "clust2", "clust3"]
clust3 = clust3.reset_index()
clust3

In [None]:
# Put the overall mean next to the three cluster means, one row per feature.
final3 = pd.merge(overall, clust3, on="index", how="inner")
final3

**Silhouette score 3**

In [None]:
# Silhouette score of the current fit on cust2.
# NOTE: km_result is re-bound by every fitting cell, so this only reflects the
# 3-cluster model when run right after the 3-cluster fit.
score2 = metrics.silhouette_score(cust2, km_result.labels_)
print('Silhouette Score : % 0.3f' % score2)

In [None]:
# Export the 3-cluster comparison table to an Excel workbook (relative path).
final3.to_excel("Cluster_3.xlsx" )

**n_clusters = 4**

In [None]:
# Seeded 4-cluster fit: without random_state the labels (and therefore the
# cluster profile, silhouette score and exported workbook) change between runs.
# Settings mirror the KMeans used in the silhouette-analysis cell.
km = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=42)
km_result = km.fit(cust2)

In [None]:
# Cluster label assigned to each row by the 4-cluster fit.
km_result.labels_

In [None]:
# Work on a copy so the cluster label column is not added to cust2 itself.
cust6=cust2.copy()

In [None]:
# Attach the 4-cluster labels from the most recent fit as a new column.
cust6["cluster4"]=km_result.labels_

In [None]:
# Number of rows assigned to each of the four clusters.
cust6["cluster4"].value_counts()

In [None]:
# Per-cluster mean of every feature, transposed so features are rows and
# clusters are columns; the feature names end up in an "index" column.
cluster4_means = cust6.groupby("cluster4").mean()
clust4 = cluster4_means.T
clust4.columns = ["clust1", "clust2", "clust3", "clust4"]
clust4 = clust4.reset_index()
clust4

In [None]:
# Put the overall mean next to the four cluster means, one row per feature.
final4 = pd.merge(overall, clust4, on="index", how="inner")
final4

**Silhouette score 4**

In [None]:
# Silhouette score of the current fit on cust2.
# NOTE: km_result is re-bound by every fitting cell, so this only reflects the
# 4-cluster model when run right after the 4-cluster fit.
score3 = metrics.silhouette_score(cust2, km_result.labels_)
print('Silhouette Score : % 0.3f' % score3)

In [None]:
# Export the 4-cluster comparison table to an Excel workbook (relative path).
final4.to_excel("Cluster_4.xlsx" )