In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the path of every file found under the Kaggle input directory,
# then print them one per line.
input_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from scipy.stats import zscore
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Display settings: no limit on the number of columns shown, at most 200 rows.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 200),
):
    pd.set_option(option_name, option_value)

In [None]:
df=pd.read_csv('../input/ecommerce-data/data.csv',encoding = 'ISO-8859-1')
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.isna().sum()

Dropping the rows that have missing values in the CustomerID column: these rows cannot be attributed to any customer, so they are not useful for customer segmentation, and CustomerID cannot be imputed meaningfully.

In [None]:
df=df.drop(df[df['CustomerID'].isna()==True].index,axis=0)

In [None]:
df.shape

In [None]:
df.isna().sum()  #rechecking missing values

# Data Analysis

In [None]:
df['Country'].value_counts()
# Maximum orders are coming from UK

In [None]:
print(df['Country'].unique())
print('Total no. of countries from where customers belong: ',df['Country'].nunique())

In [None]:
print('Total no. of customers: ',df['CustomerID'].nunique())
print('Total transactions done: ',df['InvoiceNo'].nunique())
print('Products sold are : ',df['StockCode'].nunique())

In [None]:
# Need to check the cancelled orders as well as they are of not use for customer segmentation,
#'C'mentioned before the Invoiceno indicates that the order is cancelled
df[df['InvoiceNo'].apply(lambda x: x[0]=='C')]

In [None]:
percent_transaction_cancelled = round((df[df['InvoiceNo'].apply(lambda x: x[0]=='C')]['InvoiceNo'].nunique()/ df['InvoiceNo'].nunique())*100,2)
print('Percentage of Transactions cancelled are : ',percent_transaction_cancelled)

Dropping the rows that contain cancelled orders, as they won't be helpful for customer segmentation.

In [None]:
# Keep only rows whose InvoiceNo does not start with 'C' (i.e. not cancelled).
# The vectorised string test replaces a per-row lambda that raised on empty or
# non-string values; .copy() yields an independent frame so that the column
# assignments made on df in later cells do not act on a slice of the old frame.
df = df[~df['InvoiceNo'].astype(str).str.startswith('C')].copy()
df.shape

Creating new features per customer: Amount (total purchase amount per CustomerID), Frequency (number of invoice line items recorded for the customer) and Recency (number of days between the customer's most recent transaction and the latest invoice date in the dataset).

In [None]:
df['Amount'] = df['Quantity'] * df['UnitPrice']

In [None]:
a=df.groupby('CustomerID').sum()['Amount']
a= a.reset_index()

In [None]:
a.head()

In [None]:
b= df.groupby('CustomerID')['InvoiceNo'].count()
b= b.reset_index()
b.columns = ['CustomerID', 'Frequency']

In [None]:
b.head()

In [None]:
df1= pd.merge(a, b, on='CustomerID', how='inner')
df1.head()

In [None]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
max_date = df['InvoiceDate'].max()
max_date

In [None]:
df['days_diff'] = max_date - df['InvoiceDate']
df.head()

In [None]:
c = df.groupby('CustomerID')['days_diff'].min()
c = c.reset_index()
c.head()

In [None]:
c['days_diff'] = c['days_diff'].dt.days
c.head()

In [None]:
df1 = pd.merge(df1, c, on='CustomerID', how='inner')
df1.columns = ['CustomerID', 'Amount', 'Frequency', 'Recency']
df1.head()

In [None]:
df2=df1[['Amount','Frequency','Recency']]   # using only Amount, Frequency and Recency to find the customer segments.

In [None]:
df1_scaled = df2.apply(zscore)    # scaling of data is required as all the calculations is based on distance
df1_scaled.head()

# K-Means Clustering

In [None]:
kmeans = KMeans(random_state=2)
kmeans.fit(df1_scaled)

In [None]:
cluster_range = range( 1, 15 )
cluster_errors = []
for num_clusters in cluster_range:
    clusters = KMeans( num_clusters, n_init = 10 )
    clusters.fit(df1_scaled)
    cluster_errors.append( clusters.inertia_ )
clusters_df = pd.DataFrame( { "num_clusters":cluster_range, "cluster_errors": cluster_errors } )
clusters_df[0:15]

In [None]:
plt.figure(figsize=(12,6))
plt.plot( clusters_df.num_clusters, clusters_df.cluster_errors, marker = "o" )

From the cluster table above, the reduction in error becomes small from 3 clusters onwards, which is also evident from the elbow plot. We can therefore expect our customers to fall into 3 segments.

In [None]:
kmeans = KMeans(n_clusters=3, n_init = 15, random_state=2)
kmeans.fit(df1_scaled)
centroids = kmeans.cluster_centers_
centroid_df = pd.DataFrame(centroids, columns = list(df1_scaled) )
centroid_df

In [None]:
df_labels = pd.DataFrame(kmeans.labels_ , columns = list(['labels']))
df_labels['labels'] = df_labels['labels'].astype('category')

In [None]:
df_kmeans = df1.join(df_labels)
df_kmeans.head()

In [None]:
sns.pairplot(df_kmeans,diag_kind='kde',hue='labels')

# Hierarchical Clustering

In [None]:
# Ward-linkage merge tree computed on the scaled features (Euclidean metric).
Z = linkage(df1_scaled, method='ward', metric='euclidean')

# Full dendrogram of the merge tree.
full_dendrogram_kwargs = {
    'leaf_rotation': 90.,   # rotates the x axis labels
    'leaf_font_size': 8.,   # font size for the x axis labels
}
plt.figure(figsize=(25, 10))
plt.ylabel('distance')
plt.xlabel('sample index')
plt.title('Hierarchical Clustering Dendrogram')
dendrogram(Z, **full_dendrogram_kwargs)
plt.show()

Dendrogram Truncation

In [None]:
# Same merge tree, truncated so that only the top of the hierarchy is drawn.
truncated_kwargs = dict(
    truncate_mode='lastp',   # truncate to the last p merged clusters ...
    p=12,                    # ... with p = 12
    show_leaf_counts=False,  # otherwise numbers in brackets are counts
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,    # to get a distribution impression in truncated branches
)
plt.ylabel('distance')
plt.xlabel('sample index')
plt.title('Hierarchical Clustering Dendrogram (truncated)')
dendrogram(Z, **truncated_kwargs)
plt.show()

In [None]:
hie_clus = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')
labels = hie_clus.fit_predict(df1_scaled)

df_h = df1.copy(deep=True)
df_h['label'] = labels
df_h['label']=df_h['label'].astype('category')
df_h.head()

In [None]:
sns.pairplot(df_h,diag_kind='kde',hue='label')

# Inferences

From the centroids of Kmeans clustering:
1. Customers belonging to cluster 0 visited recently, visit the webpage with moderate frequency, and purchase products of a moderate amount.
2. Customers belonging to cluster 1 have not visited the webpage recently; they have the lowest frequency and the smallest transaction amount.
3. Customers belonging to cluster 2 appear to be loyal customers: they visit the webpage frequently and have very large transaction amounts.

From Hierarchical clustering:
1. Customers belonging to cluster 0 are loyal customers, the same as K-Means cluster 2.
2. Customers belonging to cluster 1 are the potential customers, the same as K-Means cluster 0.
3. Customers belonging to cluster 2 are the same as K-Means cluster 1.

Thus the focus should be on retaining the customers belonging to cluster 2 (K-Means) and cluster 0 (hierarchical) by providing better services and discounts. Customers belonging to cluster 0 (K-Means) and cluster 1 (hierarchical) are the potential customers who could be converted into loyal customers. To that end, a survey form could be sent to those customers to understand their needs and preferences, which can then be worked upon.