### Importing packages

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import seaborn as sns
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_samples, silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
from scipy.spatial.distance import pdist
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering

## Importing Data  

In [None]:
# Load the raw transaction export and preview the first rows.
# NOTE(review): if this read raises UnicodeDecodeError, the CSV is probably not
# UTF-8 encoded; passing encoding='ISO-8859-1' is the usual remedy - confirm
# against the actual file before changing it.
csv_path = '../input/ecommerce-data/data.csv'
data = pd.read_csv(csv_path)
print(f'The shape of the data is {data.shape}')
data.head(10)

> *The dataset is a collection of details of products purchased in transactions (identified by invoice no.) by several customers on an e-commerce website. 
It consists of 8 columns describing each product bought on the e-commerce website. For each invoice no. (transaction), there are multiple rows, one for each product in that transaction. The stock code is the code no. of the product and is used to identify the product. Description is the product description, and Quantity and UnitPrice are the no. of units and the price of each unit of that product. InvoiceDate consists of the date and time of purchase, and CustomerID is used to identify each customer uniquely.*

In [None]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
data.info()

In [None]:
# Count missing values per column, worst offenders first.
missing_per_column = data.isna().sum(axis=0)
missing_per_column.sort_values(ascending=False)

In [None]:
# Rows without a customer id or a product description cannot be attributed to a
# customer/product, so they are removed before any feature is computed.
required_columns = ['CustomerID', 'Description']
data.dropna(subset=required_columns, how='any', inplace=True)

print(f'The shape of the data is {data.shape}')
print("Null Values in each Column:")
remaining_nulls = data.isna().sum(axis=0)
remaining_nulls.sort_values(ascending=False)

In [None]:
# Report how many fully duplicated rows exist, drop them (keeping the first
# occurrence of each), then report again to confirm none remain.
duplicates_before = data.duplicated(keep='first').sum(axis=0)
print(f"The number of duplicates in the dataset is {duplicates_before}")

data.drop_duplicates(inplace=True)

duplicates_after = data.duplicated(keep='first').sum(axis=0)
print(f"Duplicates Dropped...\nThe number of duplicates in the dataset now is {duplicates_after}")

#### In the above code, all rows missing the CustomerID or the product Description are identified and dropped.
#### Duplicate rows are also searched for, and only a single copy of each is kept.

In [None]:
# Keep only rows with a positive quantity.
# The explicit .copy() makes `data` an independent frame: later cells assign new
# columns to it, and assigning into a boolean-mask slice triggers pandas'
# SettingWithCopyWarning (and may silently write to a temporary).
data = data[data['Quantity'] > 0].copy()
print(f'The shape of the data is {data.shape}')

In [None]:
# Headline cardinalities of the cleaned data: invoices, products and customers.
summary_columns = (
    ('Number of transaction: ', 'InvoiceNo'),
    ('Number of unique products: ', 'StockCode'),
    ('Number of customers: ', 'CustomerID'),
)
for caption, column in summary_columns:
    print(caption, data[column].nunique())

The given dataset consist of information which cannot be used as a feature for clustering customers based on the purchases.
Hence we do some feature engineering to generate some useful and direct features from the given dataset. Here are the features: 
1. Recency: One of the most important features for customer segmentation is how recently the customer last transacted on the e-commerce website. This helps in targeting customers who have transacted recently and segregates the inactive customers, so that they can be marketed to differently. Recency is calculated by grouping all the transactions made by a customer and finding the difference between the reference date and the date of the latest transaction. For this, the dataset's datetime column is first converted to a type that supports date arithmetic. Recency is measured in days.

2. Frequency: Frequency is the number of transactions the customer has made on the e-commerce website. This helps in separating regular customers from one-time customers and marketing to them appropriately.

3. Monetary: Monetary is the total amount spent by the customer across all transactions made on the website. This helps in separating customers who are spendthrift, or who have the capacity to spend a good amount on costly products in future transactions, from customers who have purchased goods of lower value, so that the two groups can be marketed to differently once segmented.

4. AvgQuantity: AvgQuantity determines the average number of products(quantity-wise) purchased by the customer in each transaction. This helps in keeping track of customers who purchase goods in large quantities and marketing them appropriately.

5. OldCust: OldCust measures how long the customer has been purchasing from the e-commerce website. Using this feature helps distinguish long-standing customers from comparatively newer ones.

In [None]:
# Parse the invoice timestamps into datetime64[ns].
# pd.to_datetime is used instead of .astype('datetime64'): the unit-less
# 'datetime64' dtype is rejected with a TypeError by pandas 2.x, while
# to_datetime works on every pandas version.
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
print(data['InvoiceDate'].max())

In [None]:
# Fixed reference date against which Recency and OldCust are measured.
now = dt.date(2011, 12, 9)
print(f'Date of Reference: {now}')

# Calendar date (time of day dropped) of every invoice line.
data['Date'] = data['InvoiceDate'].dt.date

In [None]:
# Preview the frame to check the newly added 'Date' column.
data.head()

In [None]:
# Recency: days between the reference date `now` and each customer's latest purchase.
recency_df = (
    data.groupby(by='CustomerID', as_index=False)['Date']
        .max()
        .rename(columns={'Date': 'LastPurshaceDate'})
)
recency_df['Recency'] = [(now - last_day).days for last_day in recency_df['LastPurshaceDate']]
recency_df = recency_df.drop(columns='LastPurshaceDate')
recency_df.head()

In [None]:
# Frequency: number of distinct invoices (transactions) per customer.
# Counting unique InvoiceNo values per customer gives the same numbers as
# de-duplicating (InvoiceNo, CustomerID) pairs and counting rows, without
# having to copy the whole transaction frame first.
frequency_df = (
    data.groupby('CustomerID')['InvoiceNo']
        .nunique()
        .reset_index(name='Frequency')
)
frequency_df.head()

In [None]:
# Line-level spend: units bought times the price of one unit.
data['TotalCost'] = data['Quantity'].mul(data['UnitPrice'])
data.head(10)

In [None]:
# Monetary: total amount spent by each customer over all invoice lines.
monetary_df = (
    data.groupby(by='CustomerID', as_index=False)['TotalCost']
        .sum()
        .rename(columns={'TotalCost': 'Monetary'})
)
monetary_df.head()

In [None]:
# AvgQuantity: average number of units a customer buys per transaction.
# The quantity is first summed per invoice and only then averaged per customer.
# Averaging the raw rows instead would give the mean quantity per invoice *line*,
# which understates customers whose invoices contain many different products.
units_per_invoice = data.groupby(['CustomerID', 'InvoiceNo'])['Quantity'].sum()
quantity_df = (
    units_per_invoice.groupby(level='CustomerID')
        .mean()
        .reset_index(name='AvgQuantity')
)
quantity_df.head()

In [None]:
# OldCust: days between the reference date `now` and each customer's first purchase.
oldcust_df = (
    data.groupby(by='CustomerID', as_index=False)['Date']
        .min()
        .rename(columns={'Date': 'FirstPurshaceDate'})
)
oldcust_df['OldCust'] = [(now - first_day).days for first_day in oldcust_df['FirstPurshaceDate']]
oldcust_df = oldcust_df.drop(columns='FirstPurshaceDate')
oldcust_df.head()

In [None]:
# Assemble the per-customer feature table by inner-joining every feature frame
# on CustomerID, then index the result by customer.
customer_data = recency_df
for feature_df in (frequency_df, monetary_df, quantity_df, oldcust_df):
    customer_data = customer_data.merge(feature_df, on='CustomerID')
customer_data = customer_data.set_index('CustomerID')
customer_data.head()

In [None]:
# Pairwise scatter plots of the features, with KDE curves on the diagonal.
sns.pairplot(data=customer_data, diag_kind='kde')

In [None]:
# Annotated heatmap of the pairwise feature correlations.
feature_corr = customer_data.corr()
sns.heatmap(feature_corr, annot=True)

In [None]:
# Z-score standardisation. The column means and standard deviations are kept in
# mean_cd / std_cd so that results can be mapped back to the original units later.
mean_cd = customer_data.mean()
std_cd = customer_data.std()
customer_data = customer_data.sub(mean_cd).div(std_cd)
customer_data.head()

## Hierarchical Clustering

### Single Agglomerative Clustering

In [None]:
# Single-linkage agglomerative clustering of the standardized customer features.
X_sig = customer_data.copy()
Y_sig = X_sig.to_numpy()

# `affinity` was renamed to `metric` (deprecated in scikit-learn 1.2, removed in
# 1.4), so passing it raises TypeError on current releases. Euclidean distance is
# the default under both names, so the argument is omitted to work on every version.
sig_agg = AgglomerativeClustering(n_clusters=5, linkage='single')
model_sig = sig_agg.fit(X_sig)

# SciPy linkage matrix, used for the dendrogram and the cophenetic correlation.
Z_sig = linkage(X_sig, 'single')
fig_sig = plt.figure(figsize=(10, 7))
dn_sig = dendrogram(Z_sig)

# Store 1-based cluster labels; added only after the linkage was computed so the
# label column is never treated as a feature.
X_sig['Label'] = model_sig.labels_ + 1

c_sig, coph_dists_sig = cophenet(Z_sig, pdist(Y_sig))
print(f'Cophenetic Correlation Value: {c_sig}')
plt.show()

### Complete Agglomerative Clustering

In [None]:
# Complete-linkage agglomerative clustering of the standardized customer features.
X_comp = customer_data.copy()
Y_comp = X_comp.to_numpy()

# `affinity` was renamed to `metric` (deprecated in scikit-learn 1.2, removed in
# 1.4), so passing it raises TypeError on current releases. Euclidean distance is
# the default under both names, so the argument is omitted to work on every version.
comp_agg = AgglomerativeClustering(n_clusters=5, linkage='complete')
model_comp = comp_agg.fit(X_comp)

# SciPy linkage matrix, used for the dendrogram and the cophenetic correlation.
Z_comp = linkage(X_comp, 'complete')
fig_comp = plt.figure(figsize=(10, 7))
dn_comp = dendrogram(Z_comp)

# Store 1-based cluster labels; added only after the linkage was computed so the
# label column is never treated as a feature.
X_comp['Label'] = model_comp.labels_ + 1

c_comp, coph_dists_comp = cophenet(Z_comp, pdist(Y_comp))
print(f'Cophenetic Correlation Value: {c_comp}')
plt.show()

### Average Agglomerative Clustering

In [None]:
# Average-linkage agglomerative clustering of the standardized customer features.
X_avg = customer_data.copy()
Y_avg = X_avg.to_numpy()

# `affinity` was renamed to `metric` (deprecated in scikit-learn 1.2, removed in
# 1.4), so passing it raises TypeError on current releases. Euclidean distance is
# the default under both names, so the argument is omitted to work on every version.
avg_agg = AgglomerativeClustering(n_clusters=5, linkage='average')
model_avg = avg_agg.fit(X_avg)

# SciPy linkage matrix, used for the dendrogram and the cophenetic correlation.
Z_avg = linkage(X_avg, 'average')
fig_avg = plt.figure(figsize=(10, 7))
dn_avg = dendrogram(Z_avg)

# Store 1-based cluster labels; added only after the linkage was computed so the
# label column is never treated as a feature.
X_avg['Label'] = model_avg.labels_ + 1

c_avg, coph_dists_avg = cophenet(Z_avg, pdist(Y_avg))
print(f'Cophenetic Correlation Value: {c_avg}')
plt.show()

### Ward Agglomerative Clustering

In [None]:
# Ward-linkage agglomerative clustering of the standardized customer features.
X_ward = customer_data.copy()
Y_ward = X_ward.to_numpy()

# `affinity` was renamed to `metric` (deprecated in scikit-learn 1.2, removed in
# 1.4), so passing it raises TypeError on current releases. Ward linkage only
# supports Euclidean distance, which is the default, so the argument is omitted.
ward_agg = AgglomerativeClustering(n_clusters=5, linkage='ward')
model_ward = ward_agg.fit(X_ward)

# SciPy linkage matrix, used for the dendrogram and the cophenetic correlation.
Z_ward = linkage(X_ward, 'ward')
fig_ward = plt.figure(figsize=(10, 7))
dn_ward = dendrogram(Z_ward)

# Store 1-based cluster labels; added only after the linkage was computed so the
# label column is never treated as a feature.
X_ward['Label'] = model_ward.labels_ + 1

c_ward, coph_dists_ward = cophenet(Z_ward, pdist(Y_ward))
print(f'Cophenetic Correlation Value: {c_ward}')
plt.show()

## K-means clustering

In [None]:
# Elbow analysis: fit K-means for k = 1..14 and record the inertia
# (within-cluster sum of squared distances) of every fit.
X = customer_data.copy()
cluster_range = range(1, 15)
cluster_errors = []
cluster_sil_scores = []

for num_clusters in cluster_range:
    clusters = KMeans(
        n_clusters=num_clusters,
        init='k-means++',
        n_init=100,
        random_state=0,
    ).fit(X)
    labels, centroids = clusters.labels_, clusters.cluster_centers_
    cluster_errors.append(clusters.inertia_)

clusters_df = pd.DataFrame({"num_clusters": cluster_range, "cluster_errors": cluster_errors})
clusters_df.iloc[0:10]

In [None]:
# Elbow curve: inertia as a function of the number of clusters.
elbow_fig, elbow_ax = plt.subplots(figsize=(12, 6))
elbow_ax.plot(clusters_df['num_clusters'], clusters_df['cluster_errors'], marker="o")
elbow_ax.set_xlabel('Number of Clusters')
elbow_ax.set_ylabel('Cluster Errors')

In [None]:
# Average silhouette score for k = 2..9 (the score is undefined for one cluster).
for k in range(2,10):
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' (1.2 warns, 1.4 switches), so leaving it implicit makes the scores
    # depend on the installed version. 10 is the historical default.
    cluster = KMeans(n_clusters=k, n_init=10, random_state=0)
    labels = cluster.fit_predict(X)

    sil_avg = silhouette_score(X, labels)
    print('For',k,'clusters, average silhoutte score =',sil_avg)

In [None]:
# Rebuild the feature table from the per-feature frames and standardise it again,
# so mean_cd / std_cd hold the statistics of the *unscaled* features, which are
# needed to map the centroids back to original units.
customer_data = recency_df
for feature_df in (frequency_df, monetary_df, quantity_df, oldcust_df):
    customer_data = customer_data.merge(feature_df, on='CustomerID')
customer_data = customer_data.set_index('CustomerID')

mean_cd = customer_data.mean()
std_cd = customer_data.std()
customer_data = customer_data.sub(mean_cd).div(std_cd)
customer_data.head()

In [None]:
# Final model: K-means with 5 clusters on the standardised features.
X = customer_data.copy()
clusters = KMeans(n_clusters=5, init='k-means++', n_init=100, random_state=42).fit(X)
labels, centroids = clusters.labels_, clusters.cluster_centers_

# Cluster ids are stored 1-based (KMeans labels start at 0).
X['Label'] = labels + 1

In [None]:
# Cluster centres in standardised (z-score) feature space.
centroids

In [None]:
# Convert the per-feature mean / std to plain NumPy arrays for broadcasting
# against the centroid matrix.
# np.asarray accepts both a pandas Series and an ndarray, so re-running this cell
# is harmless; `.values` would raise AttributeError on the second run because the
# names are already ndarrays by then.
mean_cd = np.asarray(mean_cd)
std_cd = np.asarray(std_cd)

In [None]:
# Undo the z-score scaling so the centroids are expressed in original feature units.
# The int32 cast truncates toward zero (it does not round).
cent_vals = (centroids * std_cd + mean_cd).astype('int32')

In [None]:
# Centroids mapped back to original feature units, as integers.
cent_vals

In [None]:
# Number of customers assigned to each cluster label.
cluster_count = X['Label'].value_counts()

In [None]:
# Print the size and the centroid (in original units) of every cluster.
# Cluster labels are 1-based while rows of `cent_vals` are 0-based. The loop bound
# is derived from the centroid array rather than hard-coded, so the report stays
# correct if the number of clusters is changed.
for i in range(1, len(cent_vals) + 1):
    centroid_row = cent_vals[i-1]
    print(f"Cluster {i}:")
    # .get avoids a KeyError should a label have no members.
    print(f"The number of customers in this cluster is {cluster_count.get(i, 0)}.")
    print(f"Centroid(Mean) Features of Cluster {i}:\n ")
    print(f"Recency = {centroid_row[0]} \nFrequency = {centroid_row[1]} \nMonetary = {centroid_row[2]} \nAvgQuantity = {centroid_row[3]} \nOldCust = {centroid_row[4]} \n\n\n")

>From the above output we can infer that, we can cluster our customers in 5 clusters:

>Cluster 1: It includes recent customers (based on recency) who have purchased only once, with a low purchase amount. On average, they do not purchase in large quantities. The e-commerce website should target them so that they come back to the website again. As they are recent customers, they can be potential repeat customers, depending on their experience of the first purchase. 