In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Data Set Information:

This is a transactional data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.


Attribute Information:

- `InvoiceNo`: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation.

- `StockCode`: Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.

- `Description`: Product (item) name. Nominal.

- `Quantity`: The quantities of each product (item) per transaction. Numeric.

- `InvoiceDate`: Invoice date and time. Numeric, the day and time when each transaction was generated.

- `UnitPrice`: Unit price. Numeric, Product price per unit in sterling.

- `CustomerID`: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.

- `Country`: Country name. Nominal, the name of the country where each customer resides.


Link to the dataset: https://archive.ics.uci.edu/ml/datasets/online+retail



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): the Kaggle `ecommerce-data` CSV is Latin-1 encoded rather than
# UTF-8 (descriptions contain bytes such as the pound sign), so reading it with
# the default codec fails with UnicodeDecodeError. Name the encoding explicitly.
df = pd.read_csv("/kaggle/input/ecommerce-data/data.csv", encoding = "ISO-8859-1")
df.head(10)

In [None]:
df.info()

### Data Preprocessing

- Remove the row entries containing the cancelling orders or invoices
- Remove the rows/columns containing the null or missing values
- Remove duplicates
- Convert the `CustomerID` values to integer (from float) followed by categorical values.

In [None]:
import re
# Boolean mask of cancelled invoices. Per the data dictionary a cancellation is
# an invoice number that *starts with* the letter C (either case), so test the
# prefix rather than searching for a C anywhere in the code.
df['InvoiceNo'].astype(str).str.upper().str.startswith('C')

In [None]:
# Preview the cancelled orders: invoice numbers that start with C (either case).
df[df['InvoiceNo'].astype(str).str.upper().str.startswith('C')]

In [None]:
# Row labels of the cancelled orders (invoice number starts with C, either case).
idx_cancelled_invoices = df.index[df['InvoiceNo'].astype(str).str.upper().str.startswith('C')]
idx_cancelled_invoices

In [None]:
# Keep only the non-cancelled orders. A cancellation is an invoice number that
# starts with C (either case), so match the prefix instead of any position.
is_cancelled = df['InvoiceNo'].astype(str).str.upper().str.startswith('C')
# `.copy()` detaches the result from the original frame so that the column
# assignments in later cells do not operate on a slice (SettingWithCopyWarning).
df = df[~is_cancelled].copy()
df

In [None]:
df.isnull().sum()

In [None]:
# Drop every row that has a missing value in any column, reporting the frame
# shape before and after. Reassigning (instead of `inplace = True`) avoids
# mutating a frame that was derived from a boolean slice.
print(f"Before Dropping CustomerID Null Rows:\nNumber of rows = {df.shape[0]}\nNumber of cols = {df.shape[1]}")
df = df.dropna()
print(f"\nAfter Dropping CustomerID Null Rows:\nNumber of rows = {df.shape[0]}\nNumber of cols = {df.shape[1]}")

In [None]:
df.isnull().sum()

In [None]:
# Remove fully duplicated rows, reporting the frame shape before and after.
# Reassigning (instead of `inplace = True`) avoids mutating a frame that was
# derived from a boolean slice.
print(f"Before Dropping Duplicates:\nNumber of rows = {df.shape[0]}\nNumber of cols = {df.shape[1]}")
df = df.drop_duplicates()
print(f"\nAfter Dropping Duplicates:\nNumber of rows = {df.shape[0]}\nNumber of cols = {df.shape[1]}")

In [None]:
df.head(10)

In [None]:
# CustomerID is read as float; cast to integer first, then to a categorical.
customer_ids = df['CustomerID'].astype('int64')
df['CustomerID'] = customer_ids.astype('category')
df['CustomerID'].dtype

In [None]:
# Convert the InvoiceDate column to pandas datetimes so that date arithmetic
# (used later to compute Recency) is possible.
# NOTE(review): no explicit `format=` is passed, so pandas infers the layout —
# confirm that day/month order is parsed as intended for this file.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
# Show the resulting dtype as the cell output.
df['InvoiceDate'].dtype

In [None]:
df.describe()

### Extract Recency, Frequency, Monetary (RFM) Metrics

Group the data frame by customer ID separately and aggregate the `InvoiceDate`, `Quantity` and `UnitPrice` columns.

In [None]:
# Value of each line item = quantity bought x unit price.
df['TotalPurchaseValue'] = df['Quantity'].mul(df['UnitPrice'])
df.head(10)

In [None]:
# Monetary: total purchase value summed per customer.
monetary_sums = df[['CustomerID', 'TotalPurchaseValue']].groupby('CustomerID', as_index = False).sum()
customer_wise_total_purchase_value_df = monetary_sums.rename(columns = {'TotalPurchaseValue' : 'Monetary'})
customer_wise_total_purchase_value_df

In [None]:
# Frequency: number of purchases per customer. Each row of `df` is one line
# item, so a plain row count would measure basket size as well as how often the
# customer buys; counting *distinct* invoice numbers counts purchases.
customer_wise_frequent_purchases_df = (
    df.groupby('CustomerID', as_index = False)
      .agg(Frequency = ('InvoiceNo', 'nunique'))
)
customer_wise_frequent_purchases_df

In [None]:
# Combine the Monetary and Frequency tables, one row per customer.
merged_df = pd.merge(
    customer_wise_total_purchase_value_df,
    customer_wise_frequent_purchases_df,
    how = "inner",
    on = "CustomerID",
)
merged_df

In [None]:
# Most recent invoice timestamp per customer.
last_purchase_df = (
    df[['CustomerID', 'InvoiceDate']]
    .groupby('CustomerID', as_index = False)
    .max()
    .rename(columns = {'InvoiceDate': 'LastPurchaseDate'})
)
last_purchase_df

In [None]:
# Time elapsed between the latest invoice in the data set and each customer's
# last purchase, shifted by one day.
most_recent_invoice = df['InvoiceDate'].max()
days_since_last_purchase = (most_recent_invoice - last_purchase_df['LastPurchaseDate']) + pd.Timedelta("1 days")
days_since_last_purchase

In [None]:
# Whole-day component of each timedelta, re-labelled with merged_df's index.
# The values are matched to merged_df by position, not by CustomerID.
whole_days = days_since_last_purchase.dt.days
time_diff_in_days = pd.Series(data = whole_days.to_numpy(), index = merged_df.index)
time_diff_in_days

In [None]:
# Attach Recency (days since last purchase) as the third RFM column.
merged_df = merged_df.assign(Recency = time_diff_in_days)
merged_df

In [None]:
merged_df.isnull().sum()

In [None]:
# Drop incomplete rows and the identifier column so only the three RFM features
# remain. Reassigning with `errors = 'ignore'` keeps the cell re-runnable: the
# in-place version raised KeyError on a second run because CustomerID was
# already gone.
merged_df = merged_df.dropna().drop(columns = 'CustomerID', errors = 'ignore')

In [None]:
merged_df.isna().sum()

### Data Visualisation

Create histograms and box plots for the RFM features to evaluate the distribution of values within each feature.

In [None]:
# One column per RFM metric: histogram on the top row, box plot underneath.
fig, axis = plt.subplots(
    nrows = 2, ncols = 3,
    figsize = (15, 4), dpi = 100,
    sharex = False, sharey = False,
    gridspec_kw = {'height_ratios': [3, 1]},
)

rfm_colours = {'Monetary': 'red', 'Frequency': 'green', 'Recency': 'purple'}
for position, (metric, colour) in enumerate(rfm_colours.items()):
    hist_ax, box_ax = axis[0, position], axis[1, position]
    hist_ax.hist(merged_df[metric], bins = 'sturges', facecolor = colour, edgecolor = 'black')
    sns.boxplot(x = metric, data = merged_df, color = colour, ax = box_ax)
    hist_ax.set_title(f"Histogram & Boxplot for {metric}")

plt.show()

The `Monetary` and `Frequency` columns are highly skewed, which indicates the possible presence of outliers.

Treating likely outliers in the `Monetary` column with the inter-quartile range (IQR) metric.

In [None]:
def treating_outliers(df, col, whisker = 1.5):
    """Return the rows of `df` whose `col` value lies inside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR`` (both
    inclusive), where Q1 and Q3 are the 25th and 75th percentiles of the
    column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Name of the numeric column to screen for outliers.
    whisker : float, default 1.5
        Multiplier applied to the IQR when placing the fences.

    Returns
    -------
    pandas.DataFrame
        The retained rows, with their original index labels.

    Raises
    ------
    ValueError
        If `whisker` is negative.
    """
    if whisker < 0:
        raise ValueError("whisker must be non-negative")

    col_q1, col_q3 = df[col].quantile([0.25, 0.75])
    col_iqr = col_q3 - col_q1
    lower_fence = col_q1 - whisker * col_iqr
    upper_fence = col_q3 + whisker * col_iqr
    # `between` is inclusive on both ends, matching the original >= / <= test.
    return df[df[col].between(lower_fence, upper_fence)]

# Remove Monetary outliers and renumber the surviving rows from 0.
new_df = treating_outliers(merged_df, 'Monetary').reset_index(drop = True)
new_df

In [None]:
# Histogram (top) and box plot (bottom) of Monetary after outlier removal.
fig, axis_mon = plt.subplots(
    nrows = 2, ncols = 1,
    figsize = (10, 4), dpi = 100,
    sharex = False, sharey = False,
    gridspec_kw = {'height_ratios': [3, 1]},
)
monetary_hist_ax, monetary_box_ax = axis_mon

monetary_hist_ax.hist(new_df['Monetary'], bins = 'sturges', facecolor = 'red', edgecolor = 'black')
sns.boxplot(x = 'Monetary', data = new_df, color = 'red', ax = monetary_box_ax)
monetary_hist_ax.set_title("Histogram & Boxplot for Monetary")

plt.show()

### Feature Scaling

Standardise the RFM features (zero mean, unit variance) so that each feature contributes equally to the Euclidean distances used by the KMeans clustering algorithm, which also helps it converge to good centroids faster.

In [None]:
# standardise all parameters
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the RFM table and rebuild a DataFrame with the same
# column names from the scaled array.
standard_scaler = StandardScaler()
norm_new = standard_scaler.fit_transform(new_df)
norm_new_df = pd.DataFrame(norm_new, columns = new_df.columns)
norm_new_df

In [None]:
norm_new_df.describe().loc[['mean', 'std'], :]

After scaling, each RFM feature has a mean of approximately 0 and a standard deviation of approximately 1.

### Cluster Tendency

Measuring the cluster tendency of the RFM features using the Hopkins statistic.

- If the value is in the range $[0.01, 0.3]$, the data is regularly spaced.

- If the value is around $0.5$, the data is random.

- If the value is in the range $[0.7, 0.99]$, the data has a high tendency to cluster.

In [None]:
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
 
def hopkins(X, sample_frac = 0.1, random_state = None):
    """Estimate the Hopkins statistic (clustering tendency) of a data set.

    ``m`` points are drawn uniformly from the bounding box of the data and
    ``m`` real observations are sampled without replacement. With ``u`` the
    distance from each uniform point to its nearest observation and ``w`` the
    distance from each sampled observation to its nearest *other* observation,
    the statistic is ``H = sum(u) / (sum(u) + sum(w))``.

    Parameters
    ----------
    X : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Numeric data to assess.
    sample_frac : float, default 0.1
        Fraction of the observations to sample; at least one point is used.
    random_state : int, numpy.random.Generator or None, default None
        Seed for the sampling. ``None`` gives a different result on each call.

    Returns
    -------
    float
        The Hopkins statistic, or 0.0 when every measured distance is zero
        (for example, when all observations are identical).

    Raises
    ------
    ValueError
        If `X` is not two-dimensional or has fewer than two observations.
    """
    data = np.asarray(X, dtype = float)
    if data.ndim != 2 or data.shape[0] < 2:
        raise ValueError("hopkins() needs a 2-D input with at least two observations")

    n, d = data.shape
    # Guarantee at least one sampled point so the ratio below is defined.
    m = min(n, max(1, int(sample_frac * n)))
    rng = np.random.default_rng(random_state)
    nbrs = NearestNeighbors(n_neighbors = 2).fit(data)

    # Uniform points are not members of the data set, so their nearest
    # observation is the FIRST neighbour (column 0).
    uniform_points = rng.uniform(data.min(axis = 0), data.max(axis = 0), size = (m, d))
    u_dist, _ = nbrs.kneighbors(uniform_points, n_neighbors = 1, return_distance = True)

    # Sampled observations are their own nearest neighbour at distance 0, so
    # the nearest *other* observation is the SECOND neighbour (column 1).
    sampled_rows = rng.choice(n, size = m, replace = False)
    w_dist, _ = nbrs.kneighbors(data[sampled_rows], n_neighbors = 2, return_distance = True)

    u_total = float(u_dist[:, 0].sum())
    w_total = float(w_dist[:, 1].sum())
    if u_total + w_total == 0:
        return 0.0
    return u_total / (u_total + w_total)

hopkins(norm_new_df, random_state = 14)

The Hopkins statistic is between 0.7 and 0.99. Hence, the RFM features have a high tendency to cluster.

### Building KMeans Model

- Arbitrarily choosing 5 clusters
- Initialising centroids with the `k-means++` approach

In [None]:
# K Means with K = 5: chosen randomly.
from sklearn.cluster import KMeans

# Fit a 5-cluster KMeans model on the scaled features and count the customers
# assigned to each cluster.
kmeans_model = KMeans(n_clusters = 5, init = 'k-means++', random_state = 14).fit(norm_new_df)

cluster_labels = pd.Series(kmeans_model.labels_, index = norm_new_df.index)
cluster_labels.value_counts()

In [None]:
# Attach each customer's cluster label to the unscaled RFM values for analysis.
km_df = pd.concat([new_df, cluster_labels.rename('ClusterLabel')], axis = 1)
km_df

In [None]:
import plotly.express as px

# Cluster labels are nominal identifiers, not quantities. Plotly colours a
# numeric column with a continuous scale, so cast the labels to strings to get
# one discrete colour (and one legend entry) per cluster.
cluster_names = km_df['ClusterLabel'].astype(str)
plotly_fig = px.scatter_3d(
    km_df.assign(ClusterLabel = cluster_names),
    x = 'Monetary', y = 'Frequency', z = 'Recency',
    color = 'ClusterLabel',
    category_orders = {'ClusterLabel': sorted(cluster_names.unique())},
)
plotly_fig.show()

### Silhouette Analysis

$$\text{silhouette score}=\frac{p-q}{\max(p,q)}$$

$p$ is the mean distance to the points in the nearest cluster that the data point is not a part of

$q$ is the mean intra-cluster distance to all the points in its own cluster.

* The silhouette score ranges from -1 to 1.

* A score closer to 1 indicates that the data point is very similar to the other data points in its cluster.

* A score closer to -1 indicates that the data point is not similar to the data points in its cluster.

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of a KMeans fit for every k from 2 to 14.
# NOTE: despite its name, `sse_` holds [k, silhouette score] pairs, not sums of
# squared errors; the name is kept so any later reference still resolves.
sse_ = []
for k in range(2, 15):
    # Seed every fit (same settings as the models built in this notebook) so
    # the curve is reproducible from run to run.
    kmeans = KMeans(n_clusters = k, init = 'k-means++', random_state = 14).fit(norm_new_df)
    sse_.append([k, silhouette_score(norm_new_df, kmeans.labels_)])

silhouette_df = pd.DataFrame(sse_, columns = ['k', 'silhouette'])
silhouette_fig, silhouette_ax = plt.subplots()
silhouette_ax.plot(silhouette_df['k'], silhouette_df['silhouette'], marker = 'o')
silhouette_ax.set_xlabel("Number of clusters (k)")
silhouette_ax.set_ylabel("Mean silhouette score")
silhouette_ax.set_title("Silhouette score by number of clusters")
plt.show()

As the number of clusters increases, the similarity of points within a cluster decreases. Hence, ideally we should choose 4 clusters.

### Rebuilding KMeans Model

In [None]:
# Refit KMeans with 4 clusters and count the customers in each cluster.
kmeans_model2 = KMeans(n_clusters = 4, init = 'k-means++', random_state = 14).fit(norm_new_df)

cluster_labels2 = pd.Series(kmeans_model2.labels_, index = norm_new_df.index)
cluster_labels2.value_counts()

In [None]:
# Attach the 4-cluster labels to the unscaled RFM values for analysis.
km_df2 = pd.concat([new_df, cluster_labels2.rename('ClusterLabel')], axis = 1)
km_df2

In [None]:
import plotly.express as px

# Cluster labels are nominal identifiers, not quantities. Plotly colours a
# numeric column with a continuous scale, so cast the labels to strings to get
# one discrete colour (and one legend entry) per cluster.
cluster_names2 = km_df2['ClusterLabel'].astype(str)
plotly_fig2 = px.scatter_3d(
    km_df2.assign(ClusterLabel = cluster_names2),
    x = 'Monetary', y = 'Frequency', z = 'Recency',
    color = 'ClusterLabel',
    category_orders = {'ClusterLabel': sorted(cluster_names2.unique())},
)
plotly_fig2.show()