In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively list every file under the Kaggle input mount, one full path per line
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Step 1: Import and Examine the data**

In [None]:
# Load the online-retail transactions CSV into a DataFrame and preview the first rows.
# The file is read as ISO-8859-1 with the first line used as the header.

retail = pd.read_csv(
    '../input/online-retail-customer-clustering/OnlineRetail.csv',
    sep=",",
    encoding="ISO-8859-1",
    header=0,
)
retail.head()

In [None]:
# Dimensions of the raw frame as a (rows, columns) tuple

retail.shape

In [None]:
# Per-column summary of the raw frame: non-null counts and dtypes

retail.info()

In [None]:
# Number of transaction rows per country (value_counts sorts descending),
# wrapped in a single-column DataFrame indexed by country name
type_counts = retail['Country'].value_counts()
Country = type_counts.to_frame()
Country.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Keep only the five countries with the most transaction rows
# (value_counts() already sorted them in descending order).
Country = Country.head()

# Read the counts positionally rather than by column label: the single column
# produced by value_counts() is named 'Country' on pandas < 2.0 but 'count' on
# pandas >= 2.0, so referring to it as y='Country' is not reliable.
top_counts = Country.iloc[:, 0]

fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x=list(top_counts.index), y=top_counts.to_numpy(), ax=ax)
ax.set_xlabel('Country')
ax.set_ylabel('Number of transaction rows')
ax.set_title('Top 5 countries by number of transaction rows')
plt.xticks(rotation=45)

In [None]:
# Narrow the analysis to the rows whose Country is Germany
retail = retail.loc[retail['Country'].eq('Germany')]
retail.shape

# **Step 2: Data Cleaning**

In [None]:
# Share of missing values in each column, as a percentage rounded to two decimals

df_null = (100 * retail.isnull().sum() / len(retail)).round(2)
df_null

In [None]:
# Dropping rows that contain any missing value.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells (retail['...'] = ...) do not raise SettingWithCopyWarning.

retail = retail.dropna().copy()
retail.shape

In [None]:
# Customer IDs are identifiers rather than quantities, so store them as strings

retail = retail.astype({'CustomerID': str})

# **Step 3: Data Preparation for RFM Factors**

1. Calculating Recency

In [None]:
# New Attribute : Recency

# Parse the InvoiceDate strings (day-month-year hour:minute) into datetime values

retail['InvoiceDate'] = pd.to_datetime(
    retail['InvoiceDate'],
    format='%d-%m-%Y %H:%M',
)

In [None]:
# Latest invoice timestamp in the data; used as the reference point for recency.
# Series.max() is vectorised, unlike the builtin max() which iterates the
# column element by element in Python.

max_date = retail['InvoiceDate'].max()
max_date

In [None]:
# Time elapsed between the latest invoice in the data and each row's invoice

retail = retail.assign(Diff=max_date - retail['InvoiceDate'])
retail.head()

In [None]:
# Recency per customer: the smallest Diff, i.e. the gap to their most recent invoice

rfm_r = retail.groupby('CustomerID', as_index=False)['Diff'].min()
rfm_r.head()

In [None]:
# Keep only the whole-day component of each timedelta and expose it as Recency

rfm_r = (
    rfm_r
    .assign(Diff=rfm_r['Diff'].dt.days)
    .rename(columns={'Diff': 'Recency'})
)
rfm_r.head()

2. Calculating Frequency

In [None]:
### New Attribute : Frequency

# Number of non-null InvoiceNo rows recorded per customer.
# NOTE(review): this counts line items; if Frequency should mean distinct
# invoices, nunique() would be needed instead - confirm intent.
rfm_f = (
    retail.groupby('CustomerID')['InvoiceNo']
    .count()
    .rename('Frequency')
    .reset_index()
)
rfm_f.head()

3. Calculating Monetary

In [None]:
# New Attribute : Monetary

# Row amount = Quantity x UnitPrice, then summed per customer
retail = retail.assign(Amount=retail['Quantity'].mul(retail['UnitPrice']))
rfm_m = retail.groupby('CustomerID', as_index=False)['Amount'].sum()
rfm_m.head()

In [None]:
# Combine Recency, Frequency and Amount into one row per customer.
# Inner joins keep only customers present in all three tables.
rfm = (
    rfm_r
    .merge(rfm_f, on=['CustomerID'], how='inner')
    .merge(rfm_m, on=['CustomerID'], how='inner')
)
rfm.head()

4. Remove Outliers

In [None]:
# Box plots of Recency, Frequency and Amount on a shared axis to expose outliers
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams['figure.figsize'] = [10,8]
attributes = ['Recency', 'Frequency', 'Amount']
sns.boxplot(
    data=rfm[attributes],
    orient="v",
    palette="Set2",
    whis=1.5,
    saturation=1,
    width=0.7,
)
plt.title("Outliers Variable Distribution", fontsize=14, fontweight='bold')
plt.ylabel("Range", fontweight='bold')
plt.xlabel("Attributes", fontweight='bold')

In [None]:
def remove_outliers(frame, column, lower_q=0.05, upper_q=0.95, whisker=1.5):
    """Drop rows whose ``column`` value lies outside the percentile fences.

    The fences are ``lower - whisker * spread`` and ``upper + whisker * spread``,
    where ``lower`` / ``upper`` are the ``lower_q`` / ``upper_q`` quantiles of
    the column and ``spread`` is their difference. With the defaults this is a
    5th-95th percentile range, which is wider than a true interquartile range.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to filter.
    column : str
        Name of the numeric column the fences are computed on.
    lower_q, upper_q : float
        Quantiles that define the reference range.
    whisker : float
        Multiple of the range added beyond each quantile.

    Returns
    -------
    pandas.DataFrame
        Rows of ``frame`` whose value is inside both fences (bounds inclusive).
        Rows with a missing value in ``column`` are dropped.
    """
    values = frame[column]
    lower = values.quantile(lower_q)
    upper = values.quantile(upper_q)
    spread = upper - lower
    inside = values.between(lower - whisker * spread, upper + whisker * spread)
    return frame[inside]


# Filter one attribute at a time, in this order: each pass recomputes its
# quantiles on the rows that survived the previous pass.
for column in ['Amount', 'Recency', 'Frequency']:
    rfm = remove_outliers(rfm, column)

# Independent copy, so later cells can add cluster-label columns to rfm
# without raising SettingWithCopyWarning.
rfm = rfm.copy()

5. Rescaling the Attributes by Standardisation (mean-0, sigma-1)

In [None]:
# Standardise the three RFM features so each column has zero mean and unit variance
import sklearn
from sklearn.preprocessing import StandardScaler

# Feature matrix, in the column order Recency, Frequency, Amount
rfm_df = rfm[['Recency','Frequency', 'Amount']]

# Learn the per-column mean and standard deviation, then apply them
scaler = StandardScaler().fit(rfm_df)
rfm_df_scaled = scaler.transform(rfm_df)
rfm_df_scaled.shape

In [None]:
# Wrap the scaled array in a DataFrame.
# The labels are taken from rfm_df because the scaler preserves column order
# (Recency, Frequency, Amount). Hard-coding ['Amount', 'Frequency', 'Recency']
# would swap the Recency and Amount labels.
rfm_df_scaled = pd.DataFrame(rfm_df_scaled, columns=rfm_df.columns)
rfm_df_scaled.head()

# **Step 3: K-Means Clustering**

1. Initial Cluster Given K

In [None]:
# k-means with some arbitrary k
from sklearn.cluster import KMeans

# random_state pins the centroid initialisation so the cluster labels are
# reproducible across runs; n_init is set explicitly because its default
# differs between scikit-learn releases.
kmeans = KMeans(n_clusters=4, max_iter=50, n_init=10, random_state=42)
kmeans.fit(rfm_df_scaled)
# assign the label (labels_ follows the row order of rfm_df_scaled, which was built from rfm)
rfm['Cluster_Id'] = kmeans.labels_
rfm.head()

In [None]:
### visualize the result
import plotly.express as px

# Cast the labels to str so each cluster is coloured as a discrete category
rfm["Cluster_Id"] = rfm["Cluster_Id"].astype(str)
fig = px.scatter_3d(
    rfm,
    x='Recency',
    y='Frequency',
    z='Amount',
    color='Cluster_Id',
)
fig.show()

**2. Finding the best K:** A fundamental step for any unsupervised clustering algorithm is to determine the optimal number of clusters into which the data should be grouped.

Method 1: Finding the elbow point for (inertia_) "Sum of squared distances of samples to their closest cluster center".

In [None]:
from yellowbrick.cluster import KElbowVisualizer

# Seeded estimator so the elbow curve is the same on every run
model = KMeans(n_init=10, random_state=42)

# Distortion = sum of squared distances from each point to its assigned centre.
# k=(2,9) evaluates k = 2..8 (upper bound exclusive, per the Yellowbrick docs).
visualizer = KElbowVisualizer(
    model, k=(2,9), metric='distortion')

visualizer.fit(rfm_df_scaled)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure

In [None]:
from yellowbrick.cluster import KElbowVisualizer

# Seeded estimator so the silhouette curve is the same on every run
model = KMeans(n_init=10, random_state=42)

# Mean silhouette coefficient for each candidate k.
# k=(2,9) evaluates k = 2..8 (upper bound exclusive, per the Yellowbrick docs).
visualizer = KElbowVisualizer(
    model, k=(2,9), metric='silhouette')

visualizer.fit(rfm_df_scaled)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure

In [None]:
from yellowbrick.cluster import KElbowVisualizer

# Seeded estimator so the Calinski-Harabasz curve is the same on every run
model = KMeans(n_init=10, random_state=42)

# Calinski-Harabasz score for each candidate k.
# k=(2,9) evaluates k = 2..8 (upper bound exclusive, per the Yellowbrick docs).
visualizer = KElbowVisualizer(
    model, k=(2,9), metric='calinski_harabasz')

visualizer.fit(rfm_df_scaled)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure

In [None]:
# Final k-means fit.
# NOTE(review): k=4 is presumably the value read off the diagnostics above - confirm.
k=4
from sklearn.cluster import KMeans

# random_state pins the centroid initialisation: the later selection of a
# target cluster by its label only makes sense if the labels are stable
# across runs. n_init is set explicitly because its default differs between
# scikit-learn releases.
kmeans = KMeans(n_clusters=k, max_iter=50, n_init=10, random_state=42)
kmeans.fit(rfm_df_scaled)
# assign the label (labels_ follows the row order of rfm_df_scaled, which was built from rfm)
rfm['Cluster_Id'] = kmeans.labels_
rfm.head()

In [None]:
### visualize the result
import plotly.express as px

# Cast the labels to str so each cluster is coloured as a discrete category
rfm["Cluster_Id"] = rfm["Cluster_Id"].astype(str)
fig = px.scatter_3d(
    rfm,
    x='Recency',
    y='Frequency',
    z='Amount',
    color='Cluster_Id',
)
fig.show()

In [None]:
# Box plot to visualize Cluster Id vs Recency
import seaborn as sns

sns.boxplot(data=rfm, x='Cluster_Id', y='Recency')

In [None]:
# Distribution of Frequency within each k-means cluster
import seaborn as sns

sns.boxplot(data=rfm, x='Cluster_Id', y='Frequency')

In [None]:
# Box plot to visualize Cluster Id vs Amount
import seaborn as sns

sns.boxplot(data=rfm, x='Cluster_Id', y='Amount')

In [None]:
# Customers assigned to k-means cluster '1' (Cluster_Id holds string labels at this point).
# NOTE(review): which cluster gets label '1' is decided by the k-means fit - confirm this is the intended segment.
Target_Customer = rfm.loc[rfm['Cluster_Id'].eq('1')]
Target_Customer.head()

In [None]:
# Non-null value count per column for the selected k-means cluster
Target_Customer.count()

# **Step 4: Hierarchical Clustering**

In [None]:
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

1. Visualize Tree by Linkage Methods

In [None]:
# Single linkage on Euclidean distances, drawn as a dendrogram

mergings = linkage(rfm_df_scaled, metric='euclidean', method="single")
dendrogram(mergings)
plt.show()

In [None]:
# Complete linkage on Euclidean distances, drawn as a dendrogram

mergings = linkage(rfm_df_scaled, metric='euclidean', method="complete")
dendrogram(mergings)
plt.show()

In [None]:
# Average linkage on Euclidean distances, drawn as a dendrogram

mergings = linkage(rfm_df_scaled, metric='euclidean', method="average")
dendrogram(mergings)
plt.show()

In [None]:
# Cut the tree into k = 4 flat clusters.
# `mergings` is whichever linkage was computed last; in notebook order that is
# the average-linkage result.
k=4
cluster_labels = cut_tree(mergings, n_clusters=k).ravel()
rfm['Cluster_Labels'] = cluster_labels
rfm.head()

In [None]:
### visualize the result
import plotly.express as px

# Cast the labels to str so each cluster is coloured as a discrete category
rfm["Cluster_Labels"] = rfm["Cluster_Labels"].astype(str)
fig = px.scatter_3d(
    rfm,
    x='Recency',
    y='Frequency',
    z='Amount',
    color='Cluster_Labels',
)
fig.show()

In [None]:
# Box plot to visualize Cluster Labels vs Recency
import seaborn as sns

sns.boxplot(data=rfm, x='Cluster_Labels', y='Recency')

In [None]:
# Box plot to visualize Cluster Labels vs Frequency
import seaborn as sns

sns.boxplot(data=rfm, x='Cluster_Labels', y='Frequency')

In [None]:
# Box plot to visualize Cluster Labels vs Amount
import seaborn as sns

sns.boxplot(data=rfm, x='Cluster_Labels', y='Amount')

In [None]:
# Customers assigned to hierarchical cluster '2' (Cluster_Labels holds string labels at this point).
# NOTE(review): confirm that label '2' is the intended target segment.
Target_Customer2 = rfm.loc[rfm['Cluster_Labels'].eq('2')]
Target_Customer2.head()

In [None]:
# Non-null value count per column for the selected hierarchical cluster
Target_Customer2.count()