In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# To handle datetime data-type
import time, warnings
import datetime as dt

#visualizations
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
%matplotlib inline
import seaborn as sns #easy to use, awesome

# !!! add this to your first cell at the END of YOUR WORK !!!
# if you are sure your code is correct, it is beneficial to ignore warnings
warnings.filterwarnings("ignore")


In [None]:
# Load the Online Retail transactions from the Kaggle input directory.
# Both identifier columns are read as strings so they are treated as labels, not numbers.
# The invoice column of this dataset is 'InvoiceNo' (the name used by every later cell);
# the dtype mapping previously named 'InvoiceID', which is not that column, so the
# string typing was never applied to invoice numbers.
retail_df = pd.read_csv(
    '/kaggle/input/onlineretail/OnlineRetail.csv',
    encoding="ISO-8859-1",
    dtype={'CustomerID': str, 'InvoiceNo': str},
)
retail_df.head()

In [None]:
# Line revenue: number of units on the row times the price of one unit.
retail_df["Amount"] = retail_df["Quantity"].mul(retail_df["UnitPrice"])
retail_df.head(10)

In [None]:
retail_df.dtypes #to see data types

In [None]:
retail_df.describe(exclude="number") #a brief summary for columns whose data types are NOT NUMBER

In [None]:
retail_df.describe() #a brief summary for columns whose data types are NUMBER

In [None]:
retail_df.shape # #of rows, #of columns

In [None]:
# converting a column's datatype to datetime
retail_df["InvoiceDate"] = pd.to_datetime(retail_df["InvoiceDate"])
retail_df.dtypes


In [None]:
retail_df.head()

In [None]:
#I would like to make analysis for only one country but which one? Let's see the proportions

retail_df.Country.value_counts(normalize=True) #this code counts all values in Country column (and see proportions with the parameter "normalize=True")

In [None]:
# Share of distinct customers that are located in the United Kingdom
# (author's note: 90% of the customers are from UK).
is_uk_row = retail_df["Country"] == "United Kingdom"
retail_df.loc[is_uk_row, "CustomerID"].nunique() / retail_df["CustomerID"].nunique()

In [None]:
# Restrict the analysis to transactions whose Country is the United Kingdom.
retail_uk = retail_df.loc[retail_df['Country'].eq('United Kingdom')]

# rows, columns of the UK sub-sample
retail_uk.shape

In [None]:
# Remove canceled orders: keep only rows with a strictly positive Quantity.
retail_uk = retail_uk.loc[retail_uk['Quantity'].gt(0)]
retail_uk.shape

In [None]:
# Customer segmentation needs a customer key, so rows with a missing CustomerID are discarded.
# dropna documentation:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html
retail_uk = retail_uk.dropna(subset=['CustomerID'])

retail_uk.shape

In [None]:
# We would like to restrict the data to one full year because it's better to use a metric per Months or Years in RFM
retail_uk['InvoiceDate'].max()

In [None]:
retail_uk['InvoiceDate'].min()

In [None]:
# Keep the most recent year of data: every invoice dated 2010-12-09 or later.
window_start = pd.Timestamp("2010-12-09")
retail_uk = retail_uk[retail_uk['InvoiceDate'] >= window_start]
retail_uk.shape

In [None]:
retail_uk['InvoiceDate'].max()

In [None]:
retail_uk['InvoiceDate'].min()

In [None]:
# Average total quantity ordered by a customer
np.mean(retail_uk.groupby("CustomerID").Quantity.sum())

In [None]:
# Average total amount spent by a customer
np.mean(retail_uk.groupby("CustomerID").Amount.sum())

In [None]:
# Average quantity in a row
# Note that each row represent a purchased (by a customer) product in an invoice
# In other words, primary key for this data set is __InvoiceNo-StockCode-CustomerID__
retail_uk.Quantity.mean()

In [None]:
# Average amount in a row
retail_uk.Amount.mean()

In [None]:
# Number of unique products sold between 2010-12-09 and 2011-12-09
retail_uk.StockCode.nunique()

In [None]:
print("Summary..")
# Per-customer totals are computed once and reused for the two per-customer averages below.
customer_totals = retail_uk.groupby("CustomerID")[["Quantity", "Amount"]].sum()

#exploring the unique values of each attribute
print("Number of invoices: ", retail_uk['InvoiceNo'].nunique())
print("Number of products bought: ",retail_uk['StockCode'].nunique())
print("Number of customers:", retail_uk['CustomerID'].nunique() )
# NOTE: rows with a missing CustomerID were dropped in an earlier cell, so this share is expected to be 0.
print("Percentage of customers NA: ", round(retail_uk['CustomerID'].isnull().sum() * 100 / len(retail_uk),2),"%" )
print("Average quantity of product purchased by a customer: ", round(customer_totals["Quantity"].mean(), 0))
print("Average revenue generated per customer: ", round(customer_totals["Amount"].mean(), 2))
# Each row is one product line of an invoice, so the two figures below are per-row averages.
print("Average product quantity sold per transaction: ", round(retail_uk.Quantity.mean(), 0))
print("Average revenue generated per transaction: ", round(retail_uk.Amount.mean(), 2) )

In [None]:
retail_uk.describe(exclude='number')

In [None]:
retail_uk.describe()

In [None]:
#check for NaN's to see if dataset is ready to go

retail_uk.info()

In [None]:
#To calculate recency, we need a reference
#The difference (in days) between NOW and date of invoice will give us recency. Range of recency will be (0, 365)
now = dt.date(2011,12,9)
print(now)

In [None]:
# Calendar date of each invoice (time of day removed), stored in a new "date" column.
retail_uk['date'] = retail_uk['InvoiceDate'].dt.date

In [None]:
retail_uk.head()

In [None]:
#CREATE RECENCY DATAFRAME
# One row per customer holding the date of that customer's latest invoice.
recency_df = (
    retail_uk
    .groupby('CustomerID', as_index=False)['date']
    .max()
    .rename(columns={'date': 'LastPurshaceDate'})
)
recency_df.head()

In [None]:
#calculate recency
recency_df['Recency'] = recency_df['LastPurshaceDate'].apply(lambda x: (now - x).days)

In [None]:
recency_df.head()

In [None]:
#drop LastPurchaseDate as we don't need it anymore
recency_df.drop('LastPurshaceDate',axis=1,inplace=True)

In [None]:
#CREATE FREQUENCY DATAFRAME
# Frequency = number of invoices per customer.
# Each row of retail_uk is one product line, so an invoice is repeated once for every
# product on it. Keep a single row per (InvoiceNo, CustomerID) pair before counting.
#
# drop_duplicates() is called WITHOUT inplace so it returns a new frame and retail_uk keeps
# all of its product lines. A plain `retail_uk_copy = retail_uk` only creates a second name
# for the same object, so an in-place drop would also remove those rows from retail_uk and
# distort every later aggregation on it (e.g. the Amount sum used for Monetary).
retail_uk_copy = retail_uk.drop_duplicates(subset=['InvoiceNo', 'CustomerID'])

#Calculate frequency of purchases
frequency_df = retail_uk_copy.groupby(by=['CustomerID'], as_index=False)['InvoiceNo'].count()
frequency_df.columns = ['CustomerID','Frequency']
frequency_df.head()

In [None]:
retail_uk_copy.info()

In [None]:
# Monetary value: sum of Amount over all rows of retail_uk for each customer.
monetary_df = (
    retail_uk
    .groupby('CustomerID', as_index=False)['Amount']
    .sum()
    .rename(columns={'Amount': 'Monetary'})
)
monetary_df.head()

In [None]:
#RFM TABLE

#merge recency dataframe with frequency dataframe, resulting a temporary dataframe
temp_df = recency_df.merge(frequency_df,on='CustomerID')
temp_df.head()

In [None]:
#merge with monetary dataframe to get a table with the 3 columns
rfm_df = temp_df.merge(monetary_df,on='CustomerID')
#check the head
rfm_df.head()

In [None]:
# make 0's 1, I do not want to have 0's in Recency column
rfm_df['Recency'] = rfm_df["Recency"] + 1

In [None]:
rfm_df.describe()
#As seen, we have 3863 customer with R-F-M values

In [None]:
# Outlier treatment: the clustering algorithm used later is sensitive to extreme values,
# so customers outside the fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are removed.

# outlier treatment for recency
Q1, Q3 = rfm_df["Recency"].quantile([0.25, 0.75])
IQR = Q3 - Q1
rfm_df = rfm_df[rfm_df["Recency"].between(Q1 - 1.5*IQR, Q3 + 1.5*IQR)]
rfm_df.describe()

In [None]:
# outlier treatment for frequency: keep values inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
Q1, Q3 = rfm_df["Frequency"].quantile([0.25, 0.75])
IQR = Q3 - Q1
rfm_df = rfm_df[rfm_df["Frequency"].between(Q1 - 1.5*IQR, Q3 + 1.5*IQR)]
rfm_df.describe()

In [None]:
# outlier treatment for monetary: keep values inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
Q1, Q3 = rfm_df["Monetary"].quantile([0.25, 0.75])
IQR = Q3 - Q1
rfm_df = rfm_df[rfm_df["Monetary"].between(Q1 - 1.5*IQR, Q3 + 1.5*IQR)]
rfm_df.describe()

#After excluding outliers, 3147 customers are left

In [None]:
#importing modules for k-Means

from sklearn.preprocessing import MinMaxScaler, StandardScaler #I am going to scale variables manually but it is possible to use one of the scalers here
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.cluster.hierarchy import linkage, dendrogram, cut_tree

In [None]:
# Scaling is a crucial step before k-Means; the target range could differ.

# Min-max scale Recency: subtract the minimum, divide by (max - min).
rfm_df["R"] = rfm_df["Recency"].sub(rfm_df["Recency"].min()).div(rfm_df["Recency"].max() - rfm_df["Recency"].min())

In [None]:
# Min-max scale Frequency: subtract the minimum, divide by (max - min).
rfm_df["F"] = rfm_df["Frequency"].sub(rfm_df["Frequency"].min()).div(rfm_df["Frequency"].max() - rfm_df["Frequency"].min())

In [None]:
# Min-max scale Monetary: subtract the minimum, divide by (max - min).
rfm_df["M"] = rfm_df["Monetary"].sub(rfm_df["Monetary"].min()).div(rfm_df["Monetary"].max() - rfm_df["Monetary"].min())
rfm_df.head()

#In this analysis, the value of the customer is inversely proportional to the Recency value and directly proportional to the Frequency and Monetary values.

In [None]:
# Working frame for the clustering part of the analysis.
# .copy() makes df an independent DataFrame. With plain `df = rfm_df` both names point at
# the same object, so any later in-place change to df (such as the in-place index reset
# further down) would silently change rfm_df as well.
df = rfm_df.copy()
df.head()

In [None]:
df.describe()

In [None]:
df["Recency"].hist()

In [None]:
df["Frequency"].hist()

In [None]:
df["Monetary"].hist()

In [None]:
# Side-by-side histograms of the three raw attributes on a shared y axis.
# sns.histplot replaces the deprecated sns.distplot(..., kde=False); histplot draws a
# plain histogram (no KDE) by default.
fig, (ax0, ax1, ax2) = plt.subplots(nrows= 1, ncols= 3, sharey=True, figsize=(15, 6))

for ax, column, colour in zip((ax0, ax1, ax2), ("Recency", "Frequency", "Monetary"), ("b", "r", "g")):
    sns.histplot(df[column], ax=ax, color=colour)
    # histplot labels every y axis "Count"; clear it so only the left panel carries a label
    ax.set(xlabel=column, ylabel="")
ax0.set(ylabel="Number of customers")


fig.savefig("Histograms of Attributes")

In [None]:
#resetting index for future concatenate purposes
#pd.concat() needs same indexes to concatenate pandas objects, otherwise some rows will be lost

df.reset_index(drop=True, inplace=True)
df

In [None]:
#HOPKIN's STATISTICS
#tells how much data is suitable to cluster

from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
 
def hopkins(X, sample_frac=0.1, random_state=None):
    """Estimate the Hopkins statistic of a data set (clustering tendency).

    Two sets of ``m`` nearest-neighbour distances are compared:

    * ``u`` - from ``m`` points drawn uniformly inside the per-column
      [min, max] box of ``X`` to their nearest observation in ``X``;
    * ``w`` - from ``m`` observations sampled from ``X`` (without replacement)
      to their nearest *other* observation in ``X``.

    The statistic is ``sum(u) / (sum(u) + sum(w))``.

    Parameters
    ----------
    X : DataFrame or 2-D array-like of numbers, shape (n, d)
        Observations in rows, features in columns.
    sample_frac : float, default 0.1
        Fraction of the rows used as sample size ``m`` (at least one row is used).
    random_state : int, numpy SeedSequence/Generator or None, default None
        Seed for the random draws; pass an int for reproducible results.

    Returns
    -------
    float
        The Hopkins statistic, or 0.0 when both distance sums are zero
        (for example when all observations are identical).

    Raises
    ------
    ValueError
        If ``X`` has fewer than two rows (a nearest *other* observation does not exist).
    """
    points = np.asarray(X, dtype=float)
    n, d = points.shape
    if n < 2:
        raise ValueError("hopkins() needs at least two observations")

    # At least one sample, never more than the number of rows.
    m = min(n, max(1, int(sample_frac * n)))
    rng = np.random.default_rng(random_state)
    nbrs = NearestNeighbors(n_neighbors=1).fit(points)

    # u: uniform points are not part of the fitted data, so their nearest
    # observation is the FIRST neighbour returned (column 0).
    uniform_points = rng.uniform(points.min(axis=0), points.max(axis=0), size=(m, d))
    u_dist, _ = nbrs.kneighbors(uniform_points, n_neighbors=1, return_distance=True)

    # w: sampled observations are part of the fitted data, so the first neighbour
    # is the point itself (distance 0); the nearest OTHER observation is column 1.
    sampled_rows = rng.choice(n, size=m, replace=False)
    w_dist, _ = nbrs.kneighbors(points[sampled_rows], n_neighbors=2, return_distance=True)

    u_total = u_dist[:, 0].sum()
    w_total = w_dist[:, 1].sum()
    denominator = u_total + w_total
    if denominator == 0:
        # Degenerate data: the ratio is undefined, report 0 as the previous version did for NaN.
        return 0.0

    return float(u_total / denominator)

In [None]:
hopkins(df[["R","F","M"]])

In [None]:
# Clustering input: only the scaled R, F, M scores, indexed by CustomerID.
df_1 = df.set_index("CustomerID")[["R", "F", "M"]]
df_1.head(20)

In [None]:
#DENDROGRAM

d = linkage(df_1, method='complete')
dendrogram(d,
          leaf_rotation=90,
          leaf_font_size=5)
plt.show()

In [None]:
#From dendrogram, it can be said that the possible numbers of clusters are 3,4,5,6 

#First Clustering
# random_state fixes the random centroid initialisation so that the centres, labels and
# inertia shown in the next cells are the same on every run of the notebook.
model = KMeans(n_clusters= 4, init= 'random', max_iter= 500, tol= 1e-10, random_state= 42)

In [None]:
model.fit(df_1)

In [None]:
model.cluster_centers_

In [None]:
model.labels_

In [None]:
model.inertia_

In [None]:
model.n_iter_

In [None]:
#FIND OPTIMUM "K" for k-Means
# For every k from 2 to 14, fit k-Means and record the silhouette score, the inertia,
# the centroids and the number of iterations of the best run.

from sklearn.metrics import silhouette_score
sse_ = []                    # [k, silhouette score] pairs (name kept: the plotting cell reads it)
ssd = []                     # [k, inertia] pairs
centroids_from_kmeans = []   # cluster centres for each k
iterations = []              # [k, number of iterations] pairs

for k in range(2, 15):
    # random_state makes the 100 random initialisations repeatable, so the curves
    # plotted from these lists do not change between runs.
    kmeans = KMeans(n_clusters= k, init= 'random', n_init= 100, max_iter= 500, tol= 1e-10, random_state= 42).fit(df_1)
    sse_.append([k, silhouette_score(df_1, kmeans.labels_)])
    ssd.append([k, kmeans.inertia_])
    centroids_from_kmeans.append(kmeans.cluster_centers_)
    iterations.append([k, kmeans.n_iter_])

print("Silhouette Score for each K : \n", sse_)
print("Inertia for each K : \n", ssd)
print("Number of iterations for each K : ", iterations)

In [None]:
# Silhouette score (column 1) against number of clusters (column 0) for k-Means.
silhouette_by_k = pd.DataFrame(sse_)
plt.plot(silhouette_by_k[0], silhouette_by_k[1], label= "Silhouette Score")
plt.title("Silhouette Scores for Varying Number of Clusters")
plt.xlabel("# of clusters")
plt.ylabel("Silhouette Score")
plt.show()

In [None]:
# Inertia (column 1) against number of clusters (column 0) for k-Means.
inertia_by_k = pd.DataFrame(ssd)
plt.plot(inertia_by_k[0], inertia_by_k[1], label= "Inertia")
plt.title("Inertia of K-means Clustering Results")
plt.xlabel("# of clusters")
plt.ylabel("Inertia")
plt.show()

In [None]:
#FIND OPTIMUM "K" for agglomerative (hierarchical) clustering
# For every k from 2 to 14, fit AgglomerativeClustering and record [k, silhouette score].

sse_ward = []

for k in range(2, 15):
    h_cluster = AgglomerativeClustering(n_clusters= k)
    ward_labels = h_cluster.fit(df_1).labels_
    sse_ward.append([k, silhouette_score(df_1, ward_labels)])

print(sse_ward)


In [None]:
# Silhouette score (column 1) against number of clusters (column 0) for agglomerative clustering.
ward_silhouette_by_k = pd.DataFrame(sse_ward)
plt.plot(ward_silhouette_by_k[0], ward_silhouette_by_k[1], label= "Silhouette Score")
plt.title("Silhouette Scores for Varying Number of Clusters")
plt.xlabel("# of clusters")
plt.ylabel("Silhouette Score")
plt.show()

In [None]:
#since low inertia and high silhouette score is desirable, let "K" (number of clusters) equals 3
#this is also in parallel with what dendrogram suggests


#Hierarchical Cluster Analyses to find initial seeds for k-means

clustering= AgglomerativeClustering(n_clusters= 3) #linkage="ward", by default
clustering.fit(df_1)

In [None]:
#Just checking, Agglomerative clustering results

# Attach the agglomerative labels (shifted from 0-based to 1-based) as a "Cluster" column.
# assign() adds the column by name and by position, so the result does not depend on the
# index of df matching 0..n-1 (as an index-aligned concat does) nor on the column order of
# df (as overwriting all column names positionally does).
labels_v0= pd.DataFrame(clustering.labels_)
RFM_0= df.assign(Cluster= clustering.labels_ + 1)
RFM_ward= RFM_0
RFM_ward.head(10)

In [None]:
RFM_ward.groupby("Cluster").CustomerID.count()

In [None]:
Clusters_R= pd.DataFrame(RFM_ward.groupby("Cluster").R.mean())
Clusters_R

In [None]:
Clusters_F= pd.DataFrame(RFM_ward.groupby("Cluster").F.mean())
Clusters_F

In [None]:
Clusters_M= pd.DataFrame(RFM_ward.groupby("Cluster").M.mean())
Clusters_M

In [None]:
#initial seeds for k-means

# Mean scaled R, F, M of each agglomerative cluster: one row per cluster, columns R, F, M.
# Computed directly from RFM_ward instead of from Clusters_R / Clusters_F / Clusters_M,
# because those three names are reassigned to the k-means results later in the notebook
# and re-running this cell afterwards would silently pick up the wrong centroids.
clusters_ward= RFM_ward.groupby("Cluster")[["R", "F", "M"]].mean()
initial_seeds= clusters_ward.to_numpy() #converting to numpy array
print(initial_seeds.dtype)
print(initial_seeds)

In [None]:
# k-Means started from the agglomerative-cluster centroids (initial_seeds).
inertias_in_each_iteration= []
centroids= []
number_of_iterations = []

for i in range(20):
    # The starting centres are given explicitly, so a single initialisation is requested
    # (n_init=1) rather than leaving scikit-learn to override its default n_init.
    model= KMeans(n_clusters= 3, init= initial_seeds, n_init= 1, max_iter= 500, tol= 1e-10)
    model.fit(df_1)
    inertias_in_each_iteration.append(model.inertia_)
    centroids.append(model.cluster_centers_)
    number_of_iterations.append(model.n_iter_)

print(inertias_in_each_iteration)
print(centroids[-1])
print(number_of_iterations)
# labels of the last fitted model
labels= model.labels_
labels= pd.Series(labels)
print(silhouette_score(df_1, labels))

#compare this silhouette value with the silhouette value obtained from k-means with 'random' initial seeds

In [None]:
# k-Means with random initial centres, repeated 300 times to average inertia and iterations.
inertias_in_each_iteration= []
centroids= []
number_of_iterations = []

for i in range(300):
    # random_state=i gives every repetition its own, repeatable initialisation. The labels
    # of the LAST model are used for the final segmentation below, so without a seed the
    # cluster assignment and numbering would change on every run of the notebook.
    model= KMeans(n_clusters= 3, init= 'random', n_init= 10, max_iter= 500, tol= 1e-10, random_state= i)
    model.fit(df_1)
    inertias_in_each_iteration.append(model.inertia_)
    centroids.append(model.cluster_centers_)
    number_of_iterations.append(model.n_iter_)

print(np.mean(inertias_in_each_iteration))
print(centroids[-1])
print(np.mean(number_of_iterations))
# labels of the last fitted model
labels= model.labels_
labels= pd.Series(labels)
print(silhouette_score(df_1, labels))

#compare this silhouette value with the silhouette value obtained from k-means started from the hierarchical-clustering seeds

In [None]:
#resulting dataframe showing customer R,F,M values and which cluster the customer belongs to

# Attach the k-means labels (shifted from 0-based to 1-based) as a "Cluster" column.
# np.asarray(...) drops the index of `labels`, so the column is added by position and by
# name: the result does not depend on the indexes of df and labels matching (as an
# index-aligned concat does) nor on the column order of df (as a positional rename does).
RFM_1 = df.assign(Cluster= np.asarray(labels) + 1)
RFM_1.head()


In [None]:
RFM_1.describe()

In [None]:
RFM_1.info()

In [None]:
# number of customers in each cluster

RFM_1.groupby("Cluster").CustomerID.count()

In [None]:
Clusters_Recency= pd.DataFrame(RFM_1.groupby("Cluster").Recency.mean())
Clusters_Recency

In [None]:
Clusters_Frequency= pd.DataFrame(RFM_1.groupby("Cluster").Frequency.mean())
Clusters_Frequency

In [None]:
Clusters_Monetary= pd.DataFrame(RFM_1.groupby("Cluster").Monetary.mean())
Clusters_Monetary

In [None]:
Clusters_R= pd.DataFrame(RFM_1.groupby("Cluster").R.mean())
Clusters_R

In [None]:
Clusters_F= pd.DataFrame(RFM_1.groupby("Cluster").F.mean())
Clusters_F

In [None]:
Clusters_M= pd.DataFrame(RFM_1.groupby("Cluster").M.mean())
Clusters_M

In [None]:
# Cluster centroids: per-cluster mean of the raw (Recency, Frequency, Monetary)
# and scaled (R, F, M) attributes, one row per cluster.
centroid_columns = ["Recency", "Frequency", "Monetary", "R", "F", "M"]
clusters_1= RFM_1.groupby("Cluster")[centroid_columns].mean()
clusters_1

In [None]:
# Creating 3 new dataframes, one for each cluster label (1, 2, 3).
cluster_1, cluster_2, cluster_3 = (
    RFM_1[RFM_1["Cluster"] == cluster_label] for cluster_label in (1, 2, 3)
)
cluster_1.head(10)

In [None]:
g= sns.catplot(x="Cluster", y="Recency", data= RFM_1, kind="box")

In [None]:
g= sns.catplot(x="Cluster", y="Recency", data= RFM_1, kind="bar")

In [None]:
g= sns.catplot(x="Cluster", y="Frequency", data= RFM_1, kind="box")

In [None]:
g= sns.catplot(x="Cluster", y="Frequency", data= RFM_1, kind="bar")

In [None]:
g= sns.catplot(x="Cluster", y="Monetary", data= RFM_1, kind="box")

In [None]:
g= sns.catplot(x="Cluster", y="Monetary", data= RFM_1, kind="bar")

In [None]:
sns.relplot(x="Recency", y="Frequency",
           data=RFM_1, kind="scatter",
           hue="Cluster", style="Cluster", alpha=0.7)
plt.show()

In [None]:
sns.relplot(x="Recency", y="Monetary",
           data=RFM_1, kind="scatter",
           hue="Cluster", style="Cluster", alpha=0.7)
plt.show()

In [None]:
sns.relplot(x="Frequency", y="Monetary",
           data=RFM_1, kind="scatter",
           hue="Cluster", style="Cluster", alpha=0.7)
plt.show()

In [None]:
sns.pairplot(RFM_1, vars=["Recency", "Frequency", "Monetary"], hue="Cluster",
             diag_kind= 'auto', diag_kws={'alpha': 0.4}, corner=False,
            palette="husl", plot_kws={'alpha': 0.4},  height=3, aspect=1)

In [None]:
RFM_1.Recency.mean()

In [None]:
RFM_1.Frequency.mean()

In [None]:
RFM_1.Monetary.mean()