In [None]:
# -*- coding: utf-8 -*-
"""
Created on Sat Nov  14 10:31:35 2020
@author: harikumar balakrishnan
"""

#### Load Libraries
Load Libraries for Data Manipulation

In [None]:
import pandas as pd
import numpy as np
import random 
import datetime as dt
import re
import pickle

Load Libraries for WordCloud Analysis

In [None]:
import nltk, warnings
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

Load Libraries for Kmeans Clustering

In [None]:
!pip install --upgrade pip
!pip install feature_engine

In [None]:
from datetime import timedelta
from scipy.stats import chi2_contingency
from sklearn.preprocessing import LabelEncoder, StandardScaler, Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.manifold import TSNE
from feature_engine.outlier_removers import Winsorizer

Load Libraries for Plot Libraries 

In [None]:
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cm
from sklearn import metrics
import plotly.graph_objects as go
import plotly.express as px
pd.set_option('display.max_rows', 100)
%config InlineBackend.figure_format = 'svg'

In [None]:
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so pandas chained-assignment warnings
# and library deprecation warnings are hidden too — consider narrowing it.
warnings.filterwarnings("ignore")

Reading the dataset - Kaggle Dataset Ecom - Clarie1

In [None]:
# Load the raw transactions. Identifier columns are forced to `str` so they
# are not parsed as numbers. The invoice column is `InvoiceNo` (the name the
# aggregations further down use); the previous `InvoiceID` key matched no
# column referenced anywhere in this notebook.
df_ecom = pd.read_csv(
    '../input/ecommerce-data/data.csv',
    encoding="ISO-8859-1",
    dtype={'CustomerID': str, 'InvoiceNo': str},
)
# Only the last expression of a cell is rendered, so `shape` is what shows.
df_ecom.head()
df_ecom.shape

#### Data Prep & Manipulation

In [None]:
# Work on an independent copy of the raw frame and start every row with
# zero cancelled units.
df_cleaned = df_ecom.copy(deep=True).assign(QuantityCanceled=0)

In [None]:
# Two empty accumulators (nothing in the cells shown here appends to them).
entry_to_remove = []
doubtfull_entry = []

In [None]:
# Rows with a negative quantity whose stock code is anything other than 'D'.
remaining_entries = df_cleaned.loc[
    (df_cleaned['Quantity'] < 0) & (df_cleaned['StockCode'] != 'D')
]
print("nb of entries to delete: {}".format(len(remaining_entries.index)))
remaining_entries.head(5)

In [None]:
# Remove the rows selected above from the working frame.
df_cleaned = df_cleaned.drop(index=remaining_entries.index)

In [None]:
# Distinct stock codes that begin with at least one ASCII letter.
starts_with_letter = df_cleaned['StockCode'].str.contains('^[a-zA-Z]+', regex=True)
list_special_codes = df_cleaned.loc[starts_with_letter, 'StockCode'].unique()
list_special_codes

In [None]:
# Drop every row whose stock code is one of the special codes below,
# in a single membership test instead of one filter per code.
special_stock_codes = ['POST', 'D', 'C2', 'M', 'BANK CHARGES', 'PADS', 'DOT']
df_cleaned = df_cleaned[~df_cleaned['StockCode'].isin(special_stock_codes)]

In [None]:
# Peek at zero-priced rows (value is not displayed: not the last expression).
df_cleaned.loc[df_cleaned['UnitPrice'] == 0].head(5)
# Line total = unit price x (ordered units - cancelled units).
units_kept = df_cleaned['Quantity'] - df_cleaned['QuantityCanceled']
df_cleaned['TotalPrice'] = df_cleaned['UnitPrice'] * units_kept

#### Word Cloud / Cohort Analysis<br>
This analysis is restricted to the United Kingdom, which accounts for the highest proportion of the data compared with the other countries.

In [None]:
# United Kingdom rows only. `.copy()` makes this an independent frame, so the
# column assignments below (and in the cohort cells) write to `uk_ecom`
# itself rather than to a filtered view of `df_cleaned`.
uk_ecom = df_cleaned[df_cleaned['Country'] == 'United Kingdom'].copy()
# Descriptions are cast to str so they can be joined and regex-processed.
uk_ecom['Description'] = uk_ecom['Description'].astype(str)
# 20 most frequent whitespace-separated tokens across all descriptions.
freq = pd.Series(' '.join(uk_ecom['Description']).split()).value_counts().head(20)
freq

#### Identify uncommon words

In [None]:
# 20 least frequent whitespace-separated tokens across all descriptions.
description_tokens = ' '.join(uk_ecom['Description']).split()
freq1 = pd.Series(description_tokens).value_counts().tail(20)
freq1

In [None]:
# English stop words plus a few domain words that carry no product meaning.
stop_words = set(stopwords.words("english"))
new_words = ['RED','PINK', 'BLUE', 'OF', 'BROWN',"BLACK"]
# Descriptions are lower-cased before the stop-word filter is applied in the
# corpus loop, so the upper-case spellings alone would never match there.
# Register both spellings.
stop_words = stop_words.union(new_words, (word.lower() for word in new_words))

In [None]:
# Sanity check: echo each custom word that made it into the stop-word set.
for custom_word in filter(stop_words.__contains__, new_words):
    print(custom_word)

In [None]:
# Cleaned description strings are collected here by the next cell.
corpus = list()

In [None]:
# Number of leading descriptions fed to the word cloud (same value the
# original index loop used).
N_DESCRIPTIONS = 8789

# Built once; the previous loop re-created it (and an unused stemmer) per row.
lemmatizer = WordNetLemmatizer()

# Slicing instead of indexing 0..N-1 means a frame shorter than
# N_DESCRIPTIONS is processed in full rather than raising IndexError.
for description in uk_ecom['Description'].iloc[:N_DESCRIPTIONS]:
    # Keep ASCII letters only, lower-case, and split on whitespace. After this
    # substitution the text holds only letters and spaces, so the former
    # tag-stripping and digit/punctuation regex passes could not match
    # anything and are omitted.
    tokens = re.sub('[^a-zA-Z]', ' ', description).lower().split()
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    corpus.append(" ".join(kept))

In [None]:
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Build the word cloud from the cleaned descriptions.
# The documents are joined with spaces: passing str(corpus) would feed the
# list's own brackets, commas and quote characters to the tokenizer.
wordcloud = WordCloud(
    background_color='white',
    stopwords=stop_words,
    max_words=200,
    max_font_size=50,
    random_state=42,
).generate(" ".join(corpus))

In [None]:
# Render the word cloud on its own axes with the frame and ticks hidden.
wordcloud_fig, wordcloud_ax = plt.subplots(figsize=(10, 5))
wordcloud_ax.imshow(wordcloud)
wordcloud_ax.axis('off')
wordcloud_ax.set_title('Word Cloud for Customer\'s Products')

#### Cohort Analysis

In [None]:
def extract_days(x):
    """Return a ``datetime`` at midnight of the calendar day of ``x``.

    ``x`` is any object exposing ``year``, ``month`` and ``day`` attributes.
    """
    year, month, day = x.year, x.month, x.day
    return dt.datetime(year=year, month=month, day=day)

In [None]:
# Calendar date of each invoice (the time-of-day part is dropped).
uk_ecom['date'] = pd.DatetimeIndex(uk_ecom['InvoiceDate']).date
# Same date turned into a midnight datetime by extract_days.
uk_ecom['InvoiceDay'] = uk_ecom['date'].apply(extract_days)
# Earliest invoice day per customer, broadcast back to that customer's rows.
grouping = uk_ecom.groupby('CustomerID')['InvoiceDay']
uk_ecom['CohortDay'] = grouping.transform('min')
print(uk_ecom.head())

Function to extract Year/Month

In [None]:
def extract_month_int(x):
    """Return a ``datetime`` for the first day of the month ``x`` falls in.

    ``x`` is any object exposing ``year`` and ``month`` attributes.
    """
    first_of_month = dt.datetime(year=x.year, month=x.month, day=1)
    return first_of_month

Create a column InvoiceMonth

In [None]:
# First day of each invoice's month, via extract_month_int.
uk_ecom['InvoiceMonth'] = uk_ecom['date'].apply(extract_month_int)
# Earliest invoice month per customer, broadcast back to that customer's rows.
grouping = uk_ecom.groupby('CustomerID')['InvoiceMonth']
uk_ecom['CohortMonth'] = grouping.transform('min')
uk_ecom.head()

In [None]:
def extract_dates_int(df, column):
    """Split a datetime column into its year, month and day components.

    Parameters
    ----------
    df : frame holding the column.
    column : name of a datetime-typed column of ``df``.

    Returns
    -------
    tuple of three Series: ``(year, month, day)``.
    """
    parts = df[column].dt
    return parts.year, parts.month, parts.day

In [None]:
# Year and month of each invoice and of the customer's cohort (day unused).
invoice_year, invoice_month, _ = extract_dates_int(df=uk_ecom, column='InvoiceMonth')
cohort_year, cohort_month, _ = extract_dates_int(df=uk_ecom, column='CohortMonth')
# Element-wise gaps between invoice month and cohort month.
years_difference = invoice_year.sub(cohort_year)
months_difference = invoice_month.sub(cohort_month)

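The cohort index is the number of months between the invoice month and the customer's cohort month (12 months per year of difference plus the month difference), plus 1 so that the first month is indexed 1 rather than 0.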

In [None]:
# Months elapsed since the customer's first purchase month, 1-based.
uk_ecom['CohortIndex'] = 12 * years_difference + months_difference + 1
# Distinct customers per (cohort month, months-since-first-purchase) cell.
grouping = uk_ecom.groupby(['CohortMonth', 'CohortIndex'])
cohort_data = grouping['CustomerID'].nunique().reset_index()

Creating cohort pivot table 

In [None]:
# Rows: cohort month; columns: cohort index; values: distinct customers.
cohort_counts = cohort_data.pivot(
    index='CohortMonth',
    columns='CohortIndex',
    values='CustomerID',
)
# The first column holds each cohort's starting size.
cohort_sizes = cohort_counts.iloc[:, 0]
# Share of the starting cohort still active at each index.
retention = cohort_counts.div(cohort_sizes, axis='index')

Review the retention table

In [None]:
# NOTE(review): this expression is neither assigned nor the cell's last
# statement, so the percentage view of `retention` is computed and discarded.
retention.round(3) * 100
# Mean quantity per (cohort month, cohort index) cell ...
grouping_avg_quantity = uk_ecom.groupby(['CohortMonth', 'CohortIndex'])
cohort_data_avg_quantity = grouping_avg_quantity['Quantity'].mean().reset_index()
# ... reshaped to cohort month (rows) x cohort index (columns).
average_quantity = cohort_data_avg_quantity.pivot(index = 'CohortMonth', columns = 'CohortIndex', values = 'Quantity')
# Displayed rounded to one decimal, with missing cells shown as blanks.
average_quantity.round(1).fillna('')

Build a figure

In [None]:
# Retention heatmap: one row per cohort month, one column per cohort index.
retention_fig, retention_ax = plt.subplots(figsize=(10, 5))
retention_ax.set_title('Retention Rate for Customers in United Kingdom')
sns.heatmap(
    data=retention,
    annot=True,
    fmt='.0%',
    vmin=0.01,
    vmax=0.5,
    cmap='BuGn',
    ax=retention_ax,
)
plt.show()

Initialize a heatmap graph

#### RFM Analysis <br>
Calculation of Monetary value

In [None]:
# Monetary value of each line item: units x unit price.
df_cleaned['TotalSum'] = df_cleaned['Quantity'].mul(df_cleaned['UnitPrice'])

Calculation Recency 

In [None]:
# Parse invoice timestamps, then fix the reference date for recency at one
# day after the most recent invoice.
df_cleaned['InvoiceDate'] = pd.to_datetime(df_cleaned['InvoiceDate'])
latest_invoice = df_cleaned['InvoiceDate'].max()
snapshot_date = latest_invoice + timedelta(days=1)

Grouping by CustomerID and Calculation frequency 

In [None]:
# Per-customer aggregates: days since the last invoice (measured from
# snapshot_date), number of invoice rows, and total spend.
rfm_aggregations = {
    'InvoiceDate': lambda dates: (snapshot_date - dates.max()).days,
    'InvoiceNo': 'count',
    'TotalSum': 'sum',
}
rfm = df_cleaned.groupby(['CustomerID']).agg(rfm_aggregations)

Rename the columns 

In [None]:
# Give the aggregated columns their RFM names.
rfm = rfm.rename(
    columns={
        'InvoiceDate': 'Recency',
        'InvoiceNo': 'Frequency',
        'TotalSum': 'Monetary',
    }
)

--Calculate R and F groups-- Create labels for Recency and Frequency

In [None]:
# Recency labels run 4 -> 1 (lowest recency quartile gets 4);
# frequency labels run 1 -> 4.
r_labels = range(4, 0, -1)
f_labels = range(1, 5)

Assign these labels to 4 equal percentile groups 

In [None]:
# Quartile-bin Recency and label the bins with r_labels.
r_groups = pd.qcut(x=rfm['Recency'], q=4, labels=r_labels)

Assign these labels to 4 equal percentile groups 

In [None]:
# Quartile-bin Frequency and label the bins with f_labels.
f_groups = pd.qcut(x=rfm['Frequency'], q=4, labels=f_labels)

Create new columns R and F 

In [None]:
# Attach the quartile labels as columns R and F (by position, via .values).
rf_columns = {'R': r_groups.values, 'F': f_groups.values}
rfm = rfm.assign(**rf_columns)

Create labels for Monetary

In [None]:
# Monetary labels run 1 -> 4.
m_labels = range(1, 4 + 1)

Assign these labels to four equal percentile groups 

In [None]:
# Quartile-bin Monetary and label the bins with m_labels.
m_groups = pd.qcut(x=rfm['Monetary'], q=4, labels=m_labels)

Create new column M for Monetary

In [None]:
# Attach the monetary quartile labels as column M (by position, via .values).
rfm = rfm.assign(**{'M': m_groups.values})

Build the concatenated RFM segment string (the R, F and M labels joined together)

In [None]:
def join_rfm(x):
    """Concatenate the string forms of ``x['R']``, ``x['F']`` and ``x['M']``."""
    return ''.join(str(x[key]) for key in ('R', 'F', 'M'))
# Row-wise: build each customer's three-character segment code.
rfm['RFM_Segment_Concat'] = rfm.apply(join_rfm, axis='columns')

Calculate RFM_Score

In [None]:
# R, F and M come from pd.qcut(..., labels=...) and are therefore categorical.
# Convert them to integers explicitly before adding, so the score does not
# depend on how pandas reduces categorical columns.
rfm['RFM_Score'] = rfm[['R', 'F', 'M']].astype(int).sum(axis=1)

Define rfm_level function

In [None]:
def rfm_level(df):
    """Map a row's ``RFM_Score`` to a segment name.

    ``df`` is a single row (anything indexable by ``'RFM_Score'``). The score
    is compared against descending floors and the first floor it reaches
    decides the label; scores below 4 fall through to 'Require Activation'.
    """
    score = df['RFM_Score']
    tiers = (
        (9, "Can't Loose Them"),
        (8, 'Champions'),
        (7, 'Loyal'),
        (6, 'Potential'),
        (5, 'Promising'),
        (4, 'Needs Attention'),
    )
    for floor, label in tiers:
        if score >= floor:
            return label
    return 'Require Activation'
    
# Label every customer with its segment, then move the first index level
# (CustomerID) out of the index into a regular column.
rfm['RFM_Level'] = rfm.apply(rfm_level, axis='columns')
rfm = rfm.reset_index(level=0)

Plotting RFM_Level

In [None]:
# Distinct customers per segment, largest segment first.
plo1 = (
    rfm.groupby('RFM_Level')['CustomerID']
    .nunique()
    .sort_values(ascending=False)
    .reset_index()
)
sns.set(rc={'figure.figsize': (8, 4)})
# Horizontal bars: segment on the y axis, customer count on the x axis.
sns.barplot(x='CustomerID', y='RFM_Level', data=plo1, orient='h', palette='Greens_d')

In [None]:
rfm_uc = rfm.copy()
# 80th percentile of the numeric columns only: the frame also carries string
# columns (CustomerID, segment labels), which quantile() cannot process.
quantiles = rfm_uc.quantile(q=[0.8], numeric_only=True)
print(quantiles)

# Scalar cut-offs, truncated to int as before. Reading them with .loc avoids
# calling int() on a one-element array.
recency_cut = int(quantiles.loc[0.8, 'Recency'])
frequency_cut = int(quantiles.loc[0.8, 'Frequency'])
monetary_cut = int(quantiles.loc[0.8, 'Monetary'])

# Binary scores: 2 when at or on the "good" side of the cut-off, else 1
# (low recency is good; high frequency and high spend are good).
rfm_uc['R'] = np.where(rfm_uc['Recency'] <= recency_cut, 2, 1)
rfm_uc['F'] = np.where(rfm_uc['Frequency'] >= frequency_cut, 2, 1)
rfm_uc['M'] = np.where(rfm_uc['Monetary'] >= monetary_cut, 2, 1)
rfm_uc.head()

To do the 2 x 2 matrix we will only use Recency & Monetary

In [None]:
# Two-character key: monetary score followed by recency score.
rfm_uc['RMScore'] = rfm_uc.M.map(str)+rfm_uc.R.map(str)
# NOTE(review): reset_index() moves the current index into a new column on
# every run, so this cell is not idempotent when re-executed.
rfm_uc = rfm_uc.reset_index()
# Per-RMScore summary: number of distinct customers, and the mean of each
# metric rounded to 0 decimals.
df_RFM_SUM = rfm_uc.groupby('RMScore').agg({'CustomerID': lambda y: len(y.unique()),
                                        'Frequency': lambda y: round(y.mean(),0),
                                        'Recency': lambda y: round(y.mean(),0),
                                        'R': lambda y: round(y.mean(),0),
                                        'M': lambda y: round(y.mean(),0),
                                        'Monetary': lambda y: round(y.mean(),0)})
# Highest RMScore key first.
df_RFM_SUM = df_RFM_SUM.sort_values('RMScore', ascending=False)
df_RFM_SUM.head()

1) Average Monetary Matrix

In [None]:
# M (rows) x R (columns) grid of mean monetary value, highest M row first.
monetary_grid = df_RFM_SUM.pivot(index='M', columns='R', values='Monetary')
df_RFM_M = (
    monetary_grid
    .reset_index()
    .sort_values(by=['M'], ascending=False)
    .set_index(['M'])
)
print(df_RFM_M)

2) Number of Customer Matrix

In [None]:
# M (rows) x R (columns) grid of customer counts, highest M row first.
customer_grid = df_RFM_SUM.pivot(index='M', columns='R', values='CustomerID')
df_RFM_C = (
    customer_grid
    .reset_index()
    .sort_values(by=['M'], ascending=False)
    .set_index(['M'])
)
print(df_RFM_C)

3) Recency

In [None]:
# M (rows) x R (columns) grid of mean recency, highest M row first.
recency_grid = df_RFM_SUM.pivot(index='M', columns='R', values='Recency')
df_RFM_R = (
    recency_grid
    .reset_index()
    .sort_values(by=['M'], ascending=False)
    .set_index(['M'])
)
print(df_RFM_R)

#### Kmeans Clustering
Normalization for Kmeans Clustering

In [None]:
# Outlier capper for the three RFM features.
# tail='both' caps the left and the right tail; distribution and fold select
# the capping rule and its width (see the feature_engine Winsorizer docs).
windsoriser = Winsorizer(
    variables=['Recency', 'Frequency', 'Monetary'],
    distribution='skewed',
    tail='both',
    fold=2,
)

In [None]:
# Two independent copies of the customer-level RFM columns: one to be
# log-transformed and scaled, one kept in original units. Explicit .copy()
# because both frames have a column reassigned in the next cell; without it
# those writes would target slices of rfm_uc.
rfm_columns = ['CustomerID', 'Recency', 'Frequency', 'Monetary']
df_rfm_log = rfm_uc[rfm_columns].copy()
rfm_or = rfm_uc[rfm_columns].copy()
df_rfm_log.describe()

In [None]:
# Customer ids were read as text; store them as integers in both frames.
for rfm_frame in (df_rfm_log, rfm_or):
    rfm_frame['CustomerID'] = rfm_frame['CustomerID'].astype(int)

In [None]:
# log(x + 1) of the three features, then cap outliers with the winsorizer
# fitted on those logged values.
log_features = np.log(df_rfm_log[['Recency', 'Frequency', 'Monetary']] + 1)
windsoriser.fit(log_features)
df_rfm_log = windsoriser.transform(log_features)
# Fit the standardiser on the capped, logged features (fit returns the
# estimator, which is what the cell displays).
scaler = StandardScaler()
scaler.fit(df_rfm_log)

In [None]:
# Standardised features, wrapped back into a frame with the original
# column names (the row index restarts at 0).
RFM_Table_scaled = pd.DataFrame(
    scaler.transform(df_rfm_log),
    columns=df_rfm_log.columns,
)

K-means Silhouette Score

In [None]:
# Cluster counts to evaluate (currently the single value 5) and the
# per-run score lists filled in by the evaluation loop.
range_n_clusters = range(5, 6)
eval_scores = {
    metric: []
    for metric in ('model', 'n_clusters', 's_score', 'c_score', 'db_score')
}

In [None]:
# For every candidate cluster count: fit K-means, record three clustering
# scores in eval_scores, and draw a silhouette plot (one band per cluster).
for n_clusters in range_n_clusters:
    fig, ax = plt.subplots()

    ax.set_xlim([-0.1, 1])
    # Room for every sample plus a 10-row gap around each cluster band.
    ax.set_ylim([0, len(RFM_Table_scaled) + (n_clusters + 1) * 10])

    # Fit with the cluster count under evaluation. This was hard-coded to 4,
    # so the recorded n_clusters and the number of plotted bands disagreed
    # with the model that was actually fitted.
    clusterer = KMeans(n_clusters=n_clusters, random_state=77, init='k-means++')
    cluster_labels = clusterer.fit_predict(RFM_Table_scaled)
    silhouette_avg = silhouette_score(RFM_Table_scaled, cluster_labels, random_state=77)

    eval_scores['model'].append('KMeans')
    eval_scores['n_clusters'].append(n_clusters)
    eval_scores['s_score'].append(silhouette_avg)
    eval_scores['c_score'].append(metrics.calinski_harabasz_score(RFM_Table_scaled, cluster_labels))
    eval_scores['db_score'].append(metrics.davies_bouldin_score(RFM_Table_scaled, cluster_labels))

    # Per-sample silhouette values, drawn as one sorted band per cluster.
    sample_silhouette_values = silhouette_samples(RFM_Table_scaled, cluster_labels)
    y_lower = 10
    for i in range(n_clusters):
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax.fill_betweenx(np.arange(y_lower, y_upper),
                         0, ith_cluster_silhouette_values,
                         facecolor=color, edgecolor=color, alpha=0.7)
        # Cluster number, placed at the vertical middle of its band.
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10  # leave a 10-row gap before the next band

    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")
    # Dashed red line marks the mean silhouette score over all samples.
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax.set_yticks([])  # Clear the yaxis labels / ticks
    ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

In [None]:
# Render pending figures, then write the silhouette figure to disk.
# `fig` is the Figure bound by the last iteration of the silhouette loop above.
plt.show()
fig.savefig('silhouette.png')

Kmeans Function


In [None]:
def kmeans(normalised_df_rfm, clusters_number, original_df_rfm):
    """Cluster customers with K-means and draw a 2-D t-SNE view of the result.

    Parameters
    ----------
    normalised_df_rfm : scaled feature table the model is fitted on.
    clusters_number : number of clusters to fit.
    original_df_rfm : frame with one row per row of ``normalised_df_rfm``,
        in the same order; it is not modified.

    Returns
    -------
    A copy of ``original_df_rfm`` with an added ``Cluster`` column holding
    each row's cluster label.

    Side effect: draws a scatter plot on the current matplotlib figure.
    """
    # Named `clusterer` so the local does not shadow this function's own name.
    clusterer = KMeans(n_clusters=clusters_number, random_state=1)
    clusterer.fit(normalised_df_rfm)
    cluster_labels = clusterer.labels_

    # Attach the labels to a copy of the caller's frame.
    df_new = original_df_rfm.assign(Cluster=cluster_labels)

    # Embed the same scaled features the clusters were fitted on. The labelled
    # frame was embedded before, which mixed unscaled values, any identifier
    # columns and the cluster label itself into the distances.
    transformed = TSNE(random_state=1).fit_transform(normalised_df_rfm)

    plt.title('Flattened Graph of {} Clusters'.format(clusters_number))
    sns.scatterplot(x=transformed[:,0], y=transformed[:,1], hue=cluster_labels, style=cluster_labels, palette="Set1")

    return df_new

In [None]:
# Open a 10x5 figure for the t-SNE scatter, then cluster into 4 groups.
plt.figure(figsize=(10, 5))
df_rfm_k4 = kmeans(
    normalised_df_rfm=RFM_Table_scaled,
    clusters_number=4,
    original_df_rfm=rfm_or,
)

In [None]:
def rfm_values(df):
    """Summarise RFM metrics per cluster.

    Groups ``df`` by its ``Cluster`` column and returns the mean Recency,
    mean Frequency, and the mean and row count of Monetary for each
    cluster, all rounded to 0 decimals.
    """
    aggregations = {
        'Recency': 'mean',
        'Frequency': 'mean',
        'Monetary': ['mean', 'count'],
    }
    per_cluster = df.groupby(['Cluster']).agg(aggregations)
    return per_cluster.round(0)

#### Visualizations for EDA

In [None]:
# Total revenue per country, as a one-column frame sorted ascending.
revenue_per_countries = (
    df_cleaned.groupby(["Country"])["TotalPrice"]
    .sum()
    .sort_values()
    .to_frame()
)
# Each country's share of overall revenue, in percent.
revenue_share = revenue_per_countries['TotalPrice'] / revenue_per_countries['TotalPrice'].sum()
revenue_per_countries['percent'] = revenue_share * 100
# Keep the 15 countries with the largest share.
revenue_per_countries = (
    revenue_per_countries
    .sort_values(by=['percent'], ascending=False)
    .head(15)
)

In [None]:
# Bar chart of revenue share per country; the first three bars are
# recoloured dark green.
plt.figure(figsize=(8, 4))
revenue_per_countries = revenue_per_countries.reset_index()
barplot = plt.bar(
    revenue_per_countries['Country'],
    revenue_per_countries['percent'],
    color='lightgreen',
    alpha=0.90,
)
for bar_position in range(3):
    barplot[bar_position].set_color('darkgreen')
plt.xlabel('Country', fontsize=15, weight='bold')
plt.ylabel('Percent', fontsize=15, weight='bold')
plt.xticks(rotation=45)

Revenue Plot

In [None]:
# Amount of each line item: units x unit price.
df_cleaned['AmountSpent'] = df_cleaned['Quantity'] * df_cleaned['UnitPrice']
# One row per (invoice, timestamp): total amount plus the invoice's customer
# and country. Aggregations are named by string; passing the Python builtins
# `sum` / `max` to agg is deprecated in recent pandas.
invoice_customer_df = (
    df_cleaned
    .groupby(by=['InvoiceNo', 'InvoiceDate'])
    .agg({'AmountSpent': 'sum', 'CustomerID': 'max', 'Country': 'max'})
    .reset_index()
)
invoice_customer_df.head()

In [None]:
# Line items indexed by invoice timestamp, for month-end resampling.
line_items_by_date = df_cleaned.set_index('InvoiceDate')

# Distinct customers and total revenue per calendar month.
monthly_unique_customer_df = line_items_by_date['CustomerID'].resample('M').nunique()
monthly_revenue_df = line_items_by_date['AmountSpent'].resample('M').sum()

# Invoices of customers with more than one invoice in the same calendar
# month. Computed once and reused for both repeat-customer series.
repeat_invoices_df = (
    invoice_customer_df
    .set_index('InvoiceDate')
    .groupby([pd.Grouper(freq='M'), 'CustomerID'])
    .filter(lambda x: len(x) > 1)
)

# The column is selected before aggregating, so the string identifier
# columns are never summed or counted.
monthly_rev_repeat_customer_df = repeat_invoices_df['AmountSpent'].resample('M').sum()
# Repeat-customer revenue as a percentage of all revenue in the month.
monthly_rev_per_repeat_customers_df = monthly_rev_repeat_customer_df / monthly_revenue_df * 100
monthly_repeat_customers_df = repeat_invoices_df['CustomerID'].resample('M').nunique()
monthly_repeat_customers_df

Plotting the Repeat Revenue Percentage

In [None]:
# Left axis: monthly repeat customers and all customers as lines.
# Values are plotted against positions 0..n-1 (the date index is dropped).
ax=pd.DataFrame(monthly_repeat_customers_df.values).plot(figsize=(12,8))
pd.DataFrame(monthly_unique_customer_df.values).plot(ax=ax,grid=True)
# Right axis (shared x): repeat-customer revenue share as translucent bars.
ax2=ax.twinx()
pd.DataFrame(monthly_rev_per_repeat_customers_df.values).plot(ax=ax2,kind='bar',color='lightgrey',alpha=0.3)
# NOTE(review): this y-limit is overridden by the ax2.set_ylim([0,100]) call further down.
ax2.set_ylim([0,max(monthly_rev_per_repeat_customers_df.values)+30])
ax2.set_ylabel('Percentage (%)')
# NOTE(review): tick labels are set again by the plt.xticks call at the end.
ax2.set_xticklabels([x.strftime('%m.%Y') for x in monthly_rev_per_repeat_customers_df.index])
ax.set_xlabel('Date')
ax.set_ylabel('Number of Customers')
ax.set_title('Number of Unique vs. Repeat Customers Over Time')
# Legend entries follow the order in which the two lines were drawn.
ax.legend(['Repeat Customers','All Customers'])
ax2.legend(['Repeat Revenue Percentage'],loc='upper right')
# Final axis ranges: headroom of 100 above the busiest month; percentages 0-100.
ax.set_ylim([0,monthly_unique_customer_df.values.max()+100])
ax2.set_ylim([0,100])
# One tick per month, labelled MM.YYYY and rotated for readability.
plt.xticks(range(len(monthly_repeat_customers_df.index)),[x.strftime('%m.%Y') for x in monthly_repeat_customers_df.index],rotation=45)
plt.show()

#### Thanks