In [4]:
! pip3 install chart_studio

In [5]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc = {'figure.figsize':(12,7)})

import chart_studio as cs
import plotly.offline as po
import plotly.graph_objs as gobj
po.init_notebook_mode(connected=True)

from mpl_toolkits import mplot3d

#### Here, we are using the Online Retail dataset, which contains transactions between 01/12/2010 and 09/12/2011
http://archive.ics.uci.edu/ml/datasets/online+retail

In [6]:
Rtl_data = pd.read_csv("../input/online-retailcsv/Online Retail.csv", encoding = 'unicode_escape',index_col=False)
Rtl_data.head()

In [7]:
Rtl_data.shape

#### Customer Distribution by country

In [8]:
country_cust_data=Rtl_data[['Country','CustomerID']].drop_duplicates()
country_cust_data.groupby(['Country'])['CustomerID'].aggregate('count').reset_index().sort_values('CustomerID', ascending=False)

> As the table above shows, customers are distributed very unevenly across countries. For clearer visualization, the analysis can therefore be performed separately for each country.
**Now, we will Keep only United Kingdom data for further analysis.**

In [9]:
Rtl_data = Rtl_data.query("Country=='United Kingdom'").reset_index(drop=True)

#### Data Cleaning Steps:

In [10]:
#missing values in the dataset
Rtl_data.isnull().sum(axis=0)

In [11]:
#Remove missing values from CustomerID column, can ignore missing values in description column
Rtl_data = Rtl_data[pd.notnull(Rtl_data['CustomerID'])]

In [12]:
#Validate if there are any negative values in Quantity column
Rtl_data.Quantity.min()

In [13]:
#Validate if there are any negative values in UnitPrice column
Rtl_data.UnitPrice.min()

In [14]:
#Keep only records whose Quantity is strictly positive (drops zero and negative quantities).
#.copy() makes the result an independent DataFrame, so the column assignments in the
#following cells (InvoiceDate, TotalAmount) do not raise SettingWithCopyWarning.
Rtl_data = Rtl_data[Rtl_data['Quantity'] > 0].copy()

In [15]:
#Convert the string date field to datetime
Rtl_data['InvoiceDate'] = pd.to_datetime(Rtl_data['InvoiceDate'])

In [16]:
Rtl_data.shape

#### Feature Extraction Steps:

In [17]:
#Adding new column depicting total amount
Rtl_data['TotalAmount'] = Rtl_data['Quantity'] * Rtl_data['UnitPrice']

In [18]:
Rtl_data.head()

#  **RFM Modelling**


* Recency(R) = Latest Date - Last Invoice Date
* Frequency(F) = Count of invoice records (transactions) for each customer
* Monetary(M) = Sum of Total Amount for each customer

Note: A lower value of R is better, while higher values of F and M are better.

In [19]:
import datetime as dt

#Reference date for recency: one day after the last invoice date (2011-12-09),
#so that recency is measured in days since the most recent purchase.
Latest_Date = dt.datetime(2011, 12, 10)

#One aggregation per source column, evaluated for every CustomerID group
rfm_aggregations = {
    #whole days between the reference date and the customer's latest invoice
    'InvoiceDate': lambda dates: (Latest_Date - dates.max()).days,
    #number of rows in the group
    #NOTE(review): this counts rows, not distinct InvoiceNo values - confirm this is the intended "Frequency"
    'InvoiceNo': lambda invoices: len(invoices),
    #total amount over all rows of the customer
    'TotalAmount': lambda amounts: amounts.sum(),
}
RFMScores = Rtl_data.groupby('CustomerID').agg(rfm_aggregations)

#The day difference is stored as an integer column
RFMScores['InvoiceDate'] = RFMScores['InvoiceDate'].astype(int)

#Give the aggregated columns their RFM names
RFMScores = RFMScores.rename(columns={
    'InvoiceDate': 'Recency',
    'InvoiceNo': 'Frequency',
    'TotalAmount': 'Monetary',
})

RFMScores.reset_index().head()

In [20]:
#Condition for better results: keep only customers whose Frequency is at least three.
#.copy() detaches the filtered frame, so adding the R/F/M columns in later cells
#does not raise SettingWithCopyWarning.
RFMScores = RFMScores.loc[RFMScores["Frequency"] >= 3].copy()
RFMScores.reset_index().head()

> #####  Recency Descriptive Statistics 

In [21]:
RFMScores.Recency.describe()

In [22]:
x = RFMScores['Recency']
sns.histplot(x, label="100% Equities", kde=True, linewidth=0)
plt.title("Recency Distribution Curve")

> #####  Frequency Descriptive Statistics 

In [23]:
RFMScores.Frequency.describe()

In [24]:
x = RFMScores.query('Frequency < 1000')['Frequency']
sns.histplot(x, label="100% Equities", kde=True, linewidth=0)
plt.title("Frequency Distribution Curve")

> #####  Monetary Descriptive Statistics 

In [25]:
RFMScores.Monetary.describe()

In [26]:
#Only customers with Monetary below 10000 are plotted (presumably to keep extreme values from stretching the axis)
x = RFMScores.query('Monetary < 10000')['Monetary']
sns.histplot(x, label="Monetary", kde=True, linewidth=0)
plt.title("Monetary Distribution Curve")

Now, we will create **segments for R,F and M** using quantile values.

In [27]:
#Quartile boundaries (25%, 50%, 75%) of the three RFM metrics, as {column: {quantile: value}}.
#The columns are selected explicitly so this cell stays re-runnable after the
#non-numeric columns (RFMGroup, RFM_Loyalty_Level, Color) have been added to RFMScores.
quantiles = RFMScores[['Recency', 'Frequency', 'Monetary']].quantile(q=[0.25,0.5,0.75])
quantiles = quantiles.to_dict()
quantiles

In [28]:
#Function to create R segments, where '1' represents lower values of R
def RScoring(x,p,d):
    """Return the recency segment (1-4) of a value; 1 is the lowest quartile.

    x -- the value to score
    p -- name of the metric, used as key into ``d``
    d -- mapping {metric: {0.25: q1, 0.50: q2, 0.75: q3}} of quartile cut-offs

    The value gets the number of the first quartile cut-off it does not
    exceed; anything above the 75% cut-off is segment 4.
    """
    cutoffs = d[p]
    for segment, quartile in enumerate((0.25, 0.50, 0.75), start=1):
        if x <= cutoffs[quartile]:
            return segment
    return 4

In [29]:
#Function to create F and M segments, where '1' represents higher values of both F and M
def FnMScoring(x,p,d):
    """Return the frequency/monetary segment (1-4) of a value; 1 is the highest quartile.

    x -- the value to score
    p -- name of the metric, used as key into ``d``
    d -- mapping {metric: {0.25: q1, 0.50: q2, 0.75: q3}} of quartile cut-offs

    Values up to the 25% cut-off score 4, up to the 50% cut-off 3, up to the
    75% cut-off 2, and everything above scores 1.
    """
    cutoffs = d[p]
    for segment, quartile in zip((4, 3, 2), (0.25, 0.50, 0.75)):
        if x <= cutoffs[quartile]:
            return segment
    return 1

In [30]:
#Calculate and Add R, F and M segment value columns in the existing dataset to show R, F and M segment values
RFMScores['R'] = RFMScores['Recency'].apply(RScoring, args=('Recency',quantiles,))
RFMScores['F'] = RFMScores['Frequency'].apply(FnMScoring, args=('Frequency',quantiles,))
RFMScores['M'] = RFMScores['Monetary'].apply(FnMScoring, args=('Monetary',quantiles,))
RFMScores.head()

In [31]:
#Calculate and Add RFMGroup value column showing combined concatenated score of RFM
RFMScores['RFMGroup'] = RFMScores.R.map(str) + RFMScores.F.map(str) + RFMScores.M.map(str)

#Calculate and Add RFMScore value column showing total sum of RFMGroup values
RFMScores['RFMScore'] = RFMScores[['R', 'F', 'M']].sum(axis = 1)
RFMScores.head()

Note: In the table above, **lower values** of RFMScore represent the **most loyal customers**.

In [32]:
#Assign a loyalty level to each customer based on the combined RFMScore
Loyalty_Level = ['Platinum', 'Gold', 'Silver', 'Bronze']
#qcut splits RFMScore into 4 quantile-based bins; the labels are applied in ascending
#score order, so the lowest scores are labelled 'Platinum' and the highest 'Bronze'.
Score_cuts = pd.qcut(RFMScores.RFMScore, q = 4, labels = Loyalty_Level)
RFMScores['RFM_Loyalty_Level'] = Score_cuts.values

#Preview the first rows with CustomerID turned back into a regular column
RFMScores.reset_index().head()

In [33]:
#Validating the data for RFMGroup = 111
RFMScores[RFMScores['RFMGroup']=='111'].sort_values('Monetary', ascending=False).reset_index().head(10)

In [34]:
# Function for making Comparison plots
def Comp_plot(field1,field2):
    """Show an interactive scatter plot of two RFMScores columns, coloured by loyalty level.

    field1 -- column of RFMScores plotted on the x axis
    field2 -- column of RFMScores plotted on the y axis

    Only customers with Monetary < 50000 and Frequency < 2000 are drawn.
    One trace is added per loyalty level, then the figure is rendered inline.
    """
    graph = RFMScores.query("Monetary < 50000 and Frequency < 2000")

    # (loyalty level, marker colour), listed in the order the traces are drawn
    level_colors = [
        ('Bronze', 'blue'),
        ('Silver', 'green'),
        ('Gold', 'red'),
        ('Platinum', 'black'),
    ]

    plot_data = []
    for level, color in level_colors:
        members = graph.query("RFM_Loyalty_Level == '{}'".format(level))
        plot_data.append(
            gobj.Scatter(
                x=members[field1],
                y=members[field2],
                mode='markers',
                name=level,
                marker=dict(
                    size=11,
                    line=dict(width=1),
                    color=color,
                    opacity=0.8,
                ),
            )
        )

    plot_layout = gobj.Layout(
        xaxis={'title': field1},
        yaxis={'title': field2},
        title='{} Vs {} : Segments'.format(field1, field2),
    )

    fig = gobj.Figure(data=plot_data, layout=plot_layout)
    print("\n")
    po.iplot(fig)

    
#Recency Vs Frequency
Comp_plot('Recency','Frequency')    
    
#Frequency Vs Monetary
Comp_plot('Frequency','Monetary')

#Recency Vs Monetary
Comp_plot('Recency','Monetary')

In [35]:
RFMScores.tail(25)

In [36]:
#For ploting purpose we are assigning different colors to different loyalty levels.
color_dic = {
    "Platinum": "green",
    "Gold": "red",
    "Silver": "blue",
    "Bronze": "black",
}

def Loyalty_fun(x):
    """Return the plot colour for an RFMScore.

    Scores 3-5 map to the Platinum colour, 6-8 to Gold, 9-10 to Silver and
    11-12 to Bronze. Any other value yields None.
    """
    if 3 <= x <= 5:
        return color_dic["Platinum"]
    if 6 <= x <= 8:
        return color_dic["Gold"]
    if 9 <= x <= 10:
        return color_dic["Silver"]
    if 11 <= x <= 12:
        return color_dic["Bronze"]
    return None
    
# Colour of every customer, derived from its RFMScore
RFMScores['Color'] = RFMScores['RFMScore'].map(Loyalty_fun)

# Creating the 3D scatter plot of the three RFM metrics
fig = plt.figure(figsize = (20, 15))
ax = plt.axes(projection ="3d")
ax.scatter3D(RFMScores['Recency'],RFMScores['Frequency'],RFMScores['Monetary'], c=RFMScores['Color'])
ax.set_xlabel('$Recency$', fontsize=20)
ax.set_ylabel('$Frequency$', fontsize=20)
ax.set_zlabel('$Monetary$', fontsize=20)
# Fixed axis ranges; points outside them are not shown
ax.set_xlim3d(0,200)
ax.set_ylim3d(0,1000)
ax.set_zlim3d(0,20000)

# The title is set once: a second plt.title call on the same axes would simply replace it
plt.title(" \"Frequency Vs Recency Vs Monetary\" with Loyalty Levels")
plt.show()

# **K-Means Clustering Model**

#### Feature Scaling Steps:

In [37]:
#Handle negative and zero values so as to handle infinite numbers during log transformation
def handle_neg_n_zero(num):
    """Return 1 for zero or negative input, otherwise the input unchanged.

    Used before the log transformation so that no value is <= 0.
    """
    return 1 if num <= 0 else num
    
#Applying function to Recency and Monetary columns 
RFMScores['Recency'] = [handle_neg_n_zero(x) for x in RFMScores.Recency]
RFMScores['Monetary'] = [handle_neg_n_zero(x) for x in RFMScores.Monetary]
RFMScores

> **Data Log Transformation**

In [38]:
#Perform Log transformation to bring data into normal or near normal distribution.
#np.log is applied to the whole frame at once (vectorised) instead of row by row
#through apply(axis=1); the resulting values are the same.
Log_Tfd_Data = np.log(RFMScores[['Recency', 'Frequency', 'Monetary']]).round(3)
Log_Tfd_Data

In [39]:
Recency_Plot_data = Log_Tfd_Data['Recency']
sns.histplot(Recency_Plot_data, label="100% Equities", kde=True, linewidth=0)
plt.title("Recency Distribution after Data normalization")

In [40]:
#Log_Tfd_Data holds natural-log values, so the cut-off of 1000 used for the raw
#Frequency histogram must be expressed on the log scale. Compared against a raw
#1000 the filter would never exclude anything.
Frequency_Plot_data = Log_Tfd_Data.loc[Log_Tfd_Data['Frequency'] < np.log(1000), 'Frequency']
sns.histplot(Frequency_Plot_data, label="Frequency", kde=True, linewidth=0)
plt.title("Frequency Distribution after Data normalization")

In [41]:
#Log_Tfd_Data holds natural-log values, so the cut-off of 10000 used for the raw
#Monetary histogram must be expressed on the log scale. Compared against a raw
#10000 the filter would never exclude anything.
Monetary_Plot_data = Log_Tfd_Data.loc[Log_Tfd_Data['Monetary'] < np.log(10000), 'Monetary']
sns.histplot(Monetary_Plot_data, label="Monetary", kde=True, linewidth=0)
plt.title("Monetary Distribution after Data normalization")

> **Data Standardization**

In [42]:
from sklearn.preprocessing import StandardScaler

#Bring the data on same scale
scaleobj = StandardScaler()
Scaled_Data = scaleobj.fit_transform(Log_Tfd_Data)

#Transform it back to dataframe
Scaled_Data = pd.DataFrame(Scaled_Data, index = RFMScores.index, columns = Log_Tfd_Data.columns)
Scaled_Data.describe()

In [43]:
Scaled_Data

Finding Optimal number of clusters (**K's value**) using **Elbow Method**

In [44]:
from sklearn.cluster import KMeans

#Fit K-Means for k = 1..14 and record the inertia_ of each fitted model.
#random_state is fixed (same seed as the final model) so the elbow curve is
#identical on every run instead of changing with the random initialisation.
sum_of_sq_dist = {}
for k in range(1,15):
    km = KMeans(n_clusters= k, init= 'k-means++', max_iter= 1000, random_state=25)
    km = km.fit(Scaled_Data)
    sum_of_sq_dist[k] = km.inertia_
    
#Plot the graph for the sum of square distance values and Number of Clusters
sns.pointplot(x = list(sum_of_sq_dist.keys()), y = list(sum_of_sq_dist.values()))
plt.xlabel('Number of Clusters(k)')
plt.ylabel('Sum of Square Distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [45]:
#Building the K-Means clustering model
KMean_clust = KMeans(n_clusters= 3, init= 'k-means++', max_iter= 1000, random_state=25)
KMean_clust.fit(Scaled_Data)

#Clusters for the observation given in the dataset
RFMScores['Cluster'] = KMean_clust.labels_
RFMScores

In [46]:
#Clusters scatter plot
fig = plt.figure(figsize = (20, 15))
ax = plt.axes(projection ="3d")

# Assigning a colour to every customer according to its cluster label (0, 1 or 2).
# This overwrites the 'Color' column of RFMScores.
Colors = ["red", "green", "blue"]
RFMScores['Color'] = RFMScores['Cluster'].map(lambda p: Colors[p])

# Creating plot
ax.scatter3D(RFMScores['Recency'],RFMScores['Frequency'],RFMScores['Monetary'], c=RFMScores['Color'])
ax.set_xlabel('$Recency$', fontsize=20)
ax.set_ylabel('$Frequency$', fontsize=20)
ax.set_zlabel('$Monetary$', fontsize=20)
# Fixed axis ranges; points outside them are not shown
ax.set_xlim3d(0,200)
ax.set_ylim3d(0,1000)
ax.set_zlim3d(0,20000)

# show plot (the title is set once: a second plt.title call would simply replace it)
plt.title(" \"Frequency Vs Recency Vs Monetary\" with Clusters")
plt.show()

> **Distribution of Recency, Frequency and Monetary in each cluster:**

In [47]:
def Box_plotting(field_to_plot, cutoff_quantile=90):
    """Draw one plotly box plot per K-Means cluster for a column of RFMScores.

    Parameters
    ----------
    field_to_plot : str
        Column of the global RFMScores frame to plot (e.g. "Recency").
    cutoff_quantile : float, default 90
        Percentile (0-100), computed separately for every cluster; values at
        or above it are left out of that cluster's box.

    The boxes are named after the actual cluster labels stored in
    RFMScores['Cluster'] (0-based), matching the labels shown elsewhere in
    the notebook.
    """
    colors = ['rgba(93, 164, 214, 0.5)', 'rgba(255, 144, 14, 0.5)', 'rgba(44, 160, 101, 0.5)', 'rgba(255, 65, 54, 0.5)', 'rgba(207, 114, 255, 0.5)', 'rgba(127, 96, 0, 0.5)']
    traces = []

    # Iterate over the labels that are really present, so no cluster is ever empty
    # (np.percentile fails on an empty array) and the cluster count is not hard-coded.
    cluster_ids = sorted(RFMScores['Cluster'].unique())
    for position, cluster_id in enumerate(cluster_ids):
        values = RFMScores.loc[RFMScores['Cluster'] == cluster_id, field_to_plot].values
        # Trim the upper tail so extreme values do not dominate the box
        values = values[values < np.percentile(values, cutoff_quantile)]
        traces.append(gobj.Box(
            y=values,
            name='Cluster {}'.format(cluster_id),
            boxpoints=False,
            jitter=0.5,
            whiskerwidth=0.2,
            # Reuse the palette cyclically if there are more clusters than colours
            fillcolor=colors[position % len(colors)],
            marker=dict(
                size=2,
            ),
            line=dict(width=1),
        ))

    layout = gobj.Layout(
        title='Difference in Sales \"{}\" from cluster to cluster'.format(field_to_plot),
        # No fixed dtick: the three metrics live on very different scales, so the
        # tick spacing is left to plotly's automatic choice.
        yaxis=dict(
            autorange=True,
            showgrid=True,
            zeroline=True,
            gridcolor='black',
            gridwidth=0.1,
            zerolinecolor='rgb(255, 255, 255)',
            zerolinewidth=2,
        ),
        margin=dict(
            l=40,
            r=30,
            b=80,
            t=100,
        ),
        plot_bgcolor='white',
        showlegend=True
    )
    fig = gobj.Figure(data=traces, layout=layout)
    # Same entry point as the other plotly figures in this notebook
    po.iplot(fig)
    
Box_plotting("Recency")
print("\n")
Box_plotting("Frequency")
print("\n")
Box_plotting("Monetary")

> #### Now we will see the distribution of the different types of customers in each cluster.

In [48]:
# Per-cluster subsets of RFMScores and the loyalty-level counts inside each of them
clust1, clust2, clust3 = (RFMScores[(RFMScores['Cluster']==label)] for label in (0, 1, 2))
c1, c2, c3 = (subset['RFM_Loyalty_Level'].value_counts() for subset in (clust1, clust2, clust3))

def Pie_num(s, x):
    """Return the entry of ``s`` stored under label ``x``."""
    return s[x]

Customer_labels = ['Platinum', 'Gold', 'Silver', 'Bronze']

# One pie chart per cluster; each chart is preceded by a blank line in the output
pie_sizes = []
for cluster_label, level_counts in enumerate((c1, c2, c3)):
    print("\n")
    wedge_sizes = [Pie_num(level_counts, level) for level in Customer_labels]
    pie_sizes.append(wedge_sizes)
    fig, ax = plt.subplots()
    ax.pie(wedge_sizes, labels=Customer_labels, autopct='%1.2f%%')
    ax.axis('equal')  # Equal aspect ratio ensures the pie chart is circular.
    ax.set_title('Customers distribution in Cluster {}'.format(cluster_label))
    plt.show()

# Wedge sizes of the three charts, kept under their individual names
sizes1, sizes2, sizes3 = pie_sizes

### **Testing Phase: RFM Classifier**

In [49]:
def pred_cust(data):
    """Return the loyalty category of the customer in the first row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have the columns 'Recency', 'Frequency' and 'Monetary'; only the
        first row is evaluated.

    Returns
    -------
    str or None
        "Platinum", "Gold", "Silver" or "Bronze" according to the sum of the
        three segment scores, or None if the sum lies outside 3-12.

    The segment scores come from RScoring / FnMScoring with the global
    ``quantiles`` cut-offs.
    """
    # Positional access: works for any index, whereas a label lookup with [0]
    # raises KeyError when the frame's index does not contain 0.
    R = RScoring(data['Recency'].iloc[0], 'Recency', quantiles)
    F = FnMScoring(data['Frequency'].iloc[0], 'Frequency', quantiles)
    M = FnMScoring(data['Monetary'].iloc[0], 'Monetary', quantiles)
    x = R + F + M
    # Score bands chosen by observing the data from the RFM model
    if 3 <= x <= 5:
        return "Platinum"
    elif 6 <= x <= 8:
        return "Gold"
    elif 9 <= x <= 10:
        return "Silver"
    elif 11 <= x <= 12:
        return "Bronze"
    return None
    
def main():
    """Ask for one customer's raw RFM values on stdin and print the loyalty category.

    Recency and Frequency are read as integers; Monetary is read as a float so
    that decimal amounts are accepted. The values are the raw metrics, not the
    1-4 segment scores: pred_cust converts them using the quantile cut-offs.

    Raises ValueError if an entry cannot be parsed as a number.
    """
    print("\nE-MART CRM Portal \n To check the Customer's Membership Category please enter the below details:\n")
    print("Please enter Customer's Recency (days since last purchase):")
    recency = int(input())
    print("Please enter Customer's Frequency (number of transactions):")
    frequency = int(input())
    print("Please enter Customer's Monetary value (total amount spent):")
    monetary = float(input())
    # Single-row frame in the shape pred_cust expects
    data = pd.DataFrame({'Recency': recency, 'Frequency': frequency, 'Monetary': monetary}, index=[0])
    print("************** Processing Data ****************\n")
    print("This customer is {} category customer.".format(pred_cust(data)) )
main()    