In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Relative path of the Online Retail transactions CSV.
DATA_PATH = '../input/onlineretail/OnlineRetail.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

### Drop the Invoice No. column as it is not needed

In [None]:
# Remove the invoice identifier column; rebinding `df` instead of mutating in place.
df = df.drop(columns=['InvoiceNo'])

### Find basic information using info() and describe() to understand the dataset better

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

### count of null values in each column

In [None]:
# Null count per column, restricted to the columns that contain at least one null.
null_counts = df.isnull().sum()
null_counts[null_counts > 0]

## Handling Null Values



### Since the rows that are null for Description are also null for CustomerID and UnitPrice, and Description has fewer missing values, I will drop the rows with missing values in the Description feature

In [None]:
# how='all': a row is dropped only when BOTH Description and CustomerID are missing.
df = df.dropna(subset=['Description', 'CustomerID'], how='all')

### Removing the special characters from the Description column to make it a little more consistent

In [None]:
# Strip every run of non-word characters (punctuation and whitespace) from the
# descriptions.
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the pattern
# as a literal string by default, which would turn this line into a no-op.
# The raw string avoids the invalid "\W" escape sequence of a normal string literal.
df['Description'] = df['Description'].str.replace(r'\W+', '', regex=True)

In [None]:
# Normalise the descriptions to lower case.
df = df.assign(Description=df['Description'].str.lower())

### considering the negative quantities as 0 for rfm analysis

In [None]:
# Treat negative quantities as 0 for the RFM analysis.
# `clip(lower=0)` is the vectorized equivalent of mapping a Python lambda over
# every row: negative values become 0, everything else is left untouched.
df['Quantity'] = df['Quantity'].clip(lower=0)

### Creating a new column called Price, which will be Quantity x UnitPrice

In [None]:
# Line total: quantity multiplied by the unit price.
df['Price'] = df['Quantity'].mul(df['UnitPrice'])

In [None]:
# Check the cleaned columns and the new Price column.
df.head()

## Let's derive a separate column for Date from Invoice Date

In [None]:
# Parse the invoice timestamps, then keep only the calendar date part.
invoice_timestamps = pd.to_datetime(df['InvoiceDate'])
df['date'] = invoice_timestamps.dt.date

In [None]:
# Check the new date column.
df.head()

## In order to fix the date of analysis, we will look at the date range

In [None]:
# Earliest and latest invoice date in the data.
first_day = df['date'].min()
last_day = df['date'].max()
print(first_day, last_day)

#### let's consider the analysis date as 2012-01-01 and lets calculate the recency

In [None]:
# Reference date of the analysis; recency is measured backwards from it.
ANALYSIS_DATE = dt.date(2012, 1, 1)
df['no_of_days'] = ANALYSIS_DATE - df['date']

In [None]:
# Convert the time differences to a whole number of days.
# `astype('timedelta64[D]')` is rejected by pandas >= 2.0 (only s/ms/us/ns
# resolutions are supported), so go through `.dt.days` instead.
# `pd.to_timedelta` also handles an object column of `datetime.timedelta` values.
# NOTE: not idempotent - run this cell once per kernel, after the subtraction above.
df['no_of_days'] = pd.to_timedelta(df['no_of_days']).dt.days

In [None]:
# Check the new no_of_days column.
df.head()

In [None]:
# One row per customer:
#   recency   - smallest no_of_days (the most recent purchase)
#   frequency - number of rows recorded for the customer
#   monetary  - total of Price
rfm = (
    df.groupby('CustomerID')
      .agg({'no_of_days': 'min', 'CustomerID': 'size', 'Price': 'sum'})
      .rename(columns={'no_of_days': 'recency', 'CustomerID': 'frequency', 'Price': 'monetary'})
)

In [None]:
rfm.head() # TODO: remove the customer ID from the index

In [None]:
# Distribution of recency, frequency and monetary across customers.
rfm.describe()

### The scores are given as follows: for frequency and monetary 1 is the lowest and 4 the highest while for recency it is the opposite

### Recency

In [None]:
# Quartile-based recency score: the lowest recency quartile gets 4, the highest gets 1.
# The lower edge is -1 so that a recency of 0 still falls inside the first bin.
recency_edges = [-1, *np.percentile(rfm["recency"], [25, 50, 75]), rfm["recency"].max()]
rfm["recency_score"] = pd.cut(rfm["recency"], bins=recency_edges, labels=[4, 3, 2, 1]).astype("int")

### Frequency

In [None]:
# Quartile-based frequency score: the lowest frequency quartile gets 1, the highest gets 4.
frequency_edges = [-1, *np.percentile(rfm["frequency"], [25, 50, 75]), rfm["frequency"].max()]
rfm["frequency_score"] = pd.cut(rfm["frequency"], bins=frequency_edges, labels=[1, 2, 3, 4]).astype("int")

### Monetary

In [None]:
# Quartile-based monetary score: the lowest monetary quartile gets 1, the highest gets 4.
# The lower edge is -1 so that a monetary value of 0 still falls inside the first bin.
monetary_edges = [-1, *np.percentile(rfm["monetary"], [25, 50, 75]), rfm["monetary"].max()]
# Cast to int like recency_score and frequency_score, so all three scores share a
# dtype and can be combined arithmetically without a categorical column in the mix.
rfm["monetary_score"] = pd.cut(rfm["monetary"], bins=monetary_edges, labels=[1, 2, 3, 4]).astype("int")

In [None]:
# Check the three score columns.
rfm.head()

### Creating RFM class and Customer Segments

In [None]:
# Add the two result columns with placeholder values.
rfm = rfm.assign(RFMClass=0, Customer_Seg='')

In [None]:
# RFM class = three-digit code <recency><frequency><monetary>, e.g. 4 / 3 / 2 -> 432.
# Computed column-wise. The previous row loop assigned through
# `rfm['RFMClass'][j]`, which is chained assignment AND uses the position `j` as
# an index label (the index holds customer IDs), so the values never reached `rfm`.
# `.astype(int)` keeps this working while monetary_score is still categorical.
rfm['RFMClass'] = (
    rfm['recency_score'].astype(int) * 100
    + rfm['frequency_score'].astype(int) * 10
    + rfm['monetary_score'].astype(int)
)

In [None]:
from tabulate import tabulate

In [None]:
# Reference matrix of the segmentation: one row per (recency score R, monetary
# score M) pair, one column per frequency score F1..F4. A blank R cell means
# "same R as the row above".
# Fixes: the R=2 / M=2 row was listed twice, and one M cell was the int 3
# instead of the string '3' used everywhere else.
table = [
    ['R', 'M', 'F1', 'F2', 'F3', 'F4'],
    ['4', '4', 'Unsteady', 'Active', 'Top customer', 'Top customer'],
    [' ', '3', 'Unsteady', 'Active', 'Top customer', 'Top customer'],
    [' ', '2', 'Unsteady', 'Active', 'Active', 'Active'],
    [' ', '1', 'Unsteady', 'Active', 'Active', 'Active'],
    ['3', '4', 'Unsteady', 'Emerging', 'Emerging', 'Emerging'],
    [' ', '3', 'Unsteady', 'Emerging', 'Emerging', 'Emerging'],
    [' ', '2', 'Unsteady', 'Unsteady', 'Unsteady', 'Unsteady'],
    [' ', '1', 'Unsteady', 'Unsteady', 'Unsteady', 'Unsteady'],
    ['2', '4', 'At risk', 'At risk', 'At risk', 'At risk'],
    [' ', '3', 'At risk', 'At risk', 'At risk', 'At risk'],
    [' ', '2', 'Potentially lost', 'Potentially lost', 'At risk', 'At risk'],
    [' ', '1', 'Potentially lost', 'Potentially lost', 'At risk', 'At risk'],
    ['1', '4', 'Inactive', 'Inactive', 'At risk', 'At risk'],
    [' ', '3', 'Inactive', 'Inactive', 'At risk', 'At risk'],
    [' ', '2', 'Lost', 'Lost', 'At risk', 'At risk'],
    [' ', '1', 'Lost', 'Lost', 'At risk', 'At risk'],
]
print('Customer Segmentation: ')
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [None]:
# Assign a segment to every customer from the three scores.
# Plain integer arrays are used so the comparisons also work while
# monetary_score is still the categorical produced by pd.cut.
r = rfm['recency_score'].astype(int).to_numpy()
f = rfm['frequency_score'].astype(int).to_numpy()
m = rfm['monetary_score'].astype(int).to_numpy()

# Ordered rules: np.select keeps the FIRST matching rule, like an if/elif chain.
# The "frequency == 1 -> Unsteady" rule now comes before the catch-all
# "recency == 4 -> Active" rule; in the old order it was unreachable for
# recency 4, so R=4/F=1 customers were labelled Active although the printed
# segmentation matrix lists them as Unsteady.
# Comparisons that are always true for scores in 1..4 (e.g. `>= 1`) are omitted.
segment_rules = [
    ((r == 4) & (f >= 3) & (m >= 3), 'Top Customers'),
    ((r >= 3) & (f == 1), 'Unsteady Customers'),
    ((r == 4), 'Active Customers'),
    ((r == 3) & (f >= 2) & (m >= 3), 'Emerging Customers'),
    ((r == 3) & (f >= 2) & (m <= 2), 'Unsteady Customers'),
    ((r == 2) & (m >= 3), 'At Risk'),
    ((r == 2) & (f >= 3) & (m <= 2), 'At Risk'),
    ((r == 1) & (f >= 3), 'At Risk'),
    ((r == 2) & (f <= 2) & (m <= 2), 'Potentially lost'),
    ((r == 1) & (f <= 2) & (m >= 3), 'Inactive Customers'),
    ((r == 1) & (f <= 2) & (m <= 2), 'Lost'),
]

# One vectorized assignment replaces the row loop, which wrote through the
# chained `rfm['Customer_Seg'].iloc[k] = ...` pattern (SettingWithCopyWarning;
# silently ineffective under pandas copy-on-write).
rfm['Customer_Seg'] = np.select(
    [condition for condition, _ in segment_rules],
    [label for _, label in segment_rules],
    default='Uncategorized',
)
       

In [None]:
# Check the RFMClass and Customer_Seg columns.
rfm.head()

In [None]:
# Number of customers in each segment.
data = rfm["RFMClass"].groupby(rfm["Customer_Seg"]).count()
data

In [None]:
# Share of customers per segment, drawn on an explicitly created axes.
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
fig, ax = plt.subplots(figsize=(10, 10))
data.plot.pie(autopct="%.1f%%", ax=ax)

### Observation:
A significant number of customers have been lost and are at risk

In [None]:
# Number of rows (transactions) recorded per country.
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
fig, ax = plt.subplots(figsize=(15, 5))
# `.size()` counts rows per country; it does not sum Price.
a = df.groupby("Country")["Price"].size().plot(kind="bar", ax=ax)
# The y axis was labelled "Sale" although the bars show transaction counts.
a.set(xlabel="Country", ylabel="Number of transactions")

### observation:
Maximum number of transactions have taken place in UK

In [None]:
# Store the monetary score as a 32-bit integer.
rfm = rfm.astype({'monetary_score': 'int32'})

## Since the number of transactions is highest in the UK, we can analyze the UK data in more detail

In [None]:
# Keep only the rows whose Country is the United Kingdom.
is_uk = df['Country'].eq('United Kingdom')
uk = df.loc[is_uk]

In [None]:
# One row per UK customer:
#   recency   - smallest no_of_days (the most recent purchase)
#   frequency - number of rows recorded for the customer
#   monetary  - total of Price
rfm_uk = (
    uk.groupby('CustomerID')
      .agg({'no_of_days': 'min', 'CustomerID': 'size', 'Price': 'sum'})
      .rename(columns={'no_of_days': 'recency', 'CustomerID': 'frequency', 'Price': 'monetary'})
)

In [None]:
# Preview the UK recency / frequency / monetary table.
rfm_uk.head()

In [None]:
# Store the monetary total as an integer (the fractional part is discarded).
rfm_uk = rfm_uk.astype({'monetary': 'int'})

In [None]:
# Distribution of recency, frequency and monetary across UK customers.
rfm_uk.describe()

In [None]:
# Quartile-based recency score: the lowest recency quartile gets 4, the highest gets 1.
uk_recency_edges = [-1, *np.percentile(rfm_uk["recency"], [25, 50, 75]), rfm_uk["recency"].max()]
rfm_uk["recency_score"] = pd.cut(rfm_uk["recency"], bins=uk_recency_edges, labels=[4, 3, 2, 1]).astype("int")

In [None]:
# Quartile-based frequency score: the lowest frequency quartile gets 1, the highest gets 4.
uk_frequency_edges = [-1, *np.percentile(rfm_uk["frequency"], [25, 50, 75]), rfm_uk["frequency"].max()]
rfm_uk["frequency_score"] = pd.cut(rfm_uk["frequency"], bins=uk_frequency_edges, labels=[1, 2, 3, 4]).astype("int")

In [None]:
# Quartile-based monetary score: the lowest monetary quartile gets 1, the highest gets 4.
uk_monetary_edges = [-1, *np.percentile(rfm_uk["monetary"], [25, 50, 75]), rfm_uk["monetary"].max()]
# Cast to int like recency_score and frequency_score, so all three scores share a
# dtype and can be combined arithmetically without a categorical column in the mix.
rfm_uk["monetary_score"] = pd.cut(rfm_uk["monetary"], bins=uk_monetary_edges, labels=[1, 2, 3, 4]).astype("int")

In [None]:
# Display the UK table with its three score columns.
rfm_uk

In [None]:
# Add the two result columns with placeholder values.
rfm_uk = rfm_uk.assign(RFMClass=0, Customer_Seg='')

In [None]:
# RFM class = three-digit code <recency><frequency><monetary>, e.g. 4 / 3 / 2 -> 432.
# Computed column-wise. The previous row loop assigned through
# `rfm_uk['RFMClass'][j]`, which is chained assignment AND uses the position `j`
# as an index label (the index holds customer IDs), so the values never reached
# `rfm_uk`.
# `.astype(int)` keeps this working while monetary_score is still categorical.
rfm_uk['RFMClass'] = (
    rfm_uk['recency_score'].astype(int) * 100
    + rfm_uk['frequency_score'].astype(int) * 10
    + rfm_uk['monetary_score'].astype(int)
)

In [None]:
# Assign a segment to every UK customer from the three scores.
# Plain integer arrays are used so the comparisons also work while
# monetary_score is still the categorical produced by pd.cut.
r_uk = rfm_uk['recency_score'].astype(int).to_numpy()
f_uk = rfm_uk['frequency_score'].astype(int).to_numpy()
m_uk = rfm_uk['monetary_score'].astype(int).to_numpy()

# Ordered rules: np.select keeps the FIRST matching rule, like an if/elif chain.
# The "frequency == 1 -> Unsteady" rule comes before the catch-all
# "recency == 4 -> Active" rule; in the old order it was unreachable for
# recency 4, so R=4/F=1 customers were labelled Active instead of Unsteady.
# Comparisons that are always true for scores in 1..4 (e.g. `>= 1`) are omitted.
uk_segment_rules = [
    ((r_uk == 4) & (f_uk >= 3) & (m_uk >= 3), 'Top Customers'),
    ((r_uk >= 3) & (f_uk == 1), 'Unsteady Customers'),
    ((r_uk == 4), 'Active Customers'),
    ((r_uk == 3) & (f_uk >= 2) & (m_uk >= 3), 'Emerging Customers'),
    ((r_uk == 3) & (f_uk >= 2) & (m_uk <= 2), 'Unsteady Customers'),
    ((r_uk == 2) & (m_uk >= 3), 'At Risk'),
    ((r_uk == 2) & (f_uk >= 3) & (m_uk <= 2), 'At Risk'),
    ((r_uk == 1) & (f_uk >= 3), 'At Risk'),
    ((r_uk == 2) & (f_uk <= 2) & (m_uk <= 2), 'Potentially lost'),
    ((r_uk == 1) & (f_uk <= 2) & (m_uk >= 3), 'Inactive Customers'),
    ((r_uk == 1) & (f_uk <= 2) & (m_uk <= 2), 'Lost'),
]

# One vectorized assignment replaces the row loop, which wrote through the
# chained `rfm_uk['Customer_Seg'].iloc[k] = ...` pattern (SettingWithCopyWarning;
# silently ineffective under pandas copy-on-write).
rfm_uk['Customer_Seg'] = np.select(
    [condition for condition, _ in uk_segment_rules],
    [label for _, label in uk_segment_rules],
    default='Uncategorized',
)
       

In [None]:
# Store the monetary score as a 32-bit integer.
rfm_uk = rfm_uk.astype({'monetary_score': 'int32'})

In [None]:
# Column dtypes and non-null counts of the UK table.
rfm_uk.info()

In [None]:
# Number of UK customers in each segment.
data1 = rfm_uk["RFMClass"].groupby(rfm_uk["Customer_Seg"]).count()
data1

In [None]:
# Share of UK customers per segment, drawn on an explicitly created axes.
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
fig, ax = plt.subplots(figsize=(10, 10))
data1.plot.pie(autopct="%.1f%%", ax=ax)

In [None]:
# Monetary value per customer segment for the UK customers.
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
fig, ax = plt.subplots(figsize=(18, 5))
sns.barplot(data=rfm_uk, x='Customer_Seg', y='monetary', ax=ax)