In [None]:
# Lets import the library and read the dataset
import numpy as np
import datetime as dt
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
!pip install openpyxl

        
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

df_ = pd.read_excel("../input/online-retail-ii-data-set-from-ml-repository/online_retail_II.xlsx", sheet_name = "Year 2010-2011" )
df = df_.copy()
df.head()

**Data Understanding and Preprocessing**

In [None]:
# (number of rows, number of columns) of the freshly loaded data.
df.shape

In [None]:
# Are there any missing values?
# Count them per column: any non-zero count means that column has gaps.
# (Only the last expression of a cell is displayed, so a single call suffices.)
df.isnull().sum()

In [None]:
# Remove every row that has at least one missing value (nothing is imputed).
# Rebinding `df` instead of using `inplace=True` leaves the step chainable and
# makes it explicit that `df` now refers to the cleaned frame.
df = df.dropna()

In [None]:
# Verify the cleanup: count the missing values per column once more.
df.isna().sum()

*How many unique products are there?*

In [None]:
# Number of distinct StockCode values in the data.
df["StockCode"].nunique()

*How many of each product are there?*

In [None]:
# Number of rows (invoice lines) per StockCode; only the first five are shown.
df["StockCode"].value_counts().head()


*Sort the 5 most ordered products from most to least.*

In [None]:
# "Most ordered" means the largest total Quantity, not the number of invoice
# lines a product appears on, so sum Quantity per product and rank by that sum.
# NOTE(review): canceled invoices are still present in `df` at this point --
# confirm whether their quantities should be included in this ranking.
(
    df.groupby("StockCode")["Quantity"]
    .sum()
    .sort_values(ascending=False)
    .head()
)

*The 'C' in the invoices shows the canceled transactions. Since we will not use the canceled transactions, we should remove them.*

In [None]:
# Keep only the rows whose invoice number does not contain "C" (the marker for
# canceled transactions). `na=False` makes rows without a match result count
# as "not canceled" instead of producing NaN in the mask.
is_canceled = df["Invoice"].str.contains("C", na=False)

# `.copy()` turns the filtered result into an independent frame, so adding
# columns to `df` afterwards does not raise pandas' SettingWithCopyWarning.
df = df[~is_canceled].copy()

*Create a variable named 'TotalPrice' that represents the total earnings of each invoice line (Quantity × Price).*

In [None]:
# Revenue of each row: quantity multiplied by unit price.
df["TotalPrice"] = df["Quantity"].mul(df["Price"])

In [None]:
# Preview the first rows, now including the new TotalPrice column.
df.head()

**Calculation of RFM metrics**

**Recency :** the number of days between today_date and the last purchase date of this customer

**Frequency :** the number of purchases (distinct invoices) made by this customer

**Monetary :** sum of TotalPrice of this customer

In [None]:
# Latest InvoiceDate in the data, i.e. the date of the most recent purchase.
# It is the reference point for choosing the "today" date used for recency.
df["InvoiceDate"].max()

In [None]:
# Reference "today" for the recency calculation. It has to fall after the most
# recent InvoiceDate so that none of the recency values become zero.
# NOTE(review): the date is hard-coded for this dataset -- confirm it is still
# later than df["InvoiceDate"].max() if the input data changes.
# `dt` is the datetime module already imported in the notebook's first cell.
today_date = dt.datetime(2011, 12, 11)

In [None]:
# Build one row per customer with the three raw RFM inputs:
#   InvoiceDate -> whole days between `today_date` and the customer's last purchase
#   Invoice     -> number of distinct invoices
#   TotalPrice  -> total amount spent
# The built-in "nunique" and "sum" aggregations replace equivalent lambdas; the
# output columns keep the names of the source columns.
rfm = df.groupby("Customer ID").agg({
    "InvoiceDate": lambda dates: (today_date - dates.max()).days,
    "Invoice": "nunique",
    "TotalPrice": "sum",
})


In [None]:
# Preview the per-customer aggregation.
rfm.head()

*`InvoiceDate` represents 'recency', `Invoice` represents 'frequency', and `TotalPrice` represents 'monetary'.*

**Let's change the column names**

In [None]:
# Rename by explicit mapping rather than by position: the result does not depend
# on the order of the aggregated columns, and re-running the cell after more
# columns have been added to `rfm` is harmless instead of raising an error.
rfm = rfm.rename(columns={
    "InvoiceDate": "recency",
    "Invoice": "frequency",
    "TotalPrice": "monetary",
})

In [None]:
# Keep only customers whose total spend is strictly positive.
# `.copy()` makes the result an independent frame, so the score columns added
# to `rfm` later do not raise pandas' SettingWithCopyWarning.
rfm = rfm[rfm["monetary"] > 0].copy()
rfm.head()

We need to score these values between 1 and 5. After scoring, we will segment the customers.

In [None]:
# Score every metric by quintile (five equal-sized groups).
ascending_labels = [1, 2, 3, 4, 5]
descending_labels = [5, 4, 3, 2, 1]

# Recency: the smallest values (most recent buyers) receive the highest score,
# hence the reversed label order.
rfm["recency_score"] = pd.qcut(rfm["recency"], 5, labels=descending_labels)

# Frequency: the quintiles are cut on the rank of each value (ties ranked in
# order of appearance) rather than on the raw values.
frequency_rank = rfm["frequency"].rank(method="first")
rfm["frequency_score"] = pd.qcut(frequency_rank, 5, labels=ascending_labels)

# Monetary: higher spend receives the higher score.
rfm["monetary_score"] = pd.qcut(rfm["monetary"], 5, labels=ascending_labels)

# The combined score is the recency digit followed by the frequency digit;
# the monetary score is not part of it.
recency_digit = rfm["recency_score"].astype(str)
frequency_digit = rfm["frequency_score"].astype(str)
rfm["RFM_SCORE"] = recency_digit + frequency_digit

In [None]:
# Preview the frame with the score columns and the combined RFM_SCORE string.
rfm.head()

In [None]:
# Regex patterns over the two-character RFM_SCORE string (first character is the
# recency score, second the frequency score) mapped to segment names.
seg_map = {
    r'[1-2][1-2]': 'hibernating',
    r'[1-2][3-4]': 'at_Risk',
    r'[1-2]5': 'cant_loose',
    r'3[1-2]': 'about_to_sleep',
    r'33': 'need_attention',
    r'[3-4][4-5]': 'loyal_customers',
    r'41': 'promising',
    r'51': 'new_customers',
    r'[4-5][2-3]': 'potential_loyalists',
    r'5[4-5]': 'champions'
}

# Translate every score string into its segment name.
rfm['segment'] = rfm['RFM_SCORE'].replace(seg_map, regex=True)

# Mean and count of each metric per segment. Only the last expression of a cell
# is rendered automatically, so this table is displayed explicitly; otherwise
# it would be computed and silently thrown away.
segment_summary = (
    rfm[["segment", "recency", "frequency", "monetary"]]
    .groupby("segment")
    .agg(["mean", "count"])
)
display(segment_summary)

rfm.head()

Now we analyze 3 segments: champions, can't lose, and need attention.

In [None]:
# Slice out the rows of two segments for a closer look.
champions = rfm.loc[rfm["segment"].eq("champions")]
cant_loose = rfm.loc[rfm["segment"].eq("cant_loose")]

In [None]:
# Mean and count of recency, frequency and monetary for the champions segment.
champions[['recency','frequency','monetary']].agg(['mean', 'count'])

The champions segment contains 633 customers. On average, their last purchase was about 6 days ago, they placed about 12 orders, and they spent 6857.96 in total.
These are our most valuable and most loyal customers. We can reach out to them with personal calls, offer gift vouchers, or run campaigns such as buy 1 get 1 free.

In [None]:
# Mean and count of recency, frequency and monetary for the cant_loose segment.
cant_loose[['recency','frequency','monetary']].agg(['mean', 'count'])

The can't-lose segment contains 63 customers. On average, their last purchase was about 132 days ago, they placed about 2 orders, and they spent 897.63 in total. They spent a good amount of money and used to be our loyal customers, so we can't afford to lose them. We can show "we miss you" pop-ups, send regular reminder notifications, and offer special campaigns based on their previous purchases.

In [None]:
# Collect the IDs (the index of `rfm`) of every customer in the
# "loyal_customers" segment into a one-column frame.
loyal_ids = rfm[rfm["segment"] == "loyal_customers"].index
loyal_df = pd.DataFrame({"loyal_customer_id": loyal_ids})

# Write the list to an Excel file in the current working directory.
loyal_df.to_excel("loyal_customers.xlsx", sheet_name='Loyal Customers Index')

# Preview last: only the final expression of a cell is displayed, so placing
# `head()` before the export would show nothing.
loyal_df.head()