## With RFM analysis, you can compare your company's customer segmentation across two consecutive years, see how customers move between segments, identify your strengths and weaknesses, and take action. It also makes it easy to plan different actions for different segments.



In [None]:
import pandas as pd 
import numpy as np 
import datetime as dt 
import plotly.express as px # Visualization Operations
import warnings # Warnings

warnings.filterwarnings("ignore") # We ignored the warnings.
pd.set_option('display.expand_frame_repr', False)


In [None]:
def load_retail_data(pathname, *, sep: str = ";") -> pd.DataFrame:
    """
    Read one Online Retail II CSV export into a DataFrame.

    Parameters
    ----------
    pathname : str, path-like or file-like
        Location of the CSV file (anything ``pd.read_csv`` accepts).
    sep : str, default ";"
        Field delimiter. The default matches the semicolon-separated
        exports this notebook loads; pass another value for files that
        use a different delimiter.

    Returns
    -------
    pd.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(pathname, sep=sep)

In [None]:
def data_prep_for_rfm(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Apply the Online Retail specific pre-processing needed before RFM.

    The input frame is left untouched; all work happens on a copy.

    Steps:
        1. Drop every row that contains a missing value.
        2. Drop rows whose ``Invoice`` value contains the letter "C"
           (presumably cancelled invoices in the Online Retail II data).
        3. Convert ``Customer ID`` to an integer-formatted string.
        4. Parse ``InvoiceDate`` as datetime.
        5. Add ``TotalPrice`` = ``Quantity`` * ``Price``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Raw dataset with at least the columns ``Invoice``, ``Customer ID``,
        ``InvoiceDate``, ``Quantity`` and ``Price``.

    Returns
    -------
    pd.DataFrame
        A new, cleaned dataset prepared for RFM analysis.
    """
    # dropna() without inplace returns a new frame, so the caller's data is
    # not mutated as a side effect of calling this function.
    prepared = dataframe.dropna()

    # Build the mask from a string view so the filter also works when the
    # Invoice column was parsed as numeric (no "C"-prefixed codes present).
    has_c_marker = prepared["Invoice"].astype(str).str.contains("C")

    # Explicit copy: the column assignments below must not target a view.
    prepared = prepared.loc[~has_c_marker].copy()

    prepared["Customer ID"] = prepared["Customer ID"].astype(int).astype(str)
    prepared["InvoiceDate"] = pd.to_datetime(prepared["InvoiceDate"])
    prepared["TotalPrice"] = prepared["Quantity"] * prepared["Price"]
    return prepared

In [None]:

def create_rfm(dataframe: pd.DataFrame, dataframe_id: str, rfm_grid: dict, segment_list: bool = False):
    """
    Build the RFM (Recency / Frequency / Monetary) table with scores and segments.

    The work is done in 4 steps.
        Step 1: Creating RFM metrics
            The data is grouped by ``dataframe_id`` and aggregated with
            ``rfm_grid``. Rows whose Monetary value is not positive are dropped.
        Step 2: Generating RFM scores
            Each metric is split into 5 quantile bins (0-20, 21-40, 41-60,
            61-80, 81-100 percent). Recency labels run 5..1 (smaller recency
            gets the higher score); Frequency and Monetary labels run 1..5.
            Frequency is ranked with ``rank(method="first")`` before binning
            so that tied values still yield unique bin edges.
        Step 3: Creating the segments
            The recency and frequency scores are concatenated into
            ``RFM_SCORE`` and mapped to a segment name through regular
            expressions.
            Referenced source: https://guillaume-martin.github.io/rfm-segmentation-with-python.html
        Step 4: Returning the result.

    Parameters
    ----------
    dataframe : pd.DataFrame
        The data required to create the RFM table.
    dataframe_id : str
        The id column to group by, depending on the problem.
    rfm_grid : dict
        Aggregation dictionary passed to ``groupby(...).agg``. It must yield
        exactly three columns, in the order Recency, Frequency, Monetary,
        because the result columns are renamed positionally.
    segment_list : bool, default False
        When True, the segment names are returned alongside the table.

    Returns
    -------
    rfm_table : pd.DataFrame
        Generated RFM table.
    segments : list of str
        Segment names; only returned when ``segment_list`` is True.

    Raises
    ------
    ValueError
        Raised by pandas when the aggregation does not produce three columns,
        or when ``pd.qcut`` cannot form 5 unique bin edges for Recency or
        Monetary (e.g. too few distinct values).

    Examples
    --------
    >>> analyse_date = dt.datetime(2011, 12, 11)
    >>> agg_dict = {"InvoiceDate": lambda recency: (analyse_date - recency.max()).days,
    ...             "Invoice": "nunique",
    ...             "TotalPrice": "sum"}
    >>> rfm = create_rfm(df, "Customer ID", agg_dict)
    >>> rfm.head()
         Customer ID  Recency  Frequency  Monetary recency_score frequency_score monetary_score RFM_SCORE       segment
    0       12346      326          1  77183.60             1               1              5        11      hibernating
    1       12347       40          7   4310.00             4               5              5        45  loyal_customers
    2       12348       76          4   1797.24             3               4              4        34  loyal_customers
    3       12349       19          1   1757.55             4               1              4        41        promising
    4       12350      311          1    334.40             1               1              2        11      hibernating
    """
    # Step 1: Creating RFM Metrics
    df_rfm = dataframe.groupby(dataframe_id).agg(rfm_grid)
    # Positional rename: relies on rfm_grid being ordered R, F, M.
    df_rfm.columns = ["Recency", "Frequency", "Monetary"]
    # Chain reset_index() so the filtered result is a fresh frame rather than
    # a slice that is then modified in place.
    df_rfm = df_rfm[df_rfm["Monetary"] > 0].reset_index()

    # Step 2: Generating RFM Scores
    df_rfm["recency_score"] = pd.qcut(df_rfm["Recency"], 5, labels=[5, 4, 3, 2, 1])
    df_rfm["frequency_score"] = pd.qcut(
        df_rfm["Frequency"].rank(method="first"), 5, labels=[1, 2, 3, 4, 5]
    )
    df_rfm["monetary_score"] = pd.qcut(df_rfm["Monetary"], 5, labels=[1, 2, 3, 4, 5])

    # Step 3: Creating the segments
    # Only recency and frequency take part in the segment key.
    df_rfm["RFM_SCORE"] = (
        df_rfm["recency_score"].astype(str) + df_rfm["frequency_score"].astype(str)
    )
    seg_map = {
        r'[1-2][1-2]': 'hibernating',
        r'[1-2][3-4]': 'at_risk',
        r'[1-2]5': 'cant_loose',
        r'3[1-2]': 'about_to_sleep',
        r'33': 'need_attention',
        r'[3-4][4-5]': 'loyal_customers',
        r'41': 'promising',
        r'51': 'new_customers',
        r'[4-5][2-3]': 'potential_loyalists',
        r'5[4-5]': 'champions',
    }
    df_rfm["segment"] = df_rfm["RFM_SCORE"].replace(seg_map, regex=True)

    # Step 4: Return the RFM dataframe
    if segment_list:
        # Materialize the names so callers get the documented list, not a
        # live dict view.
        return df_rfm, list(seg_map.values())
    return df_rfm

In [None]:
# Read the raw CSV export of each yearly period.
online_retail_2009_2010_df = load_retail_data(
    pathname="../input/online-retail-ii-uci-two-peroid/online_retail_II_2009_2010.csv"
)
online_retail_2010_2011_df = load_retail_data(
    pathname="../input/online-retail-ii-uci-two-peroid/online_retail_II_2010_2011.csv"
)

In [None]:
# Replace each raw frame with its RFM-ready version.
online_retail_2009_2010_df = data_prep_for_rfm(
    dataframe=online_retail_2009_2010_df
)
online_retail_2010_2011_df = data_prep_for_rfm(
    dataframe=online_retail_2010_2011_df
)

In [None]:
# Analysis date for 2009-2010: two days after the most recent invoice
# (the original notebook notes that invoice as 2010-12-09).
last_invoice_2010 = online_retail_2009_2010_df['InvoiceDate'].max()
analyse_date_2010 = last_invoice_2010 + pd.DateOffset(days=2)


In [None]:
# Aggregations in Recency / Frequency / Monetary order, which is the order
# create_rfm uses when it renames the aggregated columns.
agg_dict_2010 = {
    # Recency: whole days between the analysis date and the latest invoice.
    "InvoiceDate": lambda invoice_dates: (analyse_date_2010 - invoice_dates.max()).days,
    # Frequency: number of distinct invoices.
    "Invoice": "nunique",
    # Monetary: total amount spent.
    "TotalPrice": "sum",
}
rfm_2009_2010 = create_rfm(
    dataframe=online_retail_2009_2010_df,
    dataframe_id="Customer ID",
    rfm_grid=agg_dict_2010,
)
# Tag every row with its period so the two tables can be stacked later.
rfm_2009_2010["period"] = "2009_2010"

In [None]:
# Analysis date for 2010-2011: two days after the most recent invoice
# (the original notebook notes that invoice as 2011-12-09).
last_invoice_2011 = online_retail_2010_2011_df['InvoiceDate'].max()
analyse_date_2011 = last_invoice_2011 + pd.DateOffset(days=2)


In [None]:
# Aggregations in Recency / Frequency / Monetary order, which is the order
# create_rfm uses when it renames the aggregated columns.
agg_dict_2011 = {
    # Recency: whole days between the analysis date and the latest invoice.
    "InvoiceDate": lambda invoice_dates: (analyse_date_2011 - invoice_dates.max()).days,
    # Frequency: number of distinct invoices.
    "Invoice": "nunique",
    # Monetary: total amount spent.
    "TotalPrice": "sum",
}
rfm_2010_2011 = create_rfm(
    dataframe=online_retail_2010_2011_df,
    dataframe_id="Customer ID",
    rfm_grid=agg_dict_2011,
)
# Tag every row with its period so the two tables can be stacked later.
rfm_2010_2011["period"] = "2010_2011"


In [None]:
# Stack the segment/period columns of both periods and draw them as a
# treemap nested by segment, then by period.
segment_period_cols = ("segment", "period")
temp_df = pd.concat(
    [rfm_table[list(segment_period_cols)] for rfm_table in (rfm_2009_2010, rfm_2010_2011)],
    ignore_index=True,
)
fig = px.treemap(
    temp_df,
    path=list(segment_period_cols),
    title="RFM Segments 2009-2010 vs. 2010-2011",
)
fig.show()


# let data be your best friend :D