In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

### RFM Analysis
----
**RFM (Recency, Frequency, Monetary)** analysis is a marketing technique used to determine quantitatively which customers are the best ones by examining *how recently a customer has purchased* (recency), *how often they purchase* (frequency), and *how much the customer spends* (monetary). Using RFM analysis, customers are assigned a ranking number of 1, 2, 3, or 4 (with 4 being the highest) for each RFM parameter. The three scores together are referred to as an RFM “cell”. The data is sorted to determine which customers were the best customers in the past, with a cell ranking of 444 being ideal.

First, import the dataset.

In [None]:
# Read the sample sales CSV (decoded with the 'unicode_escape' codec) and preview the first rows.
sales_csv = '/kaggle/input/sample-sales-data/sales_data_sample.csv'
df = pd.read_csv(sales_csv, encoding='unicode_escape')
display(df.head())

Let's keep only the columns we need. We need just four: **CUSTOMERNAME** to group customers, **ORDERDATE** to calculate recency, **ORDERNUMBER** to calculate frequency, and **SALES** to calculate monetary.

In [None]:
# Columns needed for RFM: customer key, order date (recency),
# order number (frequency) and sales amount (monetary).
cols = ['CUSTOMERNAME', 'ORDERDATE', 'ORDERNUMBER', 'SALES']

# .copy() makes the subset an independent frame, so columns assigned on it
# later are never written to a slice of the originally loaded data.
df = df[cols].copy()

# Rich display (rather than print) renders the frame as a table, like the preview of the raw data.
display(df.head(5))

### Recency:
----
To find recency, we take the date of each customer's most recent order and subtract it from the most recent order date in the whole dataset. The difference gives us the recency value in days; the more recently a customer has ordered, the smaller the value.

In [None]:
# Convert ORDERDATE to pandas datetimes so the dates can be compared and subtracted.
df['ORDERDATE'] = pd.to_datetime(df['ORDERDATE'])

# Reference point for recency: the latest order date anywhere in the data.
most_recent = df['ORDERDATE'].max()

# Latest order date of each customer (grouped by CUSTOMERNAME, ORDERDATE column only).
recent_order = df.groupby('CUSTOMERNAME')['ORDERDATE'].agg('max')

def subtract_date(date, reference=None):
    """Return the number of whole days from ``date`` up to a reference date.

    Parameters
    ----------
    date : datetime-like
        The date to measure from (here: a customer's latest order date).
    reference : datetime-like, optional
        The date to measure up to. When omitted, the notebook-level
        ``most_recent`` value is used, so ``Series.apply(subtract_date)``
        keeps working exactly as before.

    Returns
    -------
    int
        The ``.days`` component of ``reference - date``.
    """
    if reference is None:
        # Resolved at call time, so the function always follows the current most_recent.
        reference = most_recent
    return (reference - date).days

# Days between each customer's latest order and the latest order in the data set.
recency = recent_order.apply(subtract_date)

# Preview only the first rows instead of dumping the whole Series.
print('Recency days : ', recency.head())

### Frequency
----
Frequency measures how often a customer makes a purchase. Now let's find out how many orders each customer has placed.

In [None]:
# Step 1: one entry per (customer, order number) pair; the value is the number of rows in that order.
rows_per_order = df.groupby(['CUSTOMERNAME', 'ORDERNUMBER']).size()

# Step 2: count those pairs per customer, i.e. the number of distinct orders each customer placed.
frequency = rows_per_order.groupby(level='CUSTOMERNAME').size()
print(frequency.head())

> `size()` and `count()` are similar, but `count()` ignores missing/NaN/NULL values, whereas `size()` counts every row.

### Monetary
----
Monetary is how much each customer has spent in total.

In [None]:
# Total SALES per customer: group the SALES column by CUSTOMERNAME and add it up.
monetary = df['SALES'].groupby(df['CUSTOMERNAME']).sum()
print(monetary.head())

### Putting It All Together
----
Now we are going to put all three of these parameters together.

In [None]:
# Build the RFM table in one step. The three Series are aligned on their
# index (each one was produced by grouping on CUSTOMERNAME).
rfm = pd.DataFrame({
    'recency': recency,
    'frequency': frequency,
    'monetary': monetary,
})

# Rich display (rather than print) renders the frame as a table, like the later previews of rfm.
display(rfm.head())

Now we will convert these raw values into classes, based on which quartile each value falls into.

In [None]:
# Quartile cut-offs (25th, 50th and 75th percentiles) of the three raw metrics.
# The metric columns are selected explicitly so this cell still works when it
# is re-run after the string-valued *_class columns have been added to rfm
# (quantile() is only meaningful for numeric columns).
metric_cols = ['recency', 'frequency', 'monetary']
quantile_df = rfm[metric_cols].quantile([0.25, 0.50, 0.75])
display(quantile_df)

In [None]:
def quantile_classes(x, quantile_value, attribute):
    """Map a raw RFM value to its quartile class, returned as a one-character string.

    Parameters
    ----------
    x : number
        The raw value to classify.
    quantile_value : pandas.DataFrame
        Cut-off table indexed by 0.25, 0.50 and 0.75 with one column per
        attribute (as produced by ``DataFrame.quantile([0.25, 0.50, 0.75])``).
    attribute : str
        Column of ``quantile_value`` to use. ``'recency'`` is scored on an
        inverted scale (smaller value -> higher class); every other attribute
        is scored directly (larger value -> higher class).

    Returns
    -------
    str
        ``'1'``, ``'2'``, ``'3'`` or ``'4'``, where ``'4'`` is the best class.
        A value equal to a cut-off belongs to the lower quartile. A missing
        value (NaN/None/NaT) is always given the lowest class ``'1'``.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # frequency/monetary value would fall through to the *best* class.
    if pd.isna(x):
        return '1'

    cutoffs = [quantile_value.loc[q, attribute] for q in (0.25, 0.50, 0.75)]

    # Number of cut-offs strictly below x: 0 for the bottom quartile, 3 for the top one.
    quartile_position = sum(1 for cutoff in cutoffs if x > cutoff)

    if attribute == 'recency':
        # Fewer days since the last order is better, so the scale is reversed.
        return str(4 - quartile_position)
    # Frequency and monetary: larger is better.
    return str(1 + quartile_position)
        

# Turn each raw metric into its quartile class and store it in a '<metric>_class' column.
for metric in ('recency', 'frequency', 'monetary'):
    rfm[metric + '_class'] = rfm[metric].apply(quantile_classes, args=(quantile_df, metric))

display(rfm.head())

Combine all of these individual classes into a single column.

In [None]:
# Concatenate the three class strings in R, F, M order (e.g. '4' + '4' + '4' -> '444') ...
rfm_cell = rfm['recency_class'] + rfm['frequency_class'] + rfm['monetary_class']

# ... and store the result as a number so it can be sorted numerically.
rfm['rfm_comb'] = pd.to_numeric(rfm_cell)

# Highest combined score first.
rfm = rfm.sort_values(by='rfm_comb', ascending=False)

# Show the ten top-ranked customers.
display(rfm.head(10))