In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
path = '/kaggle/input/ecommerce-data/data.csv'
# Parse InvoiceDate while loading. Without parse_dates the column stays as
# text, and the later min()/max() calls and date-range filter would compare
# strings lexicographically instead of chronologically.
df = pd.read_csv(path, parse_dates=['InvoiceDate'])
df.head()

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column before any cleaning.
df.isna().sum()

In [None]:
# Distinct values present in the Country column.
df['Country'].unique()

In [None]:
# Inspect the distinct Quantity values. Some are <= 0; those rows are
# removed in the next step.

df['Quantity'].unique()

In [None]:
# Keep only rows whose Quantity is strictly positive.
has_positive_quantity = df['Quantity'] > 0
df = df.loc[has_positive_quantity]
df.head()


In [None]:
# Discard rows that have no CustomerID; RFM metrics are computed per customer.
has_customer_id = df['CustomerID'].notna()
df = df.loc[has_customer_id]
df.shape

In [None]:
# Re-check missing values per column after the cleaning steps.
df.isna().sum()

In [None]:
# df is now cleaned: rows with a non-positive Quantity or a missing CustomerID have been removed.



In [None]:
# Preprocessing for RFM analysis: latest invoice timestamp in the data.
# Parse to datetime first; max() on a text column is lexicographic, not
# chronological.
pd.to_datetime(df['InvoiceDate']).max()

In [None]:
# Earliest invoice timestamp. Parse to datetime first; min() on a text
# column is lexicographic, not chronological.
pd.to_datetime(df['InvoiceDate']).min()

In [None]:
# Restrict the data to one full year for the RFM analysis.
#
# read_csv was called without parse_dates, so InvoiceDate may still be text
# here; comparing text to '2010-12-09' is a character-by-character string
# comparison, not a date comparison. Convert the column to datetimes and
# compare against a real Timestamp. to_datetime leaves an already-parsed
# column unchanged, so re-running this cell is safe.
df = df.assign(InvoiceDate=pd.to_datetime(df['InvoiceDate']))
df = df[df['InvoiceDate'] >= pd.Timestamp('2010-12-09')]

In [None]:
# Earliest invoice timestamp after the one-year filter. Parsed so that the
# minimum is chronological even if the column is still text.
pd.to_datetime(df['InvoiceDate']).min()

In [None]:
# Latest invoice timestamp after the one-year filter. Parsed so that the
# maximum is chronological even if the column is still text.
# The gap between the minimum and maximum dates is expected to be about one
# year - confirm against the displayed values.
pd.to_datetime(df['InvoiceDate']).max()

In [None]:
# Reduce each invoice timestamp to its calendar date. df is a filtered slice
# at this point, so build a new frame with assign() rather than writing the
# column in place (which triggers SettingWithCopyWarning).
df = df.assign(InvoiceDate=pd.DatetimeIndex(df['InvoiceDate']).date)

# Reference date for recency: the most recent invoice date in the data.
snapshot_day = df['InvoiceDate'].max()
df.head(3)




In [None]:
# Monetary value of each row: units bought times price per unit.
# assign() returns a new frame, which avoids writing into a filtered slice
# (SettingWithCopyWarning).
df = df.assign(TotalSum=df['Quantity'] * df['UnitPrice'])

df.head(3)

In [None]:
# Recency, step 1: most recent invoice date for every customer.
last_invoice_by_customer = df.groupby('CustomerID')['InvoiceDate'].max()
df_recency = last_invoice_by_customer.reset_index()


In [None]:
# Recency, step 2: whole days between the snapshot date and each customer's
# last invoice; the raw date column is dropped afterwards.
days_since_last_invoice = df_recency['InvoiceDate'].map(
    lambda last_date: (snapshot_day - last_date).days
)
df_recency = (
    df_recency
    .assign(Recency=days_since_last_invoice)
    .drop(columns='InvoiceDate')
)
df_recency.head()

In [None]:
# Build the per-customer RFM table in one pass using named aggregation:
#   Recency       - days from the customer's last invoice date to snapshot_day
#   Frequency     - number of non-null InvoiceNo entries for the customer
#                   NOTE(review): this counts rows, not distinct invoice
#                   numbers - confirm that is the intended definition.
#   MonetaryValue - sum of TotalSum for the customer
data_process = df.groupby(['CustomerID']).agg(
    Recency=('InvoiceDate', lambda dates: (snapshot_day - dates.max()).days),
    Frequency=('InvoiceNo', 'count'),
    MonetaryValue=('TotalSum', 'sum'),
)
data_process

In [None]:
# Turn CustomerID back into an ordinary column and store it as a 64-bit integer.
data_process = (
    data_process
    .reset_index()
    .astype({'CustomerID': 'int64'})
)

data_process.head()

In [None]:
# Quartile labels. Recency is reversed so that the most recent customers
# (smallest Recency) receive the highest label; Frequency and MonetaryValue
# ascend with the metric.
r_label = range(4, 0, -1)
f_label = range(1, 5)
m_label = range(1, 5)

# Cut every metric into four equal-sized quantile bins, in R, F, M order.
r_groups, f_groups, m_groups = (
    pd.qcut(data_process[metric], q=4, labels=quartile_labels)
    for metric, quartile_labels in (
        ('Recency', r_label),
        ('Frequency', f_label),
        ('MonetaryValue', m_label),
    )
)

# Attach the three quartile scores as new columns R, F and M.
data_process = data_process.assign(
    R=r_groups.values,
    F=f_groups.values,
    M=m_groups,
)
data_process.head()




In [None]:
# Concat RFM quartile values to create RFM Segments
def join_rfm(x):
    """Return the R, F and M values of row ``x`` concatenated as a string.

    Each value is converted with ``int()`` first, so ``R=4, F=1, M=2``
    yields ``'412'``.
    """
    return ''.join(str(int(x[part])) for part in ('R', 'F', 'M'))
# Build the three-digit segment code for every customer row.
segment_codes = data_process.apply(join_rfm, axis=1)
data_process['RFM_Segment_Concat'] = segment_codes
# rfm is a second name for the same frame, not a copy.
rfm = data_process
rfm.head()

In [None]:
def score(x):
    """Return the integer sum of the R, F and M values of row ``x``."""
    return sum(int(x[part]) for part in ('R', 'F', 'M'))

# Row-wise total of the three quartile scores, stored as the 'score' column.
rfm_totals = rfm.apply(score, axis=1)
data_process['score'] = rfm_totals
data_process

In [None]:
def rfm_level(score):
    """Map a combined RFM score to a named customer tier.

    Tiers by score:
        below 4       -> 'bottom'
        4 up to 6     -> 'lower'
        7 up to 9     -> 'medium'
        10 and above  -> 'Top'

    Thresholds are checked in ascending order, so a score of 1 or less is
    labelled 'bottom' instead of falling through to the final branch and
    being reported as 'Top'. All scores above 1 map exactly as before.

    Args:
        score: Numeric RFM score (the sum of the R, F and M quartile values).

    Returns:
        One of 'bottom', 'lower', 'medium' or 'Top'.
    """
    if score < 4:
        return 'bottom'
    if score < 7:
        return 'lower'
    if score < 10:
        return 'medium'
    return 'Top'

In [None]:
# Translate each customer's numeric score into its tier name.
tier_names = data_process['score'].apply(rfm_level)
data_process['Level'] = tier_names
data_process

In [None]:
# Per-tier summary: average Recency and Frequency, plus the average
# MonetaryValue and the number of customers in each tier.
level_summary_spec = {
    'Recency': 'mean',
    'Frequency': 'mean',
    'MonetaryValue': ['mean', 'count'],
}
data_process.groupby('Level').agg(level_summary_spec)