In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime as dt

# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None
# The row limit is deliberately left at the pandas default.
# Render floats with five fixed decimals.
pd.options.display.float_format = "{:.5f}".format

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

###############################################################
# Business Problem
###############################################################

* An e-commerce company wants to segment its customers and determine marketing strategies according to these segments.

###############################################################
# Data Understanding
###############################################################

Attribute Information:

* InvoiceNo: Invoice number. Nominal. A 6-digit integral number uniquely assigned to each transaction. If this code starts with the letter 'C', it indicates a cancellation.
* StockCode: Product (item) code. Nominal. A 5-digit integral number uniquely assigned to each distinct product.
* Description: Product (item) name. Nominal.
* Quantity: The quantities of each product (item) per transaction. Numeric.
* InvoiceDate: Invoice date and time. Numeric. The day and time when a transaction was generated.
* UnitPrice: Unit price. Numeric. Product price per unit in sterling (£).
* CustomerID: Customer number. Nominal. A 5-digit integral number uniquely assigned to each customer.
* Country: Country name. Nominal. The name of the country where a customer resides.

In [None]:
# df_ keeps the freshly loaded data untouched; all processing happens on the
# independent deep copy df, so the raw frame can be reused without re-reading
# the CSV.
df_ = pd.read_csv(filepath_or_buffer="../input/online-retail-ii-uci/online_retail_II.csv")
df = df_.copy(deep=True)
df.head(n=5)

In [None]:
# Structural overview of the frame: each banner and the value it introduces are
# emitted by a single print call, separated by a newline.
print("##################### Index #####################", df.index, sep="\n")
print("##################### Shape #####################", df.shape, sep="\n")
print("##################### Types #####################", df.dtypes, sep="\n")
# Missing values per column, then the grand total over all columns.
print("##################### NA #####################", df.isna().sum(), sep="\n")
print("##################### Total NA #####################", df.isna().sum().sum(), sep="\n")
# Summary statistics, transposed so that each column becomes one row.
print("##################### Describe #####################", df.describe().transpose(), sep="\n")

In [None]:
# Missing values are not handled in this analysis: every row that has a null
# in any column is removed from df in place.
df.dropna(axis=0, how="any", inplace=True)

In [None]:
# An invoice number that starts with "C" marks a cancellation; those rows are
# excluded. The match is anchored to the start of the string (the documented
# rule) instead of looking for a "C" anywhere in the invoice number.
# na=False treats a missing invoice number as "not a cancellation", so such rows are kept.
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df = df[~df["Invoice"].str.startswith("C", na=False)].copy()

In [None]:
# Line-level revenue (quantity x unit price); summed per customer later on to
# obtain the monetary value.
df["TotalPrice"] = df["Quantity"].mul(df["Price"])

In [None]:
# Fixed reference ("analysis") date; Recency is measured as the number of days
# between this date and a customer's last invoice.
# NOTE(review): presumably chosen shortly after the last invoice in the data - confirm.
today_date = dt.datetime(year=2011, month=12, day=11)

In [None]:
# Inspect column dtypes and non-null counts before InvoiceDate is converted to datetime.
df.info()

In [None]:
# Parse InvoiceDate into pandas datetimes so date arithmetic (max, subtraction)
# works on it; unparseable values raise, which is the pandas default.
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"], errors="raise")

In [None]:
# Re-inspect the frame to confirm the dtype now reported for InvoiceDate after the conversion.
df.info()

In [None]:
# Per-customer base table for RFM, one row per "Customer ID":
#   InvoiceDate -> timestamp of the customer's last invoice (turned into days below)
#   Invoice     -> number of distinct invoices
#   TotalPrice  -> total amount spent
# Built-in aggregation names are used instead of Python lambdas so pandas can
# take its optimised groupby path; the output columns keep their source names.
rfm = df.groupby("Customer ID").agg({"InvoiceDate": "max",
                                     "Invoice": "nunique",
                                     "TotalPrice": "sum"})
# Whole days elapsed between each customer's last invoice and today_date,
# computed in one vectorised step. InvoiceDate must already be a datetime column.
rfm["InvoiceDate"] = (today_date - rfm["InvoiceDate"]).dt.days

In [None]:
# Keep only customers whose total spend is strictly positive.
# .copy() makes the result an independent frame: the score columns added in
# later cells would otherwise be written to a filtered slice and raise
# SettingWithCopyWarning.
rfm = rfm[rfm["TotalPrice"] > 0].copy()

In [None]:
# Give the aggregated columns their RFM names through an explicit
# source -> target mapping. Unlike a positional assignment, this cannot
# mislabel a column if the order of the aggregation spec changes.
rfm = rfm.rename(columns={"InvoiceDate": "Recency",
                          "Invoice": "Frequency",
                          "TotalPrice": "Monetary"})

In [None]:
# Split Recency into five quantile bins. The labels run 5 -> 1 over ascending
# bins, so the smallest Recency values (most recent buyers) get the top score.
rfm["recency_score"] = pd.qcut(rfm["Recency"], q=5, labels=list(range(5, 0, -1)))

In [None]:
# Frequency is binned on its rank rather than on the raw value:
# rank(method="first") gives tied frequencies distinct ranks, so qcut receives
# distinct values to cut into five bins. More invoices -> higher score (1 -> 5).
rfm["frequency_score"] = pd.qcut(
    rfm["Frequency"].rank(method="first"),
    q=5,
    labels=list(range(1, 6)),
)

In [None]:

# Split Monetary into five quantile bins; higher spend -> higher score (1 -> 5).
rfm["monetary_score"] = pd.qcut(rfm["Monetary"], q=5, labels=list(range(1, 6)))


In [None]:
# Combined score string: the recency, frequency and monetary scores are
# converted to text and concatenated in that order (e.g. "5" + "4" + "3" -> "543").
rfm["RFM_SCORE"] = rfm['recency_score'].astype(str).str.cat(
    [rfm['frequency_score'].astype(str), rfm['monetary_score'].astype(str)]
)

In [None]:
# Preview the first rows, now including the three score columns and RFM_SCORE.
rfm.head()

In [None]:
# Regex -> segment label lookup, applied to a two-character string made of the
# recency score followed by the frequency score. Each (pattern, label) pair
# covers a rectangle of that 5 x 5 grid; insertion order is kept.
seg_map = dict([
    (r'[1-2][1-2]', 'hibernating'),          # recency 1-2, frequency 1-2
    (r'[1-2][3-4]', 'at_Risk'),              # recency 1-2, frequency 3-4
    (r'[1-2]5', 'cant_loose'),               # recency 1-2, frequency 5
    (r'3[1-2]', 'about_to_sleep'),           # recency 3,   frequency 1-2
    (r'33', 'need_attention'),               # recency 3,   frequency 3
    (r'[3-4][4-5]', 'loyal_customers'),      # recency 3-4, frequency 4-5
    (r'41', 'promising'),                    # recency 4,   frequency 1
    (r'51', 'new_customers'),                # recency 5,   frequency 1
    (r'[4-5][2-3]', 'potential_loyalists'),  # recency 4-5, frequency 2-3
    (r'5[4-5]', 'champions'),                # recency 5,   frequency 4-5
])

In [None]:
# Segment label per customer: join the recency and frequency scores into a
# two-character string, then translate it through the regex patterns in seg_map.
rfm['segment'] = (
    rfm['recency_score'].astype(str)
    .str.cat(rfm['frequency_score'].astype(str))
    .replace(seg_map, regex=True)
)

In [None]:
# Preview the first rows again to check the newly assigned segment column.
rfm.head()

In [None]:
from IPython.display import HTML, display

# Per-segment mean, median and count of Recency, Frequency and Monetary,
# rendered explicitly as an HTML table.
display(
    HTML(
        rfm.loc[:, ["segment", "Recency", "Frequency", "Monetary"]]
        .groupby("segment")
        .agg(["mean", "median", "count"])
        .to_html()
    )
)

In [None]:
# Per-segment mean, median and count of Recency, Frequency and Monetary,
# shown through the notebook's default DataFrame display.
(
    rfm.loc[:, ["segment", "Recency", "Frequency", "Monetary"]]
    .groupby("segment")
    .agg(["mean", "median", "count"])
)

In [None]:
# Segment profile: number of customers per segment plus mean/median/min/max of
# each RFM metric, ordered from the largest segment to the smallest.
(
    rfm.groupby("segment")
    .agg({
        'segment': 'count',
        **{metric: ['mean', 'median', 'min', 'max']
           for metric in ('Recency', 'Frequency', 'Monetary')},
    })
    .sort_values(by=('segment', 'count'), ascending=False, axis=0)
)