In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# Data Exploration

In [None]:
# Load the raw transaction CSV into a DataFrame; the bare name on the last
# line makes the notebook render the frame.
Transaction_Data = pd.read_csv(
    filepath_or_buffer="../input/online-retail-ii-uci/online_retail_II.csv"
)
Transaction_Data

In [None]:
# A bare expression is only rendered when it is the last statement of a cell,
# so the column index has to be printed explicitly to show up in the output.
print(Transaction_Data.columns)

# info() prints its own report (dtypes and non-null counts per column).
Transaction_Data.info()

# Data Preparation

In [None]:
"""
1. remove invalid records: cancellation, no Customer ID, negative value in Quantity or Price
2. calculate new variable Revenue
3. drop unnecessary columns
"""

# Row filters, built on the raw frame so this cell can be re-run safely.
# Rows whose Invoice is missing are not treated as cancellations (na=False),
# which matches comparing the first character against "C".
is_cancellation = Transaction_Data["Invoice"].str.startswith("C", na=False)
has_customer_id = Transaction_Data["Customer ID"].notna()
has_valid_amounts = (Transaction_Data["Quantity"] >= 0) & (Transaction_Data["Price"] >= 0)

# .loc[...] followed by .assign(...) yields a new, independent frame, so adding
# Revenue neither touches Transaction_Data nor raises SettingWithCopyWarning.
Transaction_Data_Cln = (
    Transaction_Data
    .loc[~is_cancellation & has_customer_id & has_valid_amounts]
    .assign(Revenue=lambda rows: rows["Quantity"] * rows["Price"])
    [["Customer ID", "InvoiceDate", "Revenue"]]
)

Transaction_Data_Cln

In [None]:
pip install Lifetimes # Settings > Internet > enable

In [None]:
"""
1. transform transaction data to customer data
"""

from lifetimes.utils import summary_data_from_transaction_data

# Options for collapsing the cleaned transactions into one row per customer.
# The later cells of this notebook read the frequency, recency, T and
# monetary_value columns of the result.
summary_options = {
    "customer_id_col": "Customer ID",
    "datetime_col": "InvoiceDate",
    "monetary_value_col": "Revenue",
    "datetime_format": "%Y-%m-%d %H:%M:%S",
    "observation_period_end": None,
    "freq": "D",
    "freq_multiplier": 1,
    "include_first_transaction": False,
}

Customer_Data = summary_data_from_transaction_data(Transaction_Data_Cln, **summary_options)

Customer_Data

In [None]:
"""
* remove # & run the code for checking
"""

# Optional check 1: Revenue of customer 18285 summed per InvoiceDate value and
# sorted by InvoiceDate, for comparison with that customer's summary row.
#temp=Transaction_Data_Cln[Transaction_Data_Cln["Customer ID"]==18285].groupby(["InvoiceDate"]).agg({"Revenue":["sum"]}).sort_values(by=["InvoiceDate"])
#temp

# Optional check 2: the maximum InvoiceDate over all cleaned transactions.
#temp=Transaction_Data_Cln.agg({"InvoiceDate":["max"]})
#temp

# NOTE(review): the worked example that follows is presumably for customer 18285 - confirm.
# NOTE(review): its monetary_value line says the 1st transaction is excluded, yet it
# sums six amounts and divides by 5 - confirm which amount should be left out.
# The string below is the last expression of the cell, so Jupyter also echoes it as output.

"""
1. frequency: 6 distinct InvoiceDate values (freq="D") --> 6-1=5
2. recency: difference between last InvoiceDate & 1st InvoiceDate --> 2011-10-28 - 2010-05-17=529
3. T: difference between max InvoiceDate in the data & 1st InvoiceDate --> 2011-12-09 - 2010-05-17=571
4. monetary_value: average value per transaction, excluding the 1st transaction --> (862+30.6+381.5+765.28+1001.32+70.68)/5=622.276
"""

# Model

In [None]:
"""
BG/NBD Model
1. calculate expected number of purchases
2. calculate probability of alive
"""

from lifetimes import BetaGeoFitter

# The three per-customer inputs that every BG/NBD call below takes, in order.
rfm_columns = [Customer_Data[name] for name in ("frequency", "recency", "T")]

# Fit the model with no penalty term.
BGF = BetaGeoFitter(penalizer_coef=0.0)
BGF.fit(*rfm_columns)

# Uncomment to inspect the fitted model:
#BGF
#BGF.summary

# Horizon passed to the expected-purchases calculation
# (presumably days, matching freq="D" of the summary - confirm).
t = 30

expected_purchases = BGF.conditional_expected_number_of_purchases_up_to_time(t, *rfm_columns)
probability_alive = BGF.conditional_probability_alive(*rfm_columns)

# Store both predictions next to the inputs they were computed from.
Customer_Data["ExpectedPurchases"] = expected_purchases
Customer_Data["ProbAlive"] = probability_alive
Customer_Data

In [None]:
"""
Gamma-Gamma Model
1. filter monetary_value>0
2. check assumption that no relationship between monetary_value & frequency
"""

# Keep only the customers whose monetary_value is strictly positive.
has_positive_value = Customer_Data["monetary_value"].gt(0)
Customer_Data_gt0 = Customer_Data[has_positive_value]

# Correlation matrix of the two columns; a value near zero supports the
# assumption stated in the cell header.
Customer_Data_gt0.loc[:, ["monetary_value", "frequency"]].corr()

In [None]:
"""
1. calculate CLV
"""

from lifetimes import GammaGammaFitter

# Fit the Gamma-Gamma spend model on the customers with positive monetary_value.
GGF = GammaGammaFitter(penalizer_coef=0)
GGF.fit(Customer_Data_gt0["frequency"], Customer_Data_gt0["monetary_value"])

# CLV combines the fitted BG/NBD model (BGF) with the spend model.
# time=12 and discount_rate=0.01 are the horizon and discount rate handed to
# lifetimes (per its docs: months and a monthly rate - confirm); freq="D"
# matches the unit used when the customer summary was built.
customer_clv = GGF.customer_lifetime_value(
    BGF,
    Customer_Data_gt0["frequency"],
    Customer_Data_gt0["recency"],
    Customer_Data_gt0["T"],
    Customer_Data_gt0["monetary_value"],
    time=12,
    discount_rate=0.01,
    freq="D",
)

# Customer_Data_gt0 is a filtered slice of Customer_Data, so writing a column
# into it in place raises SettingWithCopyWarning. assign() builds a new frame
# (aligned on the customer index) and keeps this cell safe to re-run.
Customer_Data_gt0 = Customer_Data_gt0.assign(CLV=customer_clv)
Customer_Data_gt0

# Visualization

In [None]:
"""
1. Frequency Recency Matrix
2. Probability Alive Matrix
"""

from lifetimes.plotting import plot_frequency_recency_matrix, plot_probability_alive_matrix

# matplotlib is already required by lifetimes.plotting; it is imported here only
# to open a separate figure per chart.
import matplotlib.pyplot as plt

# Both helpers draw on subplot(111) of the *current* figure, so calling them
# back to back in one cell would paint the second matrix over the first.
# Starting a new figure before each call keeps the two charts apart.
plt.figure()
plot_frequency_recency_matrix(BGF)

plt.figure()
plot_probability_alive_matrix(BGF)

# Render the figures; printing the returned Axes would only emit their repr.
plt.show()

# Reference
1. https://lifetimes.readthedocs.io/en/latest/index.html