In [None]:
!pip install Lifetimes
!pip install squarify
!pip install plotly

: 

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from lifelines import CoxPHFitter
from pandas_profiling import ProfileReport
import datetime as dt
import plotly.express as px
from lifetimes import BetaGeoFitter
from lifetimes import GammaGammaFitter
from lifetimes.plotting import *
from lifetimes.utils import *
from lifetimes.utils import summary_data_from_transaction_data
from lifetimes.plotting import plot_probability_alive_matrix
from lifetimes.plotting import plot_frequency_recency_matrix
from lifetimes.plotting import plot_period_transactions
from lifetimes.utils import calibration_and_holdout_data
import squarify

: 

# Data Preparation

In [None]:
# Load the raw transaction sheet. Later cells read the columns 'Customer ID',
# 'Description', 'InvoiceDate', 'Invoice', 'Quantity', 'Price' and 'Country' from it.
data = pd.read_excel("Dataset.xlsx")

: 

In [None]:
# Earliest and latest invoice timestamps present in the raw data.
data['InvoiceDate'].agg(['min', 'max'])

: 

In [None]:
# Preview the first rows of the raw data.
data.head()

: 

In [None]:
# Cleaned transaction table used by the plots and the lifetimes helpers below:
# exact duplicate rows removed, only the needed columns kept, and only rows
# with a positive quantity retained.
# The chain ends in an explicit copy so that adding 'TotalPurchase' writes to an
# independent frame instead of a filtered slice (avoids SettingWithCopyWarning).
fd = (
    data.drop_duplicates()
    .loc[:, ['Customer ID', 'Description', 'InvoiceDate', 'Invoice', 'Quantity', 'Price', 'Country']]
    .loc[lambda frame: frame['Quantity'] > 0]
    .copy()
)
# Line-item revenue.
fd['TotalPurchase'] = fd['Quantity'] * fd['Price']

: 

In [None]:
# Revenue per product description, restricted to the five largest earners.
df_plot_bar = (
    fd.groupby('Description')[['TotalPurchase']]
    .sum()
    .sort_values(by='TotalPurchase', ascending=False)
    .reset_index()
    .head(5)
)
# Share of each product within the combined revenue of those five rows only.
top_total = df_plot_bar['TotalPurchase'].sum()
df_plot_bar['Percent'] = (df_plot_bar['TotalPurchase'] / top_total * 100).round(2)

fir_plotbar = px.bar(
    df_plot_bar,
    x='Description',
    y='Percent',
    text='Percent',
    color='Percent',
    title='Top selling products',
)
fir_plotbar.update_traces(textposition='inside', texttemplate='%{text:.2s}')
background_colors = {
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(1, 0, 0, 0)',
}
fir_plotbar.update_layout(background_colors)
# Last expression returns the figure, which is what renders the chart.
fir_plotbar.update_layout(showlegend=False, uniformtext_mode='hide', uniformtext_minsize=8)
            

: 

In [None]:
# Revenue for every (country, product, unit price, quantity) combination.
# Only 'TotalPurchase' is aggregated: 'Quantity' is already a grouping key, and
# the former second dict was a stray positional argument to .agg(), not an
# aggregation spec.
df_plot = (
    fd.groupby(['Country', 'Description', 'Price', 'Quantity'])
    .agg({'TotalPurchase': 'sum'})
    .reset_index()
)
# Plot only the first 25,000 rows; both axes are logarithmic.
fig_miricle = px.scatter(df_plot[:25000], x="Price", y="Quantity", color='Country',
                         size='TotalPurchase', size_max=20, log_y=True, log_x=True,
                         title="PURCHASE TREND ACROSS COUNTRIES")
fig_miricle.update_layout({
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(1, 0, 0, 0)',
})
fig_miricle.show()

: 

In [None]:
import plotly.graph_objects as go
# Single trace: quantity of every transaction row against its invoice timestamp.
quantity_trace = go.Scatter(x=fd['InvoiceDate'], y=fd['Quantity'])
fig = go.Figure(data=[quantity_trace])
fig.show()

: 

In [None]:
from wordcloud import WordCloud
# One long string made of the country value of every row in the raw data.
text = " ".join(review for review in data.Country.astype(str))
# No mask is passed: the lines that built it are commented out, so `mask=mask`
# raised NameError on a fresh kernel. The cloud is drawn on the plain
# width x height canvas requested here.
#mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
#mask = 560 * mask.astype(int)
wc = WordCloud(background_color="white", repeat=True, width=1600, height=800, colormap='Dark2')
wc.generate(text)
plt.axis("off")
plt.imshow(wc, interpolation="bilinear")
plt.show()


: 

In [None]:
# Per-customer summary built by lifetimes from the cleaned transactions,
# observed up to 2011-12-9.
new = summary_data_from_transaction_data(
    fd,
    customer_id_col='Customer ID',
    datetime_col='InvoiceDate',
    monetary_value_col='TotalPurchase',
    observation_period_end='2011-12-9',
)
new.head()

: 

In [None]:
# Each customer's frequency as a percentage of the summed frequency.
total_frequency = new['frequency'].sum()
new['percent'] = (new['frequency'] / total_frequency * 100).round(2)
fir_plot = px.bar(
    new,
    x=new['frequency'],
    y=new['percent'],
    color='percent',
    title='Frequency BarChart',
)
fir_plot.show()

: 

In [None]:
# Per (customer, country) aggregates, in this column order:
#   InvoiceDate   -> days between first and last purchase
#   Quantity      -> total units bought
#   Invoice       -> number of DISTINCT invoices (one invoice spans many line
#                    items, so counting rows with len() overstated the number
#                    of transactions)
#   TotalPurchase -> total money spent
fdg = fd.groupby(['Customer ID', 'Country']).agg({
    'InvoiceDate': lambda date: (date.max() - date.min()).days,
    'Quantity': 'sum',
    'Invoice': 'nunique',
    'TotalPurchase': 'sum',
})

: 

In [None]:
# Labels follow the aggregation order InvoiceDate, Quantity, Invoice, TotalPurchase:
# the Quantity sum is the unit count and the Invoice count is the transaction
# count (these two labels were previously swapped).
fdg.columns = ['num_days', 'num_units', 'num_transactions', 'spent_money']
fdg['avg_order_value'] = fdg['spent_money'] / fdg['num_transactions']
# Derive the customer count from the data instead of a hard-coded 4319.
n_customers = fdg.index.get_level_values('Customer ID').nunique()
purchase_frequency = fdg['num_transactions'].sum() / n_customers
# Share of rows with more than one transaction; churn is its complement.
repeat_rate = round(fdg[fdg.num_transactions > 1].shape[0] / fdg.shape[0], 2)
churn_rate = round(1 - repeat_rate, 2)

: 

In [None]:
# Display only: the result is not assigned, so `fdg` keeps its group index here.
fdg.reset_index()

: 

In [None]:
# Simple CLV figures per (customer, country) row.
fdg['profit_margin'] = fdg['spent_money'] * 0.05  # 5% margin
fdg['CLV'] = (fdg['avg_order_value'] * purchase_frequency) / churn_rate
fdg.reset_index(inplace=True)
# Format each money column with thousands separators for display.
# (Assigning to fdg['a', 'b', 'c'] created ONE column named by the tuple and
# left the three real columns untouched.) The columns become strings here, so
# all numeric work on them must happen above this loop.
for money_col in ['spent_money', 'avg_order_value', 'profit_margin']:
    fdg[money_col] = fdg[money_col].apply(lambda x: "{:,}".format(x))

# Prepare the raw data for the RFM / BG-NBD cells below.
data.dropna(inplace=True)
data["InvoiceDate"] = pd.to_datetime(data["InvoiceDate"])
# Reference date from which recency and customer age are measured.
today_date = dt.datetime(2011, 12, 11)
data["TotalPrice"] = data["Price"] * data["Quantity"]





: 

In [None]:
#np.log(fdg.CLV).describe()
# Show the three scalar rates computed earlier as a tuple.
purchase_frequency,repeat_rate,churn_rate


: 

In [None]:
# Preview the per-customer table after the CLV columns were added.
fdg.head()

: 

In [None]:
def _days_since_last_purchase(invoice_dates):
    """Whole days between `today_date` and the latest date in `invoice_dates`."""
    return (today_date - invoice_dates.max()).days


# One row per customer: recency (days), frequency (distinct invoices),
# monetary (summed TotalPrice).
rfm = data.groupby("Customer ID").agg({
    "InvoiceDate": _days_since_last_purchase,
    "Invoice": "nunique",
    "TotalPrice": "sum",
})
rfm.columns = ["recency", "frequency", "monetary"]
# Keep only customers whose summed spend is positive.
has_positive_spend = rfm["monetary"] > 0
rfm = rfm[has_positive_spend]
rfm.describe().T

: 

In [None]:
# Quintile scores. Recency labels are reversed so that a smaller recency value
# receives the higher score.
ascending_labels = [1, 2, 3, 4, 5]
descending_labels = [5, 4, 3, 2, 1]
rfm["recency_score"] = pd.qcut(rfm['recency'], q=5, labels=descending_labels)
# Rank first so ties in frequency do not produce duplicate bin edges.
frequency_rank = rfm["frequency"].rank(method="first")
rfm["frequency_score"] = pd.qcut(frequency_rank, q=5, labels=ascending_labels)
rfm["monetary_score"] = pd.qcut(rfm["monetary"], q=5, labels=ascending_labels)
# Two-character code: recency score followed by frequency score.
recency_digit = rfm["recency_score"].astype(str)
frequency_digit = rfm["frequency_score"].astype(str)
rfm["RFM_SCORE"] = recency_digit + frequency_digit

: 

In [None]:
# Regex over the two-digit RFM_SCORE (recency digit, frequency digit) -> segment name.
seg_map = dict([
    (r'[1-2][1-2]', 'HIBERNATING'),
    (r'[1-2][3-4]', 'AT RISK'),
    (r'[1-2]5', 'CANT LOSE'),
    (r'3[1-2]', 'ABOUT TO SLEEP'),
    (r'33', 'NEED ATTENTION'),
    (r'[3-4][4-5]', 'LOYAL CUSTOMER'),
    (r'41', 'PROMISING'),
    (r'51', 'NEW CUSTOMERS'),
    (r'[4-5][2-3]', 'POTENTIAL LOYALIST'),
    (r'5[4-5]', 'CHAMPIONS'),
])
rfm['segment'] = rfm['RFM_SCORE'].replace(to_replace=seg_map, regex=True)
rfm.head(10)

: 

In [None]:
# Mean and customer count of each RFM metric per segment.
rfm.groupby("segment")[["recency", "frequency", "monetary"]].agg(["mean", "count"])

: 

In [None]:
#!pip install pyecharts

: 

In [None]:
from pyecharts.charts import Pie
from pyecharts import options as opts

# Segment names and their customer counts, largest first.
sgm = rfm["segment"].value_counts()
c = sgm.index.tolist()
d = sgm.tolist()
color_series = [
    '#5b9aa0', '#667292', '#8d9db6', '#daebe8', '#d6d4e0',
    '#e4d1d1', '#b0aac0', '#f9d5e5', '#b9b0b0', '#622569',
]

# Rose (area) pie chart: one [segment, count] pair per slice.
rosechart = Pie(init_opts=opts.InitOpts(width='1350px', height='750px'))
rosechart.set_colors(color_series)
rosechart.add(
    "",
    [[segment_name, customer_count] for segment_name, customer_count in zip(c, d)],
    radius=["20%", "95%"],
    center=["30%", "60%"],
    rosetype="area",
)
rosechart.set_global_opts(
    title_opts=opts.TitleOpts(title='CLV', subtitle="Customer Segmentation"),
    legend_opts=opts.LegendOpts(is_show=False),
    toolbox_opts=opts.ToolboxOpts(),
)
slice_labels = opts.LabelOpts(
    is_show=True,
    position="inside",
    font_size=12,
    formatter="{b}:{c}",
    font_style="italic",
    font_weight="bold",
    font_family="Century",
)
rosechart.set_series_opts(label_opts=slice_labels)
rosechart.render_notebook()
        



: 

In [None]:
# Non-null row count of every column per segment; the RFM_SCORE count is used
# as the tile size.
df_treemap = rfm.groupby('segment').count().reset_index()
treemap_colors = [
    '#5b9aa0', '#667292', '#8d9db6', '#daebe8', '#d6d4e0',
    '#e4d1d1', '#b0aac0', '#f9d5e5', '#b9b0b0', '#602969',
]
fig, ax = plt.subplots(1, figsize=(20, 10))
squarify.plot(
    sizes=df_treemap['RFM_SCORE'],
    label=df_treemap['segment'],
    color=treemap_colors,
)
plt.axis('off')
plt.show()

: 

# Model

In [None]:
#BG/NBD Model
# Per-customer inputs:
#   recency   -> days between first and last purchase
#   T         -> days between first purchase and `today_date`
#   frequency -> number of distinct invoices
#   monetary  -> summed TotalPrice (turned into a per-invoice average below)
cltv_df = data.groupby('Customer ID').agg({'InvoiceDate': [lambda date: (date.max() - date.min()).days,
                                                           lambda date: (today_date - date.min()).days],
                                           'Invoice':      lambda num: num.nunique(),
                                           'TotalPrice':   lambda TotalPrice: TotalPrice.sum()})

cltv_df.columns = cltv_df.columns.droplevel(0)
cltv_df.columns = ['recency', 'T', 'frequency', 'monetary']
cltv_df["monetary"] = cltv_df["monetary"] / cltv_df["frequency"]
# Explicit copies after each filter: the following column assignments (here and
# in later cells that add prediction columns) then write to an independent frame
# instead of a slice, which avoids SettingWithCopyWarning.
cltv_df = cltv_df[cltv_df["monetary"] > 0].copy()
# Convert both durations from days to weeks.
cltv_df["recency"] = cltv_df["recency"] / 7
cltv_df["T"] = cltv_df["T"] / 7
# Keep only customers with more than one invoice.
cltv_df = cltv_df[(cltv_df['frequency'] > 1)].copy()

: 

In [None]:
# NOTE(review): InvoiceDate is already converted with pd.to_datetime in an
# earlier cell, so this looks like a redundant safeguard — confirm before removing.
data["InvoiceDate"] = pd.to_datetime(data["InvoiceDate"])

: 

In [None]:
# Preview the BG/NBD input table (recency and T are in weeks).
cltv_df.head()

: 

In [None]:
# Fit the BG/NBD purchase model on the weekly per-customer table, without regularisation.
bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(
    frequency=cltv_df['frequency'],
    recency=cltv_df['recency'],
    T=cltv_df['T'],
)

: 

In [None]:
# Open a 12x8 figure first so the lifetimes helper draws the fitted model's
# frequency/recency matrix onto it.
fig = plt.figure(figsize=(12,8))
plot_frequency_recency_matrix(bgf)

: 

In [None]:
# Open a 12x8 figure first so the lifetimes helper draws the fitted model's
# probability-alive matrix onto it.
fig = plt.figure(figsize=(12,8))
plot_probability_alive_matrix(bgf)

: 

In [None]:
# Split the cleaned transactions into a calibration window (up to 2010-11-09)
# and a holdout window (up to 2011-01-01).
summary_cal_holdout = calibration_and_holdout_data(
    fd,
    customer_id_col='Customer ID',
    datetime_col='InvoiceDate',
    calibration_period_end='2010-11-09',
    observation_period_end='2011-01-01',
)
print(summary_cal_holdout.head())

: 

In [None]:
# Validate on the calibration/holdout split with a SEPARATE model instance.
# Refitting the shared `bgf` here would replace the model fitted on `cltv_df`
# (durations divided by 7, i.e. weeks) with one fitted on the calibration table
# built with the helper's default time unit, so the later predictions on
# `cltv_df` would mix time units.
bgf_cal = BetaGeoFitter(penalizer_coef=0.0)
bgf_cal.fit(summary_cal_holdout['frequency_cal'], summary_cal_holdout['recency_cal'], summary_cal_holdout['T_cal'])
plot_calibration_purchases_vs_holdout_purchases(bgf_cal, summary_cal_holdout)

: 

In [None]:
# Expected number of purchases per customer over the next 1 and 4 periods
# (recency/T in cltv_df are expressed in weeks).
for column_name, periods_ahead in (("expected_purc_1_week", 1), ("expected_purc_1_month", 4)):
    cltv_df[column_name] = bgf.predict(
        periods_ahead,
        cltv_df['frequency'],
        cltv_df['recency'],
        cltv_df['T'],
    )
# Only the last expression of a cell is rendered, so a single sorted view is shown.
cltv_df.sort_values("expected_purc_1_month", ascending=False)

: 

# Gamma-Gamma model

In [None]:
# Fit the Gamma-Gamma spend model on each customer's invoice count and
# average spend per invoice, with a penalizer of 0.1.
ggf = GammaGammaFitter(penalizer_coef=0.1)
ggf.fit(cltv_df['frequency'], cltv_df['monetary'])
# NOTE(review): the bare `fd` only displays the transaction frame and looks
# unrelated to the fit above — confirm it is intentional.
fd

: 

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
def evaluate_clv(actual, predicted, bins):
    """Compare predictions with actual values, numerically and per bin.

    Prints the mean absolute error and draws a predicted-vs-actual scatter
    plot. Both inputs are then discretised into ordinal bins (k-means bin
    edges learned from ``actual`` only) and a row-normalised confusion matrix
    heatmap, the macro F1 score and the number of actual samples per bin are
    shown.

    Parameters
    ----------
    actual : array-like
        Observed values, one per sample.
    predicted : array-like
        Predicted values, aligned with ``actual``.
    bins : int
        Number of bins requested from the discretiser; also the size of the
        confusion matrix.

    Returns
    -------
    None
        Results are printed and plotted.
    """
    print(f"Average absolute error: {mean_absolute_error(actual, predicted)}")
    #Evaluate numeric
    plt.figure(figsize=(10, 7))
    plt.scatter(predicted, actual)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Predicted vs Actual')
    plt.show()

    #Evaluate Bins
    est = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy='kmeans')
    actual_column = np.array(actual).reshape(-1, 1)
    predicted_column = np.array(predicted).reshape(-1, 1)
    est.fit(actual_column)
    # Ordinal codes come back as floats; use integer bin ids as class labels.
    actual_bin = est.transform(actual_column).ravel().astype(int)
    predicted_bin = est.transform(predicted_column).ravel().astype(int)

    # Pin the label set so the matrix is always bins x bins, even when a bin is
    # absent from both series (otherwise the DataFrame below fails on a shape
    # mismatch with its index/columns).
    cm = confusion_matrix(actual_bin, predicted_bin, labels=np.arange(bins), normalize='true')
    df_cm = pd.DataFrame(cm, index = range(1, bins+1),
                      columns = range(1, bins+1))
    plt.figure(figsize = (20,10))
    sns.heatmap(df_cm, annot=True)

    # fix for mpl bug that cuts off top/bottom of seaborn viz
    b, t = plt.ylim() # discover the values for bottom and top
    b += 0.5 # Add 0.5 to the bottom
    t -= 0.5 # Subtract 0.5 from the top
    plt.ylim(b, t) # update the ylim(bottom, top) values
    plt.show()
    print(f'F1 score: {f1_score(actual_bin, predicted_bin, average="macro")}')
    print('Samples in each bin: \n')
    print(pd.Series(actual_bin).value_counts())



: 

In [None]:
# Calibration/holdout split that also carries the average monetary value per period.
summary_cal_holdout = calibration_and_holdout_data(fd, 'Customer ID', 'InvoiceDate',
                                                   calibration_period_end='2010-11-09',
                                                   observation_period_end='2011-01-01',
                                                   monetary_value_col='TotalPurchase')
# Keep only customers with a positive calibration-period monetary value.
summary_cal_holdout = summary_cal_holdout[(summary_cal_holdout['monetary_value_cal'] > 0)]

# Validate with a separate model so the `ggf` fitted on `cltv_df` is not overwritten.
ggf_cal = GammaGammaFitter(penalizer_coef=0.1)
ggf_cal.fit(summary_cal_holdout['frequency_cal'],
            summary_cal_holdout['monetary_value_cal'])
# Predict from CALIBRATION inputs only. Feeding the holdout frequency and
# monetary value into the prediction leaks the target that is evaluated below.
monetary_pred = ggf_cal.conditional_expected_average_profit(summary_cal_holdout['frequency_cal'],
                                                            summary_cal_holdout['monetary_value_cal'])
evaluate_clv(summary_cal_holdout['monetary_value_holdout'], monetary_pred, bins=10)


: 

In [None]:
# Validation model fitted on the calibration window only. It is kept in its own
# variable: rebinding `bgf` here would make the CLV cells below apply a model
# fitted on the calibration table to `cltv_df`, whose recency/T are in weeks.
bgf_cal = BetaGeoFitter(penalizer_coef=0.0)
bgf_cal.fit(summary_cal_holdout['frequency_cal'], summary_cal_holdout['recency_cal'], summary_cal_holdout['T_cal'])

#Predict
# Use the actual holdout length as the horizon so the prediction covers the same
# window as `frequency_holdout` (a fixed 30 periods covered only part of it).
holdout_duration = summary_cal_holdout['duration_holdout'].iloc[0]
predicted_bgf = bgf_cal.predict(holdout_duration,
                                summary_cal_holdout['frequency_cal'],
                                summary_cal_holdout['recency_cal'],
                                summary_cal_holdout['T_cal'])
evaluate_clv(summary_cal_holdout['frequency_holdout'], predicted_bgf, bins=10)




: 

In [None]:
# Gamma-Gamma estimate of each customer's expected average profit per transaction.
expected_profit = ggf.conditional_expected_average_profit(
    cltv_df['frequency'],
    cltv_df['monetary'],
)
cltv_df["expected_average_profit"] = expected_profit
cltv_df.sort_values(by="expected_average_profit", ascending=False)

: 

In [None]:
# Combine the purchase model (bgf) and the spend model (ggf) into a per-customer
# lifetime value for the weekly table.
# presumably `time` is in months, as in the 1-month cell further down — verify against the lifetimes docs.
cltv = ggf.customer_lifetime_value(bgf,
                                   cltv_df['frequency'],
                                   cltv_df['recency'],
                                   cltv_df['T'],
                                   cltv_df['monetary'],
                                   time=6, 
                                   freq="W",
                                   discount_rate=0.01)

: 

In [None]:
# Turn the CLV result into a frame with a 'Customer ID' column and attach it to
# the per-customer table.
cltv = cltv.reset_index()
cltv_final = pd.merge(cltv_df, cltv, how="left", on="Customer ID")
# Ten customers with the highest predicted CLV.
cltv_final.sort_values(by="clv", ascending=False).head(10)

: 

In [None]:
# 1 Month CLTV:
one_month_clv = ggf.customer_lifetime_value(
    bgf,
    cltv_df['frequency'],
    cltv_df['recency'],
    cltv_df['T'],
    cltv_df['monetary'],
    time=1,  # 1 month
    freq="W",  # frequency of T
    discount_rate=0.01,
)
# Attach the CLV to the per-customer table and show the top five customers.
cltv_1 = pd.merge(cltv_df, one_month_clv.reset_index(), how="left", on="Customer ID")
cltv_1.sort_values(by="clv", ascending=False).head(5)

: 

In [None]:
# CLTV with time=12; otherwise the same settings as the shorter-horizon cells.
twelve_month_clv = ggf.customer_lifetime_value(
    bgf,
    cltv_df['frequency'],
    cltv_df['recency'],
    cltv_df['T'],
    cltv_df['monetary'],
    time=12,
    freq="W",
    discount_rate=0.01,
)
# Attach the CLV to the per-customer table and show the top five customers.
cltv_12 = pd.merge(cltv_df, twelve_month_clv.reset_index(), how="left", on="Customer ID")
cltv_12.sort_values(by="clv", ascending=False).head(5)

: 

: 

: 

: 

: 

: 

: 

: 