In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install lifetimes --upgrade


In [None]:

# visualization

import matplotlib.pyplot as plt
import seaborn as sns

# sns.set_style('whitegrid')
color = sns.color_palette()


%matplotlib inline


import lifetimes

#Let's make this notebook reproducible 
np.random.seed(42)

import random
random.seed(42)

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the e-commerce transaction export into a DataFrame (pure-Python CSV parser).
data_file = '../input/customer_segmentation/customer_segmentation.csv'
ecommerce_data = pd.read_csv(filepath_or_buffer=data_file, engine="python")

In [None]:
# Preview the first rows of the raw data.
ecommerce_data.head()

In [None]:
# Column dtypes and non-null counts of the raw data.
ecommerce_data.info()

In [None]:
# Map the raw CSV headers to snake_case column names.
column_renames = {
    'InvoiceNo': 'invoice_num',
    'StockCode': 'stock_code',
    'Description': 'description',
    'Quantity': 'quantity',
    'InvoiceDate': 'invoice_date',
    'UnitPrice': 'unit_price',
    'CustomerID': 'cust_id',
    'Country': 'country',
}

# index=str additionally converts every row label to a string.
ecommerce_data.rename(index=str, columns=column_renames, inplace=True)

# **Data Cleaning**

In [None]:
# Re-inspect dtypes and non-null counts after renaming the columns.
ecommerce_data.info()

## Check missing values for each column


In [None]:
# Count missing values per column, largest counts first.
ecommerce_data.isnull().sum().sort_values(ascending=False)

In [None]:
# Show the first rows that have a missing value in at least one column.
ecommerce_data[ecommerce_data.isnull().any(axis=1)].head()

In [None]:
# Parse invoice_date strings (month/day/year hour:minute) into Timestamps.
ecommerce_data['invoice_date'] = pd.to_datetime(
    ecommerce_data['invoice_date'],
    format='%m/%d/%Y %H:%M',
)
ecommerce_data.head()

## Remove rows with missing values


In [None]:
# ecommerce_new: the rows of ecommerce_data that have no missing value in any column.
# .copy() makes it an independent frame, so the column assignments done on
# ecommerce_new in later cells are not chained assignments on a derived object.
ecommerce_new = ecommerce_data.dropna(axis=0).copy()

In [None]:
# Count missing values per column of ecommerce_new, largest counts first.
ecommerce_new.isnull().sum().sort_values(ascending=False)

## Format data

In [None]:
# Cast cust_id to int64; every other column keeps its dtype.
ecommerce_new = ecommerce_new.astype({'cust_id': 'int64'})
ecommerce_new.head()

In [None]:
# Keep only rows with a strictly positive quantity (zero and negative are dropped).
has_positive_quantity = ecommerce_new['quantity'] > 0
ecommerce_new = ecommerce_new.loc[has_positive_quantity]

In [None]:
# Summary statistics of ecommerce_new, rounded to two decimals.
ecommerce_new.describe().round(2)

## Add the column - amount_spent


In [None]:
# amount_spent = quantity * unit_price (a product, not a power).
# .assign returns a new frame instead of writing a column onto the filtered
# frame, which keeps the cell idempotent and free of chained assignment.
ecommerce_new = ecommerce_new.assign(
    amount_spent=ecommerce_new['quantity'] * ecommerce_new['unit_price']
)

In [None]:
# Put the columns in a fixed order for easier reading.
ordered_columns = [
    'invoice_num',
    'invoice_date',
    'stock_code',
    'description',
    'quantity',
    'unit_price',
    'amount_spent',
    'cust_id',
    'country',
]
ecommerce_new = ecommerce_new.loc[:, ordered_columns]
ecommerce_new.head()

In [None]:
# Derive calendar features from invoice_date and place them at column positions 2-5.
invoice_ts = ecommerce_new['invoice_date']
calendar_columns = {
    # e.g. 201012 for December 2010; vectorised instead of a per-row lambda
    'year_month': (invoice_ts.dt.year * 100 + invoice_ts.dt.month).astype('int64'),
    'month': invoice_ts.dt.month,
    # +1 to make Monday=1 ... Sunday=7
    'day': invoice_ts.dt.dayofweek + 1,
    'hour': invoice_ts.dt.hour,
}

# DataFrame.insert raises ValueError when the column already exists, so drop any
# columns left by a previous run of this cell; this also yields a fresh frame.
ecommerce_new = ecommerce_new.drop(columns=list(calendar_columns), errors='ignore')
for position, (column_name, column_values) in enumerate(calendar_columns.items(), start=2):
    ecommerce_new.insert(loc=position, column=column_name, value=column_values)

ecommerce_new.tail()


In [None]:
# List the column labels of ecommerce_new.
ecommerce_new.columns

In [None]:
# Transaction log: just the customer id and the invoice timestamp of each row.
elog = ecommerce_new.loc[:, ['cust_id', 'invoice_date']]
display(elog.sample(n=5))

#### >  Date range of orders

In [None]:
# Summary of the invoice_date column of the transaction log.
elog.invoice_date.describe()

In [None]:
# Show the last rows of ecommerce_new.
ecommerce_new.tail()

## Creating RFM Matrix based on transaction log
### Splitting calibration and holdout period

In [None]:
%%time
from lifetimes.utils import calibration_and_holdout_data

calibration_period_ends = '2011-9-09 12:50:00'


summary_cal_holdout = calibration_and_holdout_data(elog, 
                                                   customer_id_col = 'cust_id', 
                                                   datetime_col = 'invoice_date', 
                                                   freq = 'D', #days
                                        calibration_period_end=calibration_period_ends,
                                        observation_period_end='2011-12-09 12:50:00' )

## Feature set


In [None]:
# Leave the frame as the cell's last expression so the notebook renders it as a table
# rather than as plain printed text.
summary_cal_holdout.head()

## Training model - MBG/NBD
Model assumptions:

* While active, the number of transactions made by a customer follows a Poisson process with transaction rate  λ .
* Heterogeneity in  λ  across customers follows a Gamma distribution with shape parameter  r  and scale parameter  α .
* At time zero and right after each purchase the customer becomes inactive with a constant probability  p .
* Heterogeneity in  p  across customers follows a Beta distribution with parameters  a  and  b .
* The transaction rate  λ  and the dropout probability  p  vary independently across customers.

In [None]:
%%time 

from lifetimes import ModifiedBetaGeoFitter

mbgnbd = ModifiedBetaGeoFitter(penalizer_coef=0.01)
mbgnbd.fit(summary_cal_holdout['frequency_cal'], 
        summary_cal_holdout['recency_cal'], 
        summary_cal_holdout['T_cal'],
       verbose=True)

In [None]:
# Print the fitted model's string representation.
print(mbgnbd)


### Estimating customer lifetime value using the Gamma-Gamma model
The Gamma-Gamma model and the independence assumption:

The model assumes that there is no relationship between the monetary value and the purchase frequency. In practice, we need to check that the Pearson correlation between the two vectors is close to 0 before using this model.

## Predictions for each customer


In [None]:
t = 90  # days to predict in the future

# Calibration-period inputs, in the positional order the model methods take them.
calibration_inputs = (
    summary_cal_holdout['frequency_cal'],
    summary_cal_holdout['recency_cal'],
    summary_cal_holdout['T_cal'],
)

# Expected number of purchases per customer over the next t days.
summary_cal_holdout['predicted_purchases'] = (
    mbgnbd.conditional_expected_number_of_purchases_up_to_time(t, *calibration_inputs)
)

# Probability that each customer is still alive. The value is kept as the model
# returns it (only rounded): dividing by the column maximum would turn it into a
# relative score that is no longer a probability.
summary_cal_holdout['p_alive'] = np.round(
    mbgnbd.conditional_probability_alive(*calibration_inputs), 2
)

In [None]:
# Show two randomly sampled customers, transposed so that each customer is a column.
display(summary_cal_holdout.sample(2).T)

## Model evaluation
Assessing model fit

In [None]:
%%time 

from lifetimes.plotting import plot_period_transactions
ax = plot_period_transactions(mbgnbd, max_frequency=7)
ax.set_yscale('log')
sns.despine();

In [None]:
%%time 

from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases

plot_calibration_purchases_vs_holdout_purchases(mbgnbd, summary_cal_holdout)
sns.despine();

### Customer Probability History


In [None]:
from datetime import date

from lifetimes.plotting import plot_history_alive
from lifetimes.utils import calculate_alive_path

# Customer shown in this cell: the row at position 400 of the summary frame.
# The row's index label is the customer id (named customer_id, not `id`, so the
# builtin is not shadowed).
individual = summary_cal_holdout.iloc[400]
customer_id = individual.name

# Plot window: two years back to one year ahead of the current date.
# pd.DateOffset clamps 29 February to 28 February, whereas
# date.replace(year=...) raises ValueError on a leap day.
# NOTE(review): the window is anchored to the real "today", not to the last
# date in the data - confirm this is intended for a historical dataset.
today = date.today()
two_year_ago = (pd.Timestamp(today) - pd.DateOffset(years=2)).date()
one_year_from_now = (pd.Timestamp(today) + pd.DateOffset(years=1)).date()

# All transactions of the selected customer and the date of the first one.
sp_trans = elog.loc[elog['cust_id'] == customer_id]
first_purchase = sp_trans.invoice_date.min().date()

# Alive probability at the last step of the path that ends today.
t = (today - first_purchase).days
p_alive_today = pd.DataFrame(calculate_alive_path(mbgnbd, sp_trans, 'invoice_date', t, freq='D'))[0].tail(1).values
p_alive_today = np.round(p_alive_today[0], 2)
print('Probability that customer is alive today is', p_alive_today)

# Alive-probability history, extended to one year from now.
t = (one_year_from_now - first_purchase).days
ax = plot_history_alive(mbgnbd, t, sp_trans, 'invoice_date', start_date=two_year_ago)
ax.vlines(x=today, ymin=0, ymax=1.05, colors='#4C4C4C')  # marker for today
ax.hlines(y=0.8, xmin=two_year_ago, xmax=one_year_from_now, colors='#4C4C4C')  # reference line at 0.8

ax.set_xlim(two_year_ago, one_year_from_now)
ax.set_ylim(0, 1.05)

plt.xticks(rotation=-90)
# Write today's alive probability inside the axes (axes-fraction coordinates).
ax.text(0.75, 0.1, p_alive_today, ha='center', va='center', transform=ax.transAxes)

sns.despine()

### Predicted Transactions with Time


In [None]:
# Relabel the two transaction-log columns positionally.
elog_column_names = ['cust_id', 'invoice_date']
elog.columns = elog_column_names

In [None]:
%%time
# Get expected and actual repeated cumulative transactions.

from lifetimes.utils import expected_cumulative_transactions

t = (elog.invoice_date.max() - elog.invoice_date.min()).days
df = expected_cumulative_transactions(mbgnbd, elog, 'invoice_date', 'cust_id', t)

In [None]:
# Last rows of the frame returned by expected_cumulative_transactions.
df.tail()


In [None]:
%%time
# Calibration period = 2016-09-04 to 2017-09-30
from datetime import datetime

cal = datetime.strptime('2018-06-30', '%Y-%m-%d')

from lifetimes.plotting import plot_cumulative_transactions
t = (elog.invoice_date.max() - elog.invoice_date.min()).days
t_cal = (cal - elog.invoice_date.min()).days
plot_cumulative_transactions(mbgnbd, elog, 'invoice_date', 'cust_id', t, t_cal, freq='D')
sns.despine()

In [None]:
%%time 

from lifetimes.plotting import plot_incremental_transactions
plot_incremental_transactions(mbgnbd, elog, 'invoice_date', 'cust_id', t, t_cal, freq='D')
sns.despine()

Predict the conditional, expected average lifetime value of our customers.
Model performance will increase if it is trained on all the data and not a sample as is the case here...

Cheers..