## **Predicting Next Purchase Day for a Customer**

In [1]:
import warnings
# Silences every warning category for the rest of the session, including any
# pandas warnings raised by later cells.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
import datetime as dt
import os
import time

In [3]:
import lifetimes.plotting as lp
import lifetimes.utils as lu
import lifetimes.fitters as lf

In [4]:
# Plot defaults: dark theme and 12x6 inch figures.
plt.style.use('dark_background')
mpl.rcParams.update({'figure.figsize': (12, 6)})

In [5]:
# Directory holding the raw data. Set the ONLINE_RETAIL_RAW_DIR environment
# variable to point at another location; the original machine-specific path
# is kept as the fallback so existing setups keep working unchanged.
raw_folder = os.environ.get(
    'ONLINE_RETAIL_RAW_DIR',
    '/home/sid/mystuff/myprogs/flirt/projects/product_analytics/customer_segmentation/data/raw',
)
# Full path of the Excel workbook read by the next cell.
datapath = os.path.join(raw_folder, 'online_retail.xlsx')

In [6]:
# Load the raw workbook with the openpyxl engine; InvoiceDate is parsed as
# datetime at read time.
df = pd.read_excel(datapath, parse_dates=['InvoiceDate'], engine='openpyxl')

In [7]:
# Preview the first rows of the raw transactions.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [23]:
# Restrict the analysis to United Kingdom transactions and renumber the rows.
tx_uk = df[df['Country'].eq('United Kingdom')].reset_index(drop=True)

In [24]:
# (rows, columns) of the UK-only subset.
tx_uk.shape

(495478, 8)

In [25]:
# Column dtypes and non-null counts for the UK-only subset.
tx_uk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495478 entries, 0 to 495477
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    495478 non-null  object        
 1   StockCode    495478 non-null  object        
 2   Description  494024 non-null  object        
 3   Quantity     495478 non-null  int64         
 4   InvoiceDate  495478 non-null  datetime64[ns]
 5   UnitPrice    495478 non-null  float64       
 6   CustomerID   361878 non-null  float64       
 7   Country      495478 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 30.2+ MB


In [26]:
# Disabled: filter that would drop rows without a CustomerID. While it stays
# commented out, rows with a missing CustomerID remain in tx_uk.
#tx_uk = tx_uk[~tx_uk['CustomerID'].isnull()]
#tx_uk.shape

In [27]:
# Feature window: invoices dated from 2011-03-01 (inclusive) up to
# 2011-09-01 (exclusive).
tx_6m = tx_uk[
    tx_uk['InvoiceDate'].ge(dt.datetime(2011, 3, 1))
    & tx_uk['InvoiceDate'].lt(dt.datetime(2011, 9, 1))
].reset_index(drop=True)

In [28]:
# Random preview of the feature window; the fixed seed makes the displayed
# rows identical on every run.
tx_6m.sample(10, random_state=42)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
194324,564850,85099C,JUMBO BAG BAROQUE BLACK WHITE,40,2011-08-30 16:43:00,2.08,17298.0,United Kingdom
11407,546417,22745,POPPY'S PLAYHOUSE BEDROOM,5,2011-03-13 10:48:00,2.1,14800.0,United Kingdom
62737,551696,23207,LUNCH BAG ALPHABET DESIGN,10,2011-05-03 13:39:00,1.65,14110.0,United Kingdom
27146,547888,22927,GREEN GIANT GARDEN THERMOMETER,4,2011-03-28 09:59:00,5.95,13368.0,United Kingdom
186650,564221,22513,DOORSTOP FOOTBALL DESIGN,1,2011-08-24 09:36:00,7.46,,United Kingdom
1780,545447,20972,PINK CREAM FELT CRAFT TRINKET BOX,12,2011-03-02 16:34:00,1.25,17426.0,United Kingdom
46088,550006,22980,PANTRY SCRUBBING BRUSH,1,2011-04-14 10:10:00,1.65,17841.0,United Kingdom
32411,548493,22178,VICTORIAN GLASS HANGING T-LIGHT,96,2011-03-31 13:14:00,1.06,16839.0,United Kingdom
183417,563909,23170,REGENCY TEA PLATE ROSES,3,2011-08-21 12:36:00,1.65,15701.0,United Kingdom
54760,550711,22189,CREAM HEART CARD HOLDER,108,2011-04-20 11:06:00,2.31,18102.0,United Kingdom


In [29]:
# Follow-up window: invoices dated from 2011-09-01 (inclusive) up to
# 2011-12-01 (exclusive).
tx_next = tx_uk[
    tx_uk['InvoiceDate'].ge(dt.datetime(2011, 9, 1))
    & tx_uk['InvoiceDate'].lt(dt.datetime(2011, 12, 1))
].reset_index(drop=True)

In [30]:
# One row per distinct CustomerID value seen in the feature window.
tx_user = pd.DataFrame({'CustomerID': tx_6m['CustomerID'].unique()})
tx_user.head()

Unnamed: 0,CustomerID
0,14620.0
1,14740.0
2,13880.0
3,16462.0
4,17068.0


### Feature Engineering

In [34]:
# Take an explicit copy: later cells add columns to this frame, and assigning
# onto a plain column slice of tx_6m triggers pandas' SettingWithCopyWarning.
tx_day_order = tx_6m[['CustomerID', 'InvoiceDate']].copy()

In [35]:
## convert Invoice Datetime to day
## normalize() truncates each timestamp to midnight and keeps the datetime64
## dtype. `.dt.date` would instead give Python date objects in an object
## column, leaving later date subtraction and the `.dt` accessor dependent on
## object-dtype inference.
tx_day_order['InvoiceDay'] = tx_day_order['InvoiceDate'].dt.normalize()

In [36]:
# Order rows by customer, then chronologically, so the per-customer steps in
# later cells see purchases oldest-first.
tx_day_order = tx_day_order.sort_values(['CustomerID', 'InvoiceDate'])

In [37]:
## drop duplicates: keep one row per customer per calendar day -- the first
## one, which is the earliest invoice of that day given the preceding sort
tx_day_order = tx_day_order.drop_duplicates(
    subset=['CustomerID', 'InvoiceDay'], 
    keep='first')

In [39]:
## previous purchase day for each customer (lag 1); missing on a customer's
## first row. Lags 2 and 3 are added in a later cell.
tx_day_order['PrevInvoiceDate'] = tx_day_order.groupby('CustomerID')['InvoiceDay'].shift(1)

In [40]:
# Check the lag-1 column on the first customer's rows.
tx_day_order.head()

Unnamed: 0,CustomerID,InvoiceDate,InvoiceDay,PrevInvoiceDate
649,12747.0,2011-03-01 14:53:00,2011-03-01,
65091,12747.0,2011-05-05 15:31:00,2011-05-05,2011-03-01
90473,12747.0,2011-05-25 09:57:00,2011-05-25,2011-05-05
124699,12747.0,2011-06-28 10:06:00,2011-06-28,2011-05-25
184410,12747.0,2011-08-22 10:38:00,2011-08-22,2011-06-28


In [41]:
# Second- and third-previous purchase day per customer.
for lag_steps, lag_column in ((2, 'T2InvoiceDate'), (3, 'T3InvoiceDate')):
    tx_day_order[lag_column] = (
        tx_day_order.groupby('CustomerID')['InvoiceDay'].shift(lag_steps)
    )

In [42]:
# Display the frame with all three lag columns (pandas elides the middle rows).
tx_day_order

Unnamed: 0,CustomerID,InvoiceDate,InvoiceDay,PrevInvoiceDate,T2InvoiceDate,T3InvoiceDate
649,12747.0,2011-03-01 14:53:00,2011-03-01,,,
65091,12747.0,2011-05-05 15:31:00,2011-05-05,2011-03-01,,
90473,12747.0,2011-05-25 09:57:00,2011-05-25,2011-05-05,2011-03-01,
124699,12747.0,2011-06-28 10:06:00,2011-06-28,2011-05-25,2011-05-05,2011-03-01
184410,12747.0,2011-08-22 10:38:00,2011-08-22,2011-06-28,2011-05-25,2011-05-05
...,...,...,...,...,...,...
186521,,2011-08-24 09:00:00,2011-08-24,,,
188739,,2011-08-25 13:57:00,2011-08-25,,,
189548,,2011-08-26 11:47:00,2011-08-26,,,
191270,,2011-08-30 10:29:00,2011-08-30,,,


##### **Compute the gap in days between each purchase day and the customer's previous one, two, and three purchase days**

In [43]:
# Days elapsed between the current purchase day and each lagged purchase day.
for diff_column, earlier_column in (
    ('DayDiff', 'PrevInvoiceDate'),
    ('DayDiff2', 'T2InvoiceDate'),
    ('DayDiff3', 'T3InvoiceDate'),
):
    elapsed = tx_day_order['InvoiceDay'] - tx_day_order[earlier_column]
    tx_day_order[diff_column] = elapsed.dt.days

In [45]:
# Check the day-difference columns on the first customer's rows.
tx_day_order.head()

Unnamed: 0,CustomerID,InvoiceDate,InvoiceDay,PrevInvoiceDate,T2InvoiceDate,T3InvoiceDate,DayDiff,DayDiff2,DayDiff3
649,12747.0,2011-03-01 14:53:00,2011-03-01,,,,,,
65091,12747.0,2011-05-05 15:31:00,2011-05-05,2011-03-01,,,65.0,,
90473,12747.0,2011-05-25 09:57:00,2011-05-25,2011-05-05,2011-03-01,,20.0,85.0,
124699,12747.0,2011-06-28 10:06:00,2011-06-28,2011-05-25,2011-05-05,2011-03-01,34.0,54.0,119.0
184410,12747.0,2011-08-22 10:38:00,2011-08-22,2011-06-28,2011-05-25,2011-05-05,55.0,89.0,109.0


In [46]:
# Per-customer mean and standard deviation of the gap between consecutive
# purchase days.
tx_day_diff = (
    tx_day_order
    .groupby('CustomerID')['DayDiff']
    .agg(DayDiffMean='mean', DayDiffStd='std')
    .reset_index()
)

In [47]:
# Preview the per-customer statistics.
tx_day_diff.head()

Unnamed: 0,CustomerID,DayDiffMean,DayDiffStd
0,12747.0,43.5,20.305993
1,12748.0,3.723404,3.083632
2,12749.0,25.0,30.099834
3,12821.0,,
4,12823.0,127.0,


In [48]:
## Keep only each customer's last row, i.e. their most recent purchase day
## given the earlier sort. This line does not yet filter by purchase count:
## restricting to customers with > 3 purchases happens in the dropna() cell
## that follows.
tx_day_order_last = tx_day_order.drop_duplicates(subset=['CustomerID'],keep='last')

In [50]:
# Preview the most recent purchase row per customer.
tx_day_order_last.head()

Unnamed: 0,CustomerID,InvoiceDate,InvoiceDay,PrevInvoiceDate,T2InvoiceDate,T3InvoiceDate,DayDiff,DayDiff2,DayDiff3
184410,12747.0,2011-08-22 10:38:00,2011-08-22,2011-06-28,2011-05-25,2011-05-05,55.0,89.0,109.0
192527,12748.0,2011-08-30 12:00:00,2011-08-30,2011-08-25,2011-08-24,2011-08-17,5.0,6.0,13.0
181411,12749.0,2011-08-18 06:19:00,2011-08-18,2011-08-11,2011-08-01,2011-05-23,7.0,17.0,87.0
70339,12821.0,2011-05-09 15:51:00,2011-05-09,,,,,,
169417,12823.0,2011-08-04 17:45:00,2011-08-04,2011-03-30,,,127.0,,


In [51]:
# Drop rows with any missing value. This removes customers whose last row has
# no third-previous purchase day (fewer than four distinct purchase days) and
# any row with a missing CustomerID.
tx_day_order_last = tx_day_order_last.dropna()

In [52]:
# Attach the per-customer mean/std of purchase gaps to each remaining customer.
tx_day_order_last = tx_day_order_last.merge(tx_day_diff, on='CustomerID')

In [53]:
# Add the purchase-gap features to the customer table; the default inner join
# keeps only customers that survived the filtering above.
# NOTE(review): DayDiff3 is computed earlier but not selected here -- confirm
# that leaving it out of the feature set is intentional.
day_diff_features = ['CustomerID', 'DayDiff', 'DayDiff2', 'DayDiffMean', 'DayDiffStd']
tx_user = tx_user.merge(tx_day_order_last[day_diff_features], on='CustomerID')

In [54]:
# Random preview of the customer feature table; the fixed seed makes the
# displayed rows identical on every run.
tx_user.sample(5, random_state=42)

Unnamed: 0,CustomerID,DayDiff,DayDiff2,DayDiffMean,DayDiffStd
461,16837.0,35.0,40.0,12.571429,10.612212
224,17611.0,6.0,38.0,25.666667,13.033291
484,14289.0,42.0,65.0,35.666667,10.969655
617,13870.0,33.0,63.0,22.333333,15.947832
558,14145.0,23.0,25.0,28.0,28.827071


In [55]:
## tx_class: an independent copy of tx_user passed through pd.get_dummies,
## which one-hot encodes any categorical columns
tx_class = tx_user.copy().pipe(pd.get_dummies)

In [56]:
# Random preview of the modelling table; the fixed seed makes the displayed
# rows identical on every run.
tx_class.sample(5, random_state=42)

Unnamed: 0,CustomerID,DayDiff,DayDiff2,DayDiffMean,DayDiffStd
278,13969.0,20.0,34.0,22.857143,9.702724
575,18221.0,64.0,69.0,32.0,29.816103
223,14502.0,85.0,86.0,40.666667,42.193996
286,14194.0,1.0,8.0,13.333333,14.367472
404,17677.0,27.0,34.0,12.833333,11.784684
