In [1]:
import pandas as pd
import datetime as dt
import numpy as np

from lifetimes import BetaGeoFitter
from lifetimes import BetaGeoBetaBinomFitter
from lifetimes import ParetoNBDFitter

In [2]:
def get_parent_directory():
    list = sys.path[0].split('\\')[:-1]
    return_str = ''
    for element in list:
        return_str += element + '/'
    return return_str.rstrip('/')

In [2]:
obj = pd.read_pickle(get_parent_directory()+'/data/transaction_and_features.pkl')
cluster = pd.read_csv(get_parent_directory()+'/data/kmeans_clusters.csv')

obj['purch_date'] = pd.to_datetime(obj['purch_date']).dt.date
obj = obj[obj.is_purchase==1]

# obj = obj.merge(cluster, how='left', on='id')

In [3]:
test_months = 6
max_date = obj['purch_date'].max()
train = obj[obj['purch_date'] < max_date - pd.Timedelta(weeks=test_months * 4)]
test = obj[obj['purch_date'] >= max_date - pd.Timedelta(weeks=test_months * 4)]

## Prepare train set

In [6]:
train_agg = train.groupby(['id']).agg({'purch_date':['nunique', 'min', 'max']}).reset_index()
train_agg = train_agg.droplevel(0, axis=1) 
train_agg.columns = ['id', 'frequency', 'first_purch', 'last_purch']
train_agg['recency'] = (train_agg['last_purch'] - train_agg['first_purch']).dt.days
train_agg['T'] = (dt.date(2021, 7, 15) - train_agg['first_purch']).dt.days
train_agg = train_agg[['id', 'frequency', 'recency', 'T']]

train_agg = train_agg.merge(cluster, how='left', on='id')

In [7]:
train_agg.head()

Unnamed: 0,id,frequency,recency,T,label_kmeans_10
0,---XA7L5SsGM0hs7WKhOag,1,0,246,1
1,--16sRpjRAm2ByER7Vr7dw,1,0,95,7
2,--8eSKd-Tjq2_XwPxuSZoA,2,223,440,3
3,--AArGC6TB6ehz1u7KpFcA,17,519,919,3
4,--GwShaESDOI3t_wT4mmDQ,1,0,412,2


## Prepare test set

In [8]:
test_ext = pd.concat([test, train[train.id.isin(test.id.unique().tolist())]])

min_date_test = test_ext.groupby(['id']).purch_date.min().reset_index()
min_date_test.columns = ['id',   'min_date']
purch_before = test_ext[test_ext.purch_date<=dt.date(2021,7,15)].groupby(['id']).purch_date.nunique().reset_index()
test_ext = test_ext[['id',  'purch_date']].drop_duplicates()
test_ext.sort_values(by=['id',   'purch_date'], inplace=True) 
test_ext['prev_purch_date'] = test_ext.groupby(['id']).purch_date.shift()
test_ext = test_ext[test_ext.purch_date>dt.date(2021,7,15)]

test_ext = test_ext.merge(min_date_test, how='left', on=['id'])
test_ext = test_ext.merge(purch_before, how='left', on=['id'])
test_ext.columns = ['id','purch_date', 'prev_purch_date', 'min_date', 'purchases_in_train']
test_ext['purch'] = 1 
test_ext['cum_purch'] = test_ext.groupby(['id']).purch.cumsum()

test_ext['purchases_in_train'].fillna(0, inplace=True) 
test_ext['frequency'] = test_ext['purchases_in_train'] + test_ext['cum_purch']
test_ext['T'] = (test_ext.purch_date-test_ext.min_date).dt.days

test_ext['prev_purch_date'] = np.where(test_ext.prev_purch_date.isna(), test_ext.purch_date, test_ext.prev_purch_date)
test_ext['recency'] = (test_ext.purch_date-test_ext.prev_purch_date).dt.days
test_ext['next_purch_date'] = test_ext.groupby('id')['purch_date'].shift(-1)

test_ext = test_ext[['id', 'purch_date', 'next_purch_date', 'frequency', 'T', 'recency']]
test_ext = test_ext[~test_ext.next_purch_date.isna()]

In [9]:
test_agg = test_ext.merge(cluster, how='left', on='id')
test_agg.head() 

Unnamed: 0,id,purch_date,next_purch_date,frequency,T,recency,label_kmeans_10
0,--GwShaESDOI3t_wT4mmDQ,2021-11-04,2021-11-07,2.0,524,524,2
1,--MZvNm3Q6mYrohqHxwYsw,2021-09-04,2021-09-22,7.0,432,183,0
2,--MZvNm3Q6mYrohqHxwYsw,2021-09-22,2021-10-03,8.0,450,18,0
3,--MZvNm3Q6mYrohqHxwYsw,2021-10-03,2021-10-06,9.0,461,11,0
4,--MZvNm3Q6mYrohqHxwYsw,2021-10-06,2021-10-07,10.0,464,3,0


## Create model

In [None]:
# pnf = ParetoNBDFitter()
# pnf.fit(train_agg['frequency'], train_agg['recency'], train_agg['T'])

In [10]:
train_agg.groupby('label_kmeans_10').id.nunique().reset_index()

Unnamed: 0,label_kmeans_10,id
0,0,5494
1,1,15417
2,2,14102
3,3,16210
4,4,952
5,5,3310
6,6,47
7,7,15380
8,8,17883
9,9,17


In [21]:
pred_features = []

for i in np.arange(1, 211):
    pred_features.append('pred_purch_d'+'{}'.format(i))

### Cluster 0

In [22]:
train_cluster_data = train_agg[train_agg.label_kmeans_10==0]

bgf_cluser_0 = BetaGeoFitter(penalizer_coef=0.001)
bgf_cluser_0.fit(train_cluster_data['frequency'], train_cluster_data['recency'], train_cluster_data['T'])

test_cluster_0 = test_agg[test_agg.label_kmeans_10==0]

for i in np.arange(1, 211):
    test_cluster_0['pred_purch_d{}'.format(i)] = np.round(bgf_cluser_0.conditional_expected_number_of_purchases_up_to_time(i, 
                                                                                                                           test_cluster_0['frequency'].values, 
                                                                                                                           test_cluster_0['recency'].values, 
                                                                                                                           test_cluster_0['T'].values))

  if sys.path[0] == "":


### Cluster 1
    

In [23]:
train_cluster_data = train_agg[train_agg.label_kmeans_10==1]


bgf_cluser_1 = ParetoNBDFitter(penalizer_coef=0.001)
bgf_cluser_1.fit(train_cluster_data['frequency'], train_cluster_data['recency'], train_cluster_data['T'])

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  A_2 = logaddexp(-(r + x) * log(alpha + T) - s * log(beta + T), log(s) + log_A_0 - log(r_s_x))


<lifetimes.ParetoNBDFitter: fitted with 15417 subjects, alpha: 0.00, beta: 0.00, r: 0.89, s: 4.84>

In [24]:
test_cluster_1 = test_agg[test_agg.label_kmeans_10==1]

for i in np.arange(1, 211):
    test_cluster_1['pred_purch_d{}'.format(i)] = np.round(bgf_cluser_1.conditional_expected_number_of_purchases_up_to_time(i, 
                                                                                                                           test_cluster_1['frequency'].values, 
                                                                                                                           test_cluster_1['recency'].values, 
                                                                                                                           test_cluster_1['T'].values))

  import sys


### Cluster 2    

In [25]:
train_cluster_data = train_agg[train_agg.label_kmeans_10==2]

bgf_cluser_2 = BetaGeoFitter(penalizer_coef=0.001)
bgf_cluser_2.fit(train_cluster_data['frequency'], train_cluster_data['recency'], train_cluster_data['T'])

test_cluster_2 = test_agg[test_agg.label_kmeans_10==2]

for i in np.arange(1, 211):
    test_cluster_2['pred_purch_d{}'.format(i)] = np.round(bgf_cluser_2.conditional_expected_number_of_purchases_up_to_time(i, 
                                                                                                                           test_cluster_2['frequency'].values, 
                                                                                                                           test_cluster_2['recency'].values, 
                                                                                                                           test_cluster_2['T'].values))

  if sys.path[0] == "":


### Cluster 3

In [26]:
train_cluster_data = train_agg[train_agg.label_kmeans_10==3]

bgf_cluser_3 = BetaGeoFitter(penalizer_coef=0.001)
bgf_cluser_3.fit(train_cluster_data['frequency'], train_cluster_data['recency'], train_cluster_data['T'])

test_cluster_3 = test_agg[test_agg.label_kmeans_10==3]

for i in np.arange(1, 211):
    test_cluster_3['pred_purch_d{}'.format(i)] = np.round(bgf_cluser_3.conditional_expected_number_of_purchases_up_to_time(i, 
                                                                                                                           test_cluster_3['frequency'].values, 
                                                                                                                           test_cluster_3['recency'].values, 
                                                                                                                           test_cluster_3['T'].values))

  if sys.path[0] == "":


### Cluster 4,5,6,9

In [27]:
train_cluster_data = train_agg[train_agg.label_kmeans_10.isin([4,5,6,9])]

bgf_cluser_4 = ParetoNBDFitter(penalizer_coef=0.001)
bgf_cluser_4.fit(train_cluster_data['frequency'], train_cluster_data['recency'], train_cluster_data['T'])

test_cluster_4 = test_agg[test_agg.label_kmeans_10.isin([4,5,6,9])]

for i in np.arange(1, 211):
    test_cluster_4['pred_purch_d{}'.format(i)] = np.round(bgf_cluser_4.conditional_expected_number_of_purchases_up_to_time(i, 
                                                                                                                           test_cluster_4['frequency'].values, 
                                                                                                                           test_cluster_4['recency'].values, 
                                                                                                                           test_cluster_4['T'].values))

  if sys.path[0] == "":


### Cluster 7

In [28]:
train_cluster_data = train_agg[train_agg.label_kmeans_10==7]

bgf_cluser_7 = ParetoNBDFitter(penalizer_coef=0.001)
bgf_cluser_7.fit(train_cluster_data['frequency'], train_cluster_data['recency'], train_cluster_data['T'])

test_cluster_7 = test_agg[test_agg.label_kmeans_10==7]

for i in np.arange(1, 211):
    test_cluster_7['pred_purch_d{}'.format(i)] = np.round(bgf_cluser_7.conditional_expected_number_of_purchases_up_to_time(i, 
                                                                                                                           test_cluster_7['frequency'].values, 
                                                                                                                           test_cluster_7['recency'].values, 
                                                                                                                           test_cluster_7['T'].values))

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  A_2 = logaddexp(-(r + x) * log(alpha + T) - s * log(beta + T), log(s) + log_A_0 - log(r_s_x))
  if sys.path[0] == "":


### Cluster 8

In [29]:
train_cluster_data = train_agg[train_agg.label_kmeans_10==8]

bgf_cluser_8 = BetaGeoFitter(penalizer_coef=0.001)
bgf_cluser_8.fit(train_cluster_data['frequency'], train_cluster_data['recency'], train_cluster_data['T'])

test_cluster_8 = test_agg[test_agg.label_kmeans_10==8]

for i in np.arange(1, 211):
    test_cluster_8['pred_purch_d{}'.format(i)] = np.round(bgf_cluser_8.conditional_expected_number_of_purchases_up_to_time(i, 
                                                                                                                           test_cluster_8['frequency'].values, 
                                                                                                                           test_cluster_8['recency'].values, 
                                                                                                                           test_cluster_8['T'].values))

  if sys.path[0] == "":


## Evaluate error

In [30]:
test_cluster_data = pd.concat([test_cluster_0,test_cluster_1,test_cluster_2,test_cluster_3, test_cluster_4, test_cluster_7, test_cluster_8])

In [31]:
test_cluster_data_long = pd.melt(test_cluster_data, id_vars=['id', 'purch_date', 'next_purch_date'], value_vars=pred_features)
test_cluster_data_long['day'] = test_cluster_data_long.variable.str.replace('\D+', '')
test_cluster_data_long = test_cluster_data_long[test_cluster_data_long['value']==1].groupby(['id', 'purch_date', 'next_purch_date']).day.min().reset_index()


test_prediction = test_cluster_data[['id', 'purch_date', 'next_purch_date']].merge(test_cluster_data_long[['id', 'purch_date','day']], how='left', on=['id', 'purch_date'])
test_prediction = test_prediction[~test_prediction.next_purch_date.isna()]
test_prediction['day'] = np.where(test_prediction.day.isna(), 210, test_prediction.day)
test_prediction['next_date_pred'] = test_prediction['purch_date'] + pd.to_timedelta(pd.np.ceil(test_prediction['day'].astype(int)), unit="D") 
test_prediction['difr'] = abs((test_prediction['next_purch_date'] - test_prediction['next_date_pred']).dt.days)



In [32]:
test_prediction.difr.mean()

169.61778244367278