In [1]:
import numpy as np
import pandas as pd
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
import xgboost as xgb

from sklearn.metrics import precision_score, recall_score, f1_score, auc, roc_auc_score, accuracy_score, classification_report, confusion_matrix, roc_curve
from xgboost import plot_importance



In [2]:
df_response = pd.read_csv('Retail_Data_Response.csv')
df_transactions = pd.read_csv('Retail_Data_Transactions.csv', parse_dates=['trans_date'])

In [3]:
## since the last date of the data is 16 March 2015, the campaign date is assumed to be 17 March 2015
## RFM model will be used to predict campaign response. Recency is calculated

campaign_date = dt.datetime(2015,3,17)
df_transactions['recent']= campaign_date - df_transactions['trans_date']
df_transactions['recent'].astype('timedelta64[D]')
df_transactions['recent']=df_transactions['recent'] / np.timedelta64(1, 'D')
df_transactions.head()

Unnamed: 0,customer_id,trans_date,tran_amount,recent
0,CS5295,2013-02-11,35,764.0
1,CS4768,2015-03-15,39,2.0
2,CS2122,2013-02-26,52,749.0
3,CS1217,2011-11-16,99,1217.0
4,CS1850,2013-11-20,78,482.0


In [58]:
df_clv = df_transactions.groupby('customer_id').agg({'recent': lambda x:x.min(),                      # Recency
                                                     'customer_id': lambda x: len(x),                 # Frequency
                                                     'tran_amount': lambda x: x.sum(),                # Monetary Value           
                                                     'trans_date': lambda x: (x.max() - x.min()).days}) #AOU

df_clv.rename(columns={'recent': 'recency', 
                       'customer_id': 'frequency', 
                       'tran_amount': 'monetary_value',
                       'trans_date' : 'AOU'}, inplace=True)

df_clv['ticket_size'] = df_clv['monetary_value'] / df_clv['frequency']
df_clv['amount_sd'] = df_transactions.groupby('customer_id').agg({'tran_amount': lambda x:np.std(x)})
df_clv['f1'] = df_transactions.groupby('customer_id').agg({'tran_amount': lambda x:np.product(x)})
df_clv['f2'] = df_transactions.groupby('customer_id').agg({'tran_amount': lambda x:np.median(x)})
df_clv['f3'] = df_transactions.groupby('customer_id').agg({'tran_amount': lambda x:np.percentile(x,75)})
df_clv['f4'] = df_transactions.groupby('customer_id').agg({'tran_amount': lambda x:np.percentile(x,25)})
df_clv['f5'] = df_transactions.groupby('customer_id').agg({'tran_amount': lambda x:np.percentile(x,10)})
df_clv['f6'] = df_transactions.groupby('customer_id').agg({'tran_amount': lambda x:np.percentile(x,90)})

df_clv['f7'] = df_transactions.groupby('customer_id').agg({'recent': lambda x:np.product(x)})
df_clv['f8'] = df_transactions.groupby('customer_id').agg({'recent': lambda x:np.median(x)})
df_clv['f9'] = df_transactions.groupby('customer_id').agg({'recent': lambda x:np.percentile(x,75)})
df_clv['f10'] = df_transactions.groupby('customer_id').agg({'recent': lambda x:np.percentile(x,25)})
df_clv['f11'] = df_transactions.groupby('customer_id').agg({'recent': lambda x:np.percentile(x,10)})
df_clv['f12'] = df_transactions.groupby('customer_id').agg({'recent': lambda x:np.percentile(x,90)})
df_clv['f13'] = df_transactions.groupby('customer_id').agg({'recent': lambda x:np.std(x)})

df_clv = df_clv.reset_index()
df_clv.head(3)

Unnamed: 0,customer_id,recency,frequency,monetary_value,AOU,ticket_size,amount_sd,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13
0,CS1112,62.0,15,1012,1309,67.466667,19.095782,534751083403149312,63.0,78.5,56.0,44.2,93.6,2.0407499999999998e+41,822.0,1034.5,405.5,159.4,1288.4,418.268781
1,CS1113,36.0,20,1490,1354,74.5,20.715936,-8074518044941680640,79.5,94.25,56.0,43.6,97.0,1.3829860000000001e+53,666.5,898.25,322.5,59.4,1250.0,407.274781
2,CS1114,33.0,19,1432,1309,75.368421,20.772478,-6178413198683725824,79.0,94.5,55.5,45.4,97.0,3.4059310000000003e+50,552.0,1156.5,302.0,91.2,1302.4,459.063288


In [59]:
df_modeling_clv = pd.merge(df_response,df_clv)
df_modeling_clv.head()

X_clv = df_modeling_clv.drop(columns=['response','customer_id'])
y_clv = df_modeling_clv['response']

sm = SMOTE(random_state=0)
X_train_clv, X_test_clv, y_train_clv, y_test_clv = train_test_split(X_clv, y_clv, test_size=0.3, random_state=0)
sm.fit(X_train_clv, y_train_clv)
X_SMOTE_clv, y_SMOTE_clv = sm.fit_sample(X_train_clv, y_train_clv)

print('XGBoost model - SMOTE CLV')

xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='auc',
 learning_rate =0.005,
 n_estimators=200,
 max_depth=4,
 gamma=0.0,
 colsample_bytree=0.6)

predicted_y = []
expected_y = []

xgb_model_SMOTE_clv = xgb_model.fit(X_SMOTE_clv, y_SMOTE_clv, early_stopping_rounds=500, eval_set=[(X_test_clv.to_numpy(), y_test_clv)], verbose = False)
predictions =  xgb_model_SMOTE_clv.predict(X_SMOTE_clv)
predicted_y.extend(predictions)
expected_y.extend(y_SMOTE_clv)
report_train = classification_report(expected_y, predicted_y)
print('training set')
print(report_train) 

predicted_y = []
expected_y = []
predictions = xgb_model_SMOTE_clv.predict(X_test_clv.to_numpy())
predicted_y.extend(predictions)
expected_y.extend(y_test_clv)
report_test = classification_report(expected_y, predicted_y)
print('test set')
print(report_test)

y_score_train = xgb_model_SMOTE_clv.predict_proba(X_SMOTE_clv)
auc_train = roc_auc_score(y_SMOTE_clv, y_score_train[:,1])

y_score_test = xgb_model_SMOTE_clv.predict_proba(X_test_clv.to_numpy())
auc_test = roc_auc_score(y_test_clv, y_score_test[:,1])

print('auc_train: ',auc_train)
print('auc_test: ',auc_test)



XGBoost model - SMOTE CLV
training set
              precision    recall  f1-score   support

           0       0.86      0.72      0.78      4389
           1       0.76      0.88      0.82      4389

    accuracy                           0.80      8778
   macro avg       0.81      0.80      0.80      8778
weighted avg       0.81      0.80      0.80      8778

test set
              precision    recall  f1-score   support

           0       0.95      0.73      0.82      1848
           1       0.22      0.66      0.33       218

    accuracy                           0.72      2066
   macro avg       0.58      0.69      0.58      2066
weighted avg       0.87      0.72      0.77      2066

auc_train:  0.8774020585547009
auc_test:  0.7679862186742921
