In [426]:
import os
from pathlib import Path

import numpy as np

import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import classification_report,f1_score, PrecisionRecallDisplay, precision_score, recall_score, roc_auc_score, RocCurveDisplay, roc_curve, confusion_matrix
from sklearn.metrics import average_precision_score

from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler

Scenario: EcomX Retailers is a mid-sized online retail company specializing in electronics, clothing, and home goods. The company has been growing steadily over the past few years and wants to optimize its marketing and customer retention strategies. One key aspect of this strategy is to understand the Customer Lifetime Value (CLV) of its customers. By predicting the CLV, EcomX aims to identify high-value customers and tailor personalized marketing efforts to retain them, while also identifying low-value customers to manage resources effectively.

Objective: The objective of this assignment is to build and evaluate a machine learning model to predict the Customer Lifetime Value (CLV) of EcomX’s customers. You will need to identify and apply various data cleaning and preparation techniques, as well as select an appropriate model and evaluation criteria.

In [427]:
# Directory holding the four CSV exports. Set the ECOMX_DATA_DIR environment
# variable to run the notebook on another machine; the default is the original
# hard-coded location, so existing setups keep working unchanged.
DATA_DIR = Path(os.environ.get('ECOMX_DATA_DIR', 'C:/Users/User/Desktop'))

customer = pd.read_csv(DATA_DIR / 'customers_final.csv')
engagement = pd.read_csv(DATA_DIR / 'engagements_final.csv')
marketing = pd.read_csv(DATA_DIR / 'marketing_final.csv')
transactions = pd.read_csv(DATA_DIR / 'transactions_final.csv')

Data Cleaning (Missing Values)

In [428]:
customer.head()

Unnamed: 0,customer_id,join_date,last_purchase_date,age,gender,location
0,1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury
1,2,2021-09-08,2023-10-25,,Male,Hillville
2,3,2021-06-01,2022-11-27,,,North Latoyatown
3,4,2022-01-01,2022-09-01,29.0,Male,Grossstad
4,5,2022-01-24,2023-06-02,,Male,East Matthewfort


In [429]:
customer

Unnamed: 0,customer_id,join_date,last_purchase_date,age,gender,location
0,1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury
1,2,2021-09-08,2023-10-25,,Male,Hillville
2,3,2021-06-01,2022-11-27,,,North Latoyatown
3,4,2022-01-01,2022-09-01,29.0,Male,Grossstad
4,5,2022-01-24,2023-06-02,,Male,East Matthewfort
...,...,...,...,...,...,...
9995,9996,2022-12-16,2023-08-13,42.0,Female,Johnstonborough
9996,9997,2022-07-09,2023-01-25,26.0,Male,Jessicamouth
9997,9998,2023-09-17,2024-01-30,39.0,Male,New John
9998,9999,2022-05-10,2022-07-15,31.0,Female,Andrewland


In [430]:
customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   customer_id         10000 non-null  int64  
 1   join_date           10000 non-null  object 
 2   last_purchase_date  10000 non-null  object 
 3   age                 8991 non-null   float64
 4   gender              9467 non-null   object 
 5   location            10000 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 468.9+ KB


In [431]:
customer.describe()

Unnamed: 0,customer_id,age
count,10000.0,8991.0
mean,5000.5,43.467467
std,2886.89568,15.09438
min,1.0,18.0
25%,2500.75,30.0
50%,5000.5,44.0
75%,7500.25,57.0
max,10000.0,69.0


In [432]:
#fill out missing value to customer's age column with the age mean
customer['age']= customer['age'].fillna(customer['age'].mean())

In [433]:
#change the data type from float to int
customer['age']= customer['age'].astype(int)

In [434]:
#check if missing values are gone
customer.head()

Unnamed: 0,customer_id,join_date,last_purchase_date,age,gender,location
0,1,2023-11-20,2024-03-17,56,Female,North Shannonbury
1,2,2021-09-08,2023-10-25,43,Male,Hillville
2,3,2021-06-01,2022-11-27,43,,North Latoyatown
3,4,2022-01-01,2022-09-01,29,Male,Grossstad
4,5,2022-01-24,2023-06-02,43,Male,East Matthewfort


In [435]:
#fill out missing data under gender column
#N/A is defined as Other
def gender_type(x):
    """Normalise a raw gender value to 'Female', 'Male' or 'Other'.

    Any value that is not exactly 'Female' or 'Male' (including missing
    values such as NaN) is mapped to 'Other'.
    """
    for known_label in ('Female', 'Male'):
        if x == known_label:
            return known_label
    return 'Other'

customer['gender'] = customer['gender'].apply(gender_type)

In [436]:
#check if all missing data are gone
customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id         10000 non-null  int64 
 1   join_date           10000 non-null  object
 2   last_purchase_date  10000 non-null  object
 3   age                 10000 non-null  int32 
 4   gender              10000 non-null  object
 5   location            10000 non-null  object
dtypes: int32(1), int64(1), object(4)
memory usage: 429.8+ KB


In [437]:
engagement.head()

Unnamed: 0,customer_id,number_of_site_visits,number_of_emails_opened,number_of_clicks
0,1,10,15,1
1,2,285,49,51
2,3,192,73,25
3,4,110,30,17
4,5,161,2,7


In [438]:
engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   customer_id              10000 non-null  int64
 1   number_of_site_visits    10000 non-null  int64
 2   number_of_emails_opened  10000 non-null  int64
 3   number_of_clicks         10000 non-null  int64
dtypes: int64(4)
memory usage: 312.6 KB


In [439]:
engagement.describe()

Unnamed: 0,customer_id,number_of_site_visits,number_of_emails_opened,number_of_clicks
count,10000.0,10000.0,10000.0,10000.0
mean,5000.5,100.1119,39.8621,19.7253
std,2886.89568,118.625052,46.511719,22.975083
min,1.0,1.0,1.0,1.0
25%,2500.75,15.0,6.0,3.0
50%,5000.5,53.0,22.0,11.0
75%,7500.25,141.0,57.0,28.0
max,10000.0,711.0,303.0,142.0


In [440]:
# Investigate whether the maximum number_of_site_visits (711 in describe() above) is an outlier
engagement[engagement['number_of_site_visits'] == 711]

Unnamed: 0,customer_id,number_of_site_visits,number_of_emails_opened,number_of_clicks
4052,4053,711,17,1


In [441]:
marketing.head()

Unnamed: 0,campaign_id,customer_id,response,promotion_type,campaign_date
0,1,1,No,Buy One Get One,2024-02-29
1,2,1,No,Discount,2024-01-24
2,3,1,No,Free Shipping,2024-03-05
3,4,1,Yes,Buy One Get One,2024-01-10
4,5,2,Yes,Free Shipping,2022-07-08


In [442]:
marketing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25200 entries, 0 to 25199
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   campaign_id     25200 non-null  int64 
 1   customer_id     25200 non-null  int64 
 2   response        25200 non-null  object
 3   promotion_type  25200 non-null  object
 4   campaign_date   25200 non-null  object
dtypes: int64(2), object(3)
memory usage: 984.5+ KB


In [443]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129545 entries, 0 to 129544
Data columns (total 5 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   transaction_id      129545 non-null  int64  
 1   customer_id         129545 non-null  int64  
 2   transaction_date    129545 non-null  object 
 3   transaction_amount  129545 non-null  float64
 4   product_category    129545 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 4.9+ MB


In [None]:
# Parse every date column (read from CSV as strings) into datetime64 so that
# date arithmetic is possible further down.
for frame, date_column in (
    (customer, 'join_date'),
    (customer, 'last_purchase_date'),
    (transactions, 'transaction_date'),
    (marketing, 'campaign_date'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

Join Tables

In [444]:
# Step 1: aggregate marketing data to customer level.
# Only rows with response == 'Yes' are kept, so the resulting 'campaign_id' column holds the number of campaigns each customer answered 'Yes' to; customers without any 'Yes' response are absent from this frame.

marketing_agg_yes = marketing[marketing['response']=='Yes'].groupby('customer_id')['campaign_id'].count().to_frame()

In [445]:
# step 2: aggregate transaction data at a customer level
transactions_agg = transactions.groupby('customer_id').aggregate({'transaction_id':'count','transaction_amount':'sum'})

In [446]:
# Step 3: key the customer and engagement tables by customer_id so they can be
# joined on their index.
customer = customer.set_index('customer_id')
engagement = engagement.set_index('customer_id')

In [450]:
# Step 4: join all customer-level tables on the customer_id index.
joint_data = customer.join(engagement).join(transactions_agg).join(marketing_agg_yes)

# Every later cell refers to this table as `joint_data_yes`, a name that was
# never assigned, so the notebook failed with NameError on a fresh kernel.
# Bind it here; the copy keeps `joint_data` as the untouched join result while
# `joint_data_yes` is modified by the cleaning / feature-engineering cells.
joint_data_yes = joint_data.copy()

In [451]:
# Customer-level dataset. 'campaign_id' is NaN for customers that have no row in marketing_agg_yes, i.e. no 'Yes' response in the marketing table.
joint_data

Unnamed: 0_level_0,join_date,last_purchase_date,age,gender,location,number_of_site_visits,number_of_emails_opened,number_of_clicks,transaction_id,transaction_amount,campaign_id
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,2023-11-20,2024-03-17,56,Female,North Shannonbury,10,15,1,6,3509.48,1.0
2,2021-09-08,2023-10-25,43,Male,Hillville,285,49,51,9,6081.32,2.0
3,2021-06-01,2022-11-27,43,Other,North Latoyatown,192,73,25,6,1454.87,1.0
4,2022-01-01,2022-09-01,29,Male,Grossstad,110,30,17,20,7874.68,1.0
5,2022-01-24,2023-06-02,43,Male,East Matthewfort,161,2,7,24,15524.55,
...,...,...,...,...,...,...,...,...,...,...,...
9996,2022-12-16,2023-08-13,42,Female,Johnstonborough,119,47,16,10,5498.20,
9997,2022-07-09,2023-01-25,26,Male,Jessicamouth,3,33,14,12,5848.30,1.0
9998,2023-09-17,2024-01-30,39,Male,New John,53,17,5,3,3503.13,1.0
9999,2022-05-10,2022-07-15,31,Female,Andrewland,23,5,4,12,6721.86,


In [None]:
joint_data_yes.info()

In [None]:
# Fill missing campaign_id with 0. The column is the per-customer count of 'Yes' responses, so a missing value means no 'Yes' response was recorded
# (assumes joint_data_yes is the joined customer-level table built above -- TODO confirm)
joint_data_yes['campaign_id']= joint_data_yes['campaign_id'].fillna(0)
joint_data_yes

In [None]:
joint_data_yes.info()

Feature engineering

In [None]:
sns.histplot(joint_data_yes['transaction_amount'])

In [None]:
sns.histplot(joint_data_yes['campaign_id'])

In [None]:
# Create 'avg_purchase_per_transaction': total transaction_amount divided by the number of transactions (the transaction_id count)
joint_data_yes['avg_purchase_per_transaction'] = joint_data_yes['transaction_amount']/joint_data_yes['transaction_id']
joint_data_yes

In [None]:
# Create 'customer_lifespan' in YEARS: days between join_date and last_purchase_date divided by 365, rounded to 2 decimals
joint_data_yes['customer_lifespan'] = ((joint_data_yes['last_purchase_date'] - joint_data_yes['join_date']).dt.days / 365).round(2)

In [None]:
sns.histplot(joint_data_yes['customer_lifespan'])

In [None]:
from sklearn.preprocessing import StandardScaler,LabelEncoder

# Replace each categorical column with integer codes.
# The single encoder is re-fitted per column, so after this cell `le` holds the
# mapping of the last column ('location') only.
# NOTE(review): integer codes impose an arbitrary order on 'location' -- confirm
# this is acceptable for the distance-based / linear models used later.
le = LabelEncoder()
for categorical_column in ('gender', 'location'):
    joint_data_yes[categorical_column] = le.fit_transform(joint_data_yes[categorical_column])
joint_data_yes

In [None]:
#check if there is zero value in customer_lifespan
joint_data_yes[joint_data_yes['customer_lifespan'] == 0]

In [None]:
# Drop customers whose lifespan is 0 (CLV divides by lifespan below).
# .copy() makes df1 an independent frame: new columns are assigned to it later,
# which on a bare boolean-mask slice triggers SettingWithCopyWarning.
df1 = joint_data_yes[joint_data_yes['customer_lifespan'] != 0].copy()

In [None]:
df1

In [None]:
df1.describe()

In [None]:
#Customer Lifetime Value = (transaction sum / Customer Lifespan)
#creating lifetime value per year per customer
df1['customer_lifetime_value'] = (df1['transaction_amount']/df1['customer_lifespan']).round(2)

In [None]:
df1

Creating RFM (Recency, Frequency, Monetary) scores to classify customer value

In [None]:
import datetime as dt

# Make sure transaction_date is a datetime column
transactions['transaction_date'] = pd.to_datetime(transactions['transaction_date'])

# Reference point for recency: one day after the most recent transaction
latest_transaction_date = transactions['transaction_date'].max()
snapshot_date = latest_transaction_date + pd.DateOffset(1)

# Recency: whole days between each customer's last purchase and the snapshot date
rfm_recency = customer['last_purchase_date'].reset_index()
time_since_last_purchase = snapshot_date - rfm_recency['last_purchase_date']
rfm_recency['recency'] = time_since_last_purchase.dt.days
rfm_recency

In [None]:
# Frequency: Number of transactions
rfm_frequency = transactions.groupby('customer_id')['transaction_id'].count().reset_index()
rfm_frequency.columns = ['customer_id', 'frequency']

In [None]:
# Preview the frequency table built in the previous cell.
# (This cell used to display `rfm_monetary`, which is only created in the next
# cell and therefore raised NameError when the notebook was run top to bottom.)
rfm_frequency.head()

In [None]:
# Monetary: Total amount spent
rfm_monetary = transactions.groupby('customer_id')['transaction_amount'].sum().reset_index()
rfm_monetary.columns = ['customer_id', 'monetary']

# Merge RFM metrics into a single DataFrame
rfm = rfm_recency.merge(rfm_frequency, on='customer_id').merge(rfm_monetary, on='customer_id')

rfm.head()


In [None]:
# Define quantile thresholds for Recency, Frequency, and Monetary
quantiles = rfm[['recency', 'frequency', 'monetary']].quantile([0.25, 0.75]).to_dict()

# Define a function to assign scores based on quantiles
def rfm_score(x,a,b):
    """Score a value 1, 2 or 3 by comparing it with two quantile cut-offs.

    x: the value to score.
    a: nested mapping {metric_name: {0.25: lower_cutoff, 0.75: upper_cutoff}}.
    b: the metric name used to look up the cut-offs in ``a``.

    Returns 1 when x is at or below the 0.25 cut-off, 2 when it is at or below
    the 0.75 cut-off, and 3 otherwise.
    """
    cutoffs = a[b]
    if x <= cutoffs[0.25]:
        return 1
    return 2 if x <= cutoffs[0.75] else 3

# Calculate R, F, M scores so that 3 is the best score on every dimension.
# rfm_score ranks ascending (small value -> 1, large value -> 3). That is right
# for frequency and monetary, but recency is "days since last purchase", where a
# SMALLER value is better. The recency score is therefore flipped (1 <-> 3);
# without the flip the most recently active customers get R = 1 and can never be
# classified as 'High' or 'Medium' by classify_customer.
rfm['R'] = 4 - rfm['recency'].apply(rfm_score, args=(quantiles, 'recency'))
rfm['F'] = rfm['frequency'].apply(rfm_score, args=(quantiles, 'frequency'))
rfm['M'] = rfm['monetary'].apply(rfm_score, args=(quantiles, 'monetary'))

# Concatenate the three digits into a code such as '323' (R, then F, then M)
rfm['RFM_Score'] = rfm['R'].astype(str) + rfm['F'].astype(str) + rfm['M'].astype(str)

# Classify customers into High, Medium, Low value categories
def classify_customer(rfm_score):
    """Map a three-digit RFM code (e.g. '323') to 'High', 'Medium' or 'Low'.

    Codes listed in neither the high-value nor the medium-value group are
    classified as 'Low'.
    """
    high_value_codes = ('333', '332', '323', '322', '233', '232')
    medium_value_codes = ('311', '321', '312', '231', '221', '213')

    if rfm_score in high_value_codes:
        return 'High'
    if rfm_score in medium_value_codes:
        return 'Medium'
    return 'Low'

rfm['Customer_Value'] = rfm['RFM_Score'].apply(classify_customer)


rfm.head()


In [None]:
df1

In [None]:
# Index the RFM table by customer_id so it aligns with df1 in the join below.
# df1 needs no re-indexing: it inherits the customer_id index that was set on
# `customer` before the tables were joined, so it has no 'customer_id' column
# and df1.set_index('customer_id') raised KeyError.
# The guard makes the cell safe to re-run.
if 'customer_id' in rfm.columns:
    rfm = rfm.set_index('customer_id')

In [None]:
df2 = df1.join(rfm['Customer_Value'])

In [None]:
# Compute the correlation matrix
correlation_matrix = df1.drop(columns=['join_date','last_purchase_date','avg_purchase_per_transaction','customer_lifespan','customer_lifetime_value']).corr()

# Plot the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()

Modeling - using binary output

In [None]:
sns.histplot(df1['customer_lifetime_value'])

In [None]:
# Customers who stay longer with the company have more opportunity to transact,
# so customer_lifetime_value is used to split customers into two classes.
# This cell shows the MEDIAN of customer_lifetime_value.
df1['customer_lifetime_value'].median()

In [None]:
df1['customer_lifetime_value'].mean()

In [None]:
# Cut-off that splits customers into high value (1) and low value (0).
# NOTE(review): 8212 is presumably the mean customer_lifetime_value printed in
# the previous cell, typed in by hand -- confirm it still matches the data.
CLV_THRESHOLD = 8212
df1['binary_output'] = (df1['customer_lifetime_value'] >= CLV_THRESHOLD).astype('int64')

In [None]:
df1.groupby('binary_output')['customer_lifetime_value'].count()/len(df1)

In [None]:
df1

In [None]:
sns.boxplot(x='binary_output',y='number_of_clicks',data=df1)

In [None]:
sns.boxplot(x='binary_output',y='transaction_amount',data=df1)

In [None]:
X = df1[['age','gender','location','number_of_site_visits','number_of_emails_opened','number_of_clicks','campaign_id',]]

y = df1['binary_output']

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, PrecisionRecallDisplay, precision_score, recall_score, roc_auc_score, RocCurveDisplay, roc_curve, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3,random_state=1234)

In [None]:
# 5-fold cross-validation on the training split: the data is divided into 5
# parts, each model is trained on 4 parts and scored (F1) on the remaining one,
# and this is repeated so that every part is used for scoring once.
#
# Both models are wrapped in a pipeline that standardises the features first.
# KNN is distance based, so on raw features the columns with the largest numeric
# range dominate the neighbour search; logistic regression's default solver
# also converges poorly on unscaled inputs. Because the scaler is part of the
# pipeline it is re-fitted on the training folds only, and fit / predict /
# predict_proba in the later cells work on these objects unchanged.

knn = make_pipeline(StandardScaler(), KNeighborsClassifier())
log = make_pipeline(StandardScaler(), LogisticRegression())


scores_log = cross_val_score(log, X_train, y_train, scoring='f1', cv=5)
scores_knn = cross_val_score(knn, X_train, y_train, scoring='f1', cv=5)


In [None]:
print(f'Cross-Validated Scores for Logistic Regression:{scores_log}')
print(f'Cross-Validated Scores for K Nearest Neighbor:{scores_knn}')

In [None]:
#calculate average score
print(np.mean(scores_log))
print(np.mean(scores_knn))

In [None]:
#building an actual model
logreg = log.fit(X_train, y_train)
KNN = knn.fit(X_train, y_train)

In [None]:
y_pred_log=logreg.predict(X_test)
y_pred_knn=KNN.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

#logreg score
print(accuracy_score(y_test,y_pred_log))
print(precision_score(y_test,y_pred_log))
print(f1_score(y_test,y_pred_log))


In [None]:
#knn score
print(accuracy_score(y_test,y_pred_knn))
print(precision_score(y_test,y_pred_knn))
print(f1_score(y_test,y_pred_knn))

In [None]:
# Import necessary libraries
from sklearn.metrics import roc_curve, auc, precision_recall_curve, RocCurveDisplay, PrecisionRecallDisplay
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# ROC and precision-recall curves for the fitted logistic regression
probs_log = logreg.predict_proba(X_test)
positive_probs_log = probs_log[:, 1]  # probability of the positive class (1)
fpr_log, tpr_log, _ = roc_curve(y_test, positive_probs_log)
precision_log, recall_log, _ = precision_recall_curve(y_test, positive_probs_log)
roc_auc_log = roc_auc_score(y_test, positive_probs_log)
pr_auc_log = auc(recall_log, precision_log)  # trapezoidal area under the PR curve
# The display labels its value "AP", so pass the real average precision rather
# than the trapezoidal PR-AUC, which is a different (interpolated) quantity.
ap_log = average_precision_score(y_test, positive_probs_log)

# Draw on an explicit Axes: calling .plot() without `ax` opens a new figure,
# which left the figure created by plt.figure() empty in the output.
fig, ax = plt.subplots(figsize=(10, 5))
RocCurveDisplay(fpr=fpr_log, tpr=tpr_log, roc_auc=roc_auc_log, estimator_name='Logistic Regression').plot(ax=ax)
ax.set_title('ROC Curve - Logistic Regression')
plt.show()

fig, ax = plt.subplots(figsize=(10, 5))
PrecisionRecallDisplay(precision=precision_log, recall=recall_log, average_precision=ap_log, estimator_name='Logistic Regression').plot(ax=ax)
ax.set_title('Precision-Recall Curve - Logistic Regression')
plt.show()


In [None]:
# ROC and precision-recall curves for the fitted KNN model
probs_knn = KNN.predict_proba(X_test)
positive_probs_knn = probs_knn[:, 1]  # probability of the positive class (1)
fpr_knn, tpr_knn, _ = roc_curve(y_test, positive_probs_knn)
precision_knn, recall_knn, _ = precision_recall_curve(y_test, positive_probs_knn)
roc_auc_knn = roc_auc_score(y_test, positive_probs_knn)
pr_auc_knn = auc(recall_knn, precision_knn)  # trapezoidal area under the PR curve
# The display labels its value "AP", so pass the real average precision rather
# than the trapezoidal PR-AUC, which is a different (interpolated) quantity.
ap_knn = average_precision_score(y_test, positive_probs_knn)

# Draw on an explicit Axes: calling .plot() without `ax` opens a new figure,
# which left the figure created by plt.figure() empty in the output.
fig, ax = plt.subplots(figsize=(10, 5))
RocCurveDisplay(fpr=fpr_knn, tpr=tpr_knn, roc_auc=roc_auc_knn, estimator_name='KNN').plot(ax=ax)
ax.set_title('ROC Curve - KNN')
plt.show()

fig, ax = plt.subplots(figsize=(10, 5))
PrecisionRecallDisplay(precision=precision_knn, recall=recall_knn, average_precision=ap_knn, estimator_name='KNN').plot(ax=ax)
ax.set_title('Precision-Recall Curve - KNN')
plt.show()

Comparing the two models, Logistic Regression performs better than KNN.

In [None]:
# Standardise every feature column of X.
# NOTE(review): the scaler is fitted on all rows of X, which includes the rows
# held out in X_test -- confirm X_scaled is not used for final evaluation.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning with 5-fold grid search, scored by ROC AUC.
#
# Each model is tuned inside a pipeline that standardises the features first.
# Previously the models were tuned on the raw X_train while a separately scaled
# copy of the data was used elsewhere, so scaling never reached the tuned
# models. With the scaler inside the pipeline it is re-fitted on the training
# folds of every split, and best_lr / best_knn accept the raw X_test directly.
# make_pipeline names each step after its lower-cased class, hence the
# 'logisticregression__' / 'kneighborsclassifier__' parameter prefixes.

# Logistic Regression hyperparameter tuning
lr = make_pipeline(StandardScaler(), LogisticRegression())
param_grid_lr = {
    'logisticregression__C': [0.1, 1, 10, 100],
    'logisticregression__solver': ['liblinear', 'lbfgs']
}
grid_lr = GridSearchCV(lr, param_grid_lr, cv=5, scoring='roc_auc')
grid_lr.fit(X_train, y_train)
best_lr = grid_lr.best_estimator_

# KNN hyperparameter tuning
knn = make_pipeline(StandardScaler(), KNeighborsClassifier())
param_grid_knn = {
    'kneighborsclassifier__n_neighbors': [3, 5, 7, 9],
    'kneighborsclassifier__weights': ['uniform', 'distance']
}
grid_knn = GridSearchCV(knn, param_grid_knn, cv=5, scoring='roc_auc')
grid_knn.fit(X_train, y_train)
best_knn = grid_knn.best_estimator_

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score

# Evaluate Logistic Regression
y_pred_lr = best_lr.predict(X_test)
y_prob_lr = best_lr.predict_proba(X_test)[:, 1]

accuracy_lr = accuracy_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)
roc_auc_lr = roc_auc_score(y_test, y_prob_lr)

# Evaluate KNN
y_pred_knn = best_knn.predict(X_test)
y_prob_knn = best_knn.predict_proba(X_test)[:, 1]

accuracy_knn = accuracy_score(y_test, y_pred_knn)
f1_knn = f1_score(y_test, y_pred_knn)
precision_knn = precision_score(y_test, y_pred_knn)
recall_knn = recall_score(y_test, y_pred_knn)
roc_auc_knn = roc_auc_score(y_test, y_prob_knn)

# Cross-validation scores (5-fold ROC AUC) for the tuned models.
# - Only the training split is used, so the held-out test rows stay unseen.
# - Scaling is done inside a pipeline so the scaler is re-fitted on the training
#   folds of each split instead of being fitted once on all rows beforehand.
#   Standardising twice is harmless if the estimator already scales its input.
cv_scores_lr = cross_val_score(make_pipeline(StandardScaler(), best_lr), X_train, y_train, cv=5, scoring='roc_auc')
cv_scores_knn = cross_val_score(make_pipeline(StandardScaler(), best_knn), X_train, y_train, cv=5, scoring='roc_auc')

# Show the metrics side by side; they were computed above but never displayed.
pd.DataFrame(
    {
        'accuracy': [accuracy_lr, accuracy_knn],
        'precision': [precision_lr, precision_knn],
        'recall': [recall_lr, recall_knn],
        'f1': [f1_lr, f1_knn],
        'roc_auc (test)': [roc_auc_lr, roc_auc_knn],
        'roc_auc (5-fold CV mean)': [cv_scores_lr.mean(), cv_scores_knn.mean()],
    },
    index=['Logistic Regression', 'KNN'],
)


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay, confusion_matrix, ConfusionMatrixDisplay

# For each tuned model draw, in order, its ROC curve, precision-recall curve and
# confusion matrix on the test split, each in its own titled figure.
tuned_models = (
    ('Logistic Regression', best_lr),
    ('KNN', best_knn),
)
chart_types = (
    ('ROC Curve', RocCurveDisplay),
    ('Precision-Recall Curve', PrecisionRecallDisplay),
    ('Confusion Matrix', ConfusionMatrixDisplay),
)

for model_label, fitted_model in tuned_models:
    for chart_label, display_cls in chart_types:
        display_cls.from_estimator(fitted_model, X_test, y_test)
        plt.title(f'{chart_label} - {model_label}')
        plt.show()
