# Customer Lifetime Value Optimization Model
Goal: Develop a model using A/B testing to strategise discount targeting that maximises Customer Lifetime Value (CLV).

#### Importing Packages

In [63]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides warnings raised by the libraries used below;
# consider narrowing the filter once the notebook is stable.
warnings.simplefilter('ignore')

print("Packages imported successfully!")

Packages imported successfully!


#### Importing the data

In [64]:
# Load the encoded (not normalized) dataset: only the features should be
# normalized later on, not the target.
encoded_df = pd.read_csv("data/data_encoded.csv")

# Discard the saved index column and the customer identifier.
data = encoded_df.drop(columns=["Unnamed: 0", "Customer ID"])

data.head()

Unnamed: 0,Gender,Age,City,Membership Type,Total Spend,Items Purchased,Average Rating,Discount Applied,Days Since Last Purchase,Satisfaction Level
0,0,29,4,1,1120.2,14,4.6,1,25,1
1,1,34,2,2,780.5,11,4.1,0,18,0
2,0,43,0,0,510.75,9,3.4,1,42,2
3,1,30,5,1,1480.3,19,4.7,0,12,1
4,1,27,3,2,720.4,13,4.0,1,55,2


In [65]:
# Summarize the loaded frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Gender                    348 non-null    int64  
 1   Age                       348 non-null    int64  
 2   City                      348 non-null    int64  
 3   Membership Type           348 non-null    int64  
 4   Total Spend               348 non-null    float64
 5   Items Purchased           348 non-null    int64  
 6   Average Rating            348 non-null    float64
 7   Discount Applied          348 non-null    int64  
 8   Days Since Last Purchase  348 non-null    int64  
 9   Satisfaction Level        348 non-null    int64  
dtypes: float64(2), int64(8)
memory usage: 27.3 KB


### Part 1: Initial Model

In [78]:
# Target: 'Total Spend' is used as the Customer Lifetime Value (CLV) measure.
target = 'Total Spend'

# Candidate features: every column except the target and 'Discount Applied'.
non_feature_columns = [target, 'Discount Applied']
features = data.columns.drop(non_feature_columns)

print("Features:", features)

Features: Index(['Gender', 'Age', 'City', 'Membership Type', 'Items Purchased',
       'Average Rating', 'Days Since Last Purchase', 'Satisfaction Level'],
      dtype='object')


Model Training with Lasso CV to select key features

In [67]:
# Feature matrix and target vector taken from the loaded frame
X, y = data[features], data[target]

Standardize the Features

In [80]:
# Columns left as integer codes (they are excluded from standardization)
categorical = [
    "Gender",
    "City",
    "Membership Type",
]

# Every other feature column is treated as numeric; Index.difference returns
# the remaining labels in sorted order.
# NOTE(review): 'Satisfaction Level' falls into this numeric group — confirm
# that treating it as a numeric scale is intended.
numerical = X.columns.difference(categorical)
numerical

Index(['Age', 'Average Rating', 'Days Since Last Purchase', 'Items Purchased',
       'Satisfaction Level'],
      dtype='object')

In [81]:
# Standardize only the numerical columns; the categorical codes are carried
# over unchanged.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split further down. Fit it on the training rows only if the
# held-out score must be free of leakage.
scaler = StandardScaler()
X_scaled = X.copy()  # work on a copy so X keeps the raw values
X_scaled[numerical] = scaler.fit_transform(X[numerical])
# X was built as data[features], so the columns are already exactly `features`;
# no re-wrapping into a new DataFrame is needed.
X_scaled.head()

Unnamed: 0,Gender,Age,City,Membership Type,Items Purchased,Average Rating,Days Since Last Purchase,Satisfaction Level
0,0,-0.939761,4,1,0.330381,0.996757,-0.120022,-0.032324
1,1,0.08672,2,2,-0.394236,0.132172,-0.64026,-1.282192
2,0,1.934386,0,0,-0.877314,-1.078247,1.143413,1.217543
3,1,-0.734465,5,1,1.538076,1.169674,-1.086179,-0.032324
4,1,-1.350354,3,2,0.088842,-0.040745,2.10957,1.217543


In [82]:
# Cross-validated Lasso (10 folds) fitted on the scaled features; the fitted
# coefficients are inspected afterwards to decide which features to keep.
lasso = LassoCV(random_state=42, cv=10)
lasso.fit(X=X_scaled, y=y)

In [83]:
# A feature is kept when Lasso left its coefficient non-zero
kept_mask = lasso.coef_ != 0
selected_features = np.array(features)[kept_mask]

# Build the removed list from the same mask so it follows the column order
# (a set difference would give an arbitrary order between runs)
difference = [name for name, kept in zip(features, kept_mask) if not kept]

print("Original features:", features.values)
print("\nSelected features:", selected_features)

print("\nRemoved Feature(s):",difference)

Original features: ['Gender' 'Age' 'City' 'Membership Type' 'Items Purchased'
 'Average Rating' 'Days Since Last Purchase' 'Satisfaction Level']

Original features: ['Gender' 'Age' 'City' 'Items Purchased' 'Average Rating'
 'Days Since Last Purchase' 'Satisfaction Level']

Removed Feature(s): ['Membership Type']


In [84]:
# Select the features that Lasso has chosen
X_selected = X_scaled.drop(columns = difference)
X_selected.columns

Index(['Gender', 'Age', 'City', 'Items Purchased', 'Average Rating',
       'Days Since Last Purchase', 'Satisfaction Level'],
      dtype='object')

### Part 2: Gradient Boosting Model

Train Gradient Boosting Regressor on Selected Features

In [93]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.3,
    random_state=42,
)

# Gradient boosting with 1000 depth-3 trees and a 0.1 learning rate
gbr = GradientBoostingRegressor(
    random_state=42,
    max_depth=3,
    learning_rate=0.1,
    n_estimators=1000,
)
gbr.fit(X_train, y_train)

Evaluating the GBR model

In [94]:
# Score the fitted model on the held-out rows.
y_pred = gbr.predict(X_test)

# RMSE is reported because it is in the same units as the target.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print(f'Gradient Boosting RMSE: {rmse:.2f}')

Gradient Boosting RMSE: 16.13


### Part 3: Optimize Discount Allocation

In [96]:
# Threshold = 75th percentile of the predicted CLV over every row of
# X_selected, i.e. the cut-off for the top 25% of predictions
all_predictions = gbr.predict(X_selected)
discount_threshold = np.percentile(all_predictions, q=75)
print(discount_threshold)

1160.3485685479022


In [97]:
# Assign discounts based on the predicted CLV
data['Predicted_CLV'] = gbr.predict(X_selected)
data['Discount_Target'] = np.where(data['Predicted_CLV'] >= discount_threshold, 1, 0)

data.head()

Unnamed: 0,Gender,Age,City,Membership Type,Total Spend,Items Purchased,Average Rating,Discount Applied,Days Since Last Purchase,Satisfaction Level,Predicted_CLV,Discount_Target
0,0,29,4,1,1120.2,14,4.6,1,25,1,1120.497386,0
1,1,34,2,2,780.5,11,4.1,0,18,0,781.303273,0
2,0,43,0,0,510.75,9,3.4,1,42,2,511.018736,0
3,1,30,5,1,1480.3,19,4.7,0,12,1,1444.125908,1
4,1,27,3,2,720.4,13,4.0,1,55,2,720.358572,0


: 