# Customer Life Value Optimization Model
Goal: Develop a model using A/B testing to strategise discount targeting for maximised Customer Life Value (CLV)

#### Importing Packages

In [66]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

print("Packages imported successfully!")

Packages imported successfully!


#### Importing the data

In [67]:
# Removing index, and customer ID columns
# We use the encoded data, not normalized; we only want the features normal, not the output
data = pd.read_csv("data/data_encoded.csv").drop(columns=["Unnamed: 0", "Customer ID"])

data.head()

Unnamed: 0,Gender,Age,City,Membership Type,Total Spend,Items Purchased,Average Rating,Discount Applied,Days Since Last Purchase,Satisfaction Level
0,0,29,4,1,1120.2,14,4.6,1,25,1
1,1,34,2,2,780.5,11,4.1,0,18,0
2,0,43,0,0,510.75,9,3.4,1,42,2
3,1,30,5,1,1480.3,19,4.7,0,12,1
4,1,27,3,2,720.4,13,4.0,1,55,2


In [68]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Gender                    348 non-null    int64  
 1   Age                       348 non-null    int64  
 2   City                      348 non-null    int64  
 3   Membership Type           348 non-null    int64  
 4   Total Spend               348 non-null    float64
 5   Items Purchased           348 non-null    int64  
 6   Average Rating            348 non-null    float64
 7   Discount Applied          348 non-null    int64  
 8   Days Since Last Purchase  348 non-null    int64  
 9   Satisfaction Level        348 non-null    int64  
dtypes: float64(2), int64(8)
memory usage: 27.3 KB


## Feature Engineering

#### Creating Interaction Terms

Before continuing, there are vital insights from the ETL script that we must acknowledge prior to modelling:
- Customers' Genders are (mostly) segregated by City
- Whether or not a customer receieved a Discount is entirely based on City

Because of this, *City* must be treated as a *confounding variable*

In [69]:
data = pd.read_csv("data/data_raw.csv").drop(columns=[ "Customer ID"])
# Creating interaction terms between interaction features
cat_vars = ['City', 'Gender', 'Discount Applied', 'Membership Type', 'Satisfaction Level']

# Apply OneHotEncoder
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_cats = encoder.fit_transform(data[cat_vars])
encoded_cat_names = encoder.get_feature_names_out(cat_vars)
encoded_df = pd.DataFrame(encoded_cats, columns=encoded_cat_names) # Dropping a col that has 0 vals
encoded_df.head()

Unnamed: 0,City_Houston,City_Los Angeles,City_Miami,City_New York,City_San Francisco,Gender_Male,Discount Applied_True,Membership Type_Gold,Membership Type_Silver,Satisfaction Level_Satisfied,Satisfaction Level_Unsatisfied,Satisfaction Level_nan
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0


In [70]:
data_enc = pd.concat([data, encoded_df], axis=1).drop(columns=cat_vars) # Dropping the non-encoded columns

interaction_features = encoded_cat_names.tolist() + ['Average Rating', 'Age']
poly = PolynomialFeatures(interaction_only=True, include_bias=False)
interaction_terms = poly.fit_transform(data_enc[interaction_features])

# Convert interaction terms to DataFrame and add to the main data
interaction_term_names = poly.get_feature_names_out(interaction_features)
interaction_df = pd.DataFrame(interaction_terms, columns=interaction_term_names)

data1 = pd.concat([data_enc, interaction_df], axis=1)
data1.head()

Unnamed: 0,Age,Total Spend,Items Purchased,Average Rating,Days Since Last Purchase,City_Houston,City_Los Angeles,City_Miami,City_New York,City_San Francisco,...,Satisfaction Level_Satisfied Satisfaction Level_Unsatisfied,Satisfaction Level_Satisfied Satisfaction Level_nan,Satisfaction Level_Satisfied Average Rating,Satisfaction Level_Satisfied Age,Satisfaction Level_Unsatisfied Satisfaction Level_nan,Satisfaction Level_Unsatisfied Average Rating,Satisfaction Level_Unsatisfied Age,Satisfaction Level_nan Average Rating,Satisfaction Level_nan Age,Average Rating Age
0,29,1120.2,14,4.6,25,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,4.6,29.0,0.0,0.0,0.0,0.0,0.0,133.4
1,34,780.5,11,4.1,18,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,139.4
2,43,510.75,9,3.4,42,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.4,43.0,0.0,0.0,146.2
3,30,1480.3,19,4.7,12,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,4.7,30.0,0.0,0.0,0.0,0.0,0.0,141.0
4,27,720.4,13,4.0,55,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,27.0,0.0,0.0,108.0


#### Feature Scaling

In [76]:
scaler = StandardScaler()
data_scaled = data1.copy()
numerical_features = ["Age", "Items Purchased", "Days Since Last Purchase"]
data_scaled[numerical_features] = scaler.fit_transform(data_scaled[numerical_features])
data_scaled[numerical_features].head()

Unnamed: 0,Age,Age.1,Items Purchased,Days Since Last Purchase
0,-0.945152,-0.945152,0.337346,-0.118359
1,0.082826,0.082826,-0.385538,-0.639907
2,1.933185,1.933185,-0.867461,1.148256
3,-0.739557,-0.739557,1.542153,-1.086947
4,-1.356343,-1.356343,0.096385,2.116844


#### X, y Separation

In [77]:
X = data_scaled.drop(columns=['Total Spend'])  # Assuming 'Total Spend' is the target variable
y = data_scaled['Total Spend']

#### Feature Selection

In [79]:
# Initialize the Lasso model with a chosen alpha value
lasso = Lasso(alpha=0.01, random_state=42)

# Fit the Lasso model to the data
lasso.fit(X, y)

# Identify the features with non-zero coefficients
selected_features = X.columns[(lasso.coef_ != 0)]
print("Selected Features:", selected_features)

# If you want to see the coefficients
print("Lasso Coefficients:", lasso.coef_)

X_selected = X[selected_features]

Selected Features: Index(['Age', 'Items Purchased', 'Average Rating', 'Days Since Last Purchase',
       'City_Houston', 'City_Miami', 'City_New York', 'City_San Francisco',
       'Gender_Male', 'Membership Type_Gold', 'Satisfaction Level_Unsatisfied',
       'City_Miami', 'Gender_Male', 'Average Rating', 'Age',
       'City_Houston Average Rating', 'City_Houston Age',
       'City_Los Angeles Satisfaction Level_Satisfied',
       'City_Los Angeles Average Rating', 'City_Los Angeles Age',
       'City_Miami Gender_Male', 'City_Miami Average Rating', 'City_Miami Age',
       'City_New York Discount Applied_True',
       'City_New York Membership Type_Gold', 'City_New York Average Rating',
       'City_New York Age', 'City_San Francisco Gender_Male',
       'City_San Francisco Membership Type_Gold',
       'City_San Francisco Satisfaction Level_Satisfied',
       'City_San Francisco Average Rating', 'City_San Francisco Age',
       'Gender_Male Discount Applied_True',
       'Gender_Mal

#### Train-Test Split

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

There are two major things to note here:

- **City**: It seems as though their Discount program was targeted by City, not by customer. We will have to keep this in mind when constructing models. Further, it does not seem as though, based on City alone, there was a strong effect of applying a discount; however, we cannot compare as we do not have independent data points.
- **Gender**: Discounts were much more heavily applied to Female customers as compared to Male.

Let's investigate how Discount Applied stacks up against both Gender and City together.

In [63]:
models = {
    LinearRegression(): "Linear Regression",
    RandomForestRegressor(random_state=42): "Random Forest Regression",
    GradientBoostingRegressor(random_state=42): "Gradient Boosting Regressor"
}

In [64]:
for model, name in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Calculate performance metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Print model performance
    print(f"Model: {name}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R-squared: {r2:.2f}\n")


ValueError: could not convert string to float: 'Male'

In [None]:
clv_model = RandomForestRegressor(random_state=42)
clv_model.fit(X_train, y_train)
print("CLV Prediction Model Trained")

In [None]:
data['Predicted_CLV_Baseline'] = clv_model.predict(X)
data[["Total Spend","Predicted_CLV_Baseline"]]

#### Creating interaction terms
Using the ETL script, we will create interaction terms based on highly correlated variables

In [None]:
data['Discount_Items_Interaction'] = data['Discount Applied'] * data['Items Purchased']
data['Discount_AverageRating_Interaction'] = data['Discount Applied'] * data['Average Rating']
data['Discount_Satisfaction_Interaction'] = data['Discount Applied'] * data['Satisfaction Level']
data['Discount_DaysSince_Interaction'] = data['Discount Applied'] * data['Days Since Last Purchase']
data['Membership_Age_Discount_Interaction'] = data['Membership Type'] * data['Age'] * data['Discount Applied']

In [None]:
# Step 3: Split Data into Treatment and Control Groups
treatment = data[data['Discount Applied'] == 1]
control = data[data['Discount Applied'] == 0]

In [None]:
categorical_variables = ['Gender', 'City', 'Membership Type']

for var in categorical_variables:
    # Control Group
    control_dist = control[var].value_counts(normalize=True)
    
    # Treatment Group
    treatment_dist = treatment[var].value_counts(normalize=True)
    
    # Create a 1x2 grid of pie charts
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    
    # Control Group Pie Chart
    axes[0].pie(control_dist, labels=control_dist.index, autopct='%1.1f%%', startangle=140, colors=sns.color_palette('pastel'))
    axes[0].set_title(f'{var} Distribution in No Discount Group')
    
    # Treatment Group Pie Chart
    axes[1].pie(treatment_dist, labels=treatment_dist.index, autopct='%1.1f%%', startangle=140, colors=sns.color_palette('pastel'))
    axes[1].set_title(f'{var} Distribution in Discount Group')
    
    # Display the plots
    plt.suptitle(f'Comparison of {var} Distribution between Control and Treatment Groups')
    plt.show()


In [None]:
# Compare the mean CLV of the test and control groups for each feature
features_to_test = ['Gender', 'City', 'Membership Type', 
                    'Discount_Satisfaction_Interaction']

In [None]:
control_mean = control.groupby('City')['Total Spend'].mean()
print(control_mean)
test_mean = treatment.groupby('City')['Total Spend'].mean()
print(test_mean)
uplift = test_mean - control_mean
print(f"Uplift in CLV for 'City': \n{uplift}\n")

In [None]:
print("Statistical Analysis of Features' Response to Discount:")
for feature in features_to_test:
    control_mean = control.groupby(feature)['Total Spend'].mean()
    test_mean = treatment.groupby(feature)['Total Spend'].mean()
    uplift = test_mean - control_mean
    print(f"Uplift in CLV for {feature}: \n{uplift}\n")

In [None]:
# Define the features and target for treatment group
X_treatment = treatment.drop(columns = "Total Spend")
y_treatment = treatment['Total Spend']

In [None]:
# Define the features and target
X_control = control.drop(columns = "Total Spend")
y_control = control['Total Spend']

In [None]:
# Initialize models
treatment_model = GradientBoostingRegressor(random_state=42)
control_model = GradientBoostingRegressor(random_state=42)

# Train models
treatment_model.fit(X_treatment, y_treatment)
control_model.fit(X_control, y_control)

In [None]:
# Predict CLV for both groups
treatment_predictions = treatment_model.predict(X_treatment)
control_predictions = control_model.predict(X_control)

# Calculate the expected uplift for each customer
data['Predicted_CLV_Treatment'] = treatment_model.predict(data[X_treatment.columns])
data['Predicted_CLV_Control'] = control_model.predict(data[X_control.columns])
data['Uplift'] = data['Predicted_CLV_Treatment'] - data['Predicted_CLV_Control']

data[['Predicted_CLV_Treatment','Predicted_CLV_Control','Uplift']].head()

In [None]:
data[['Predicted_CLV_Treatment','Predicted_CLV_Control','Uplift']].describe()

In [None]:
uplift_threshold = data['Uplift'].quantile(0.50)  # Adjust this threshold as needed
data['Target_for_Discount'] = data['Uplift'] > uplift_threshold
data.head()

In [None]:
# Uplift distribution
sns.histplot(data['Uplift'], kde=True)
plt.title('Uplift Distribution')
plt.xlabel('Uplift (Predicted CLV Treatment - Predicted CLV Control)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Percentage of customers targeted for discount
target_ratio = data['Target_for_Discount'].mean() * 100
print(f"\nPercentage of customers identified to receive a discount: {target_ratio:.2f}%")

In [None]:
# Calculate the average uplift for customers who are targeted
average_uplift = data[data['Target_for_Discount'] == True]['Uplift'].mean()
print(f"\nAverage uplift in CLV for targeted customers: ${average_uplift:.2f}")

# Potential increase in revenue if targeted customers receive discounts
potential_increase = average_uplift * data['Target_for_Discount'].sum()
print(f"Potential increase in revenue from targeted discounts: ${potential_increase:.2f}")

In [None]:
if hasattr(treatment_model, 'feature_importances_'):
    feature_importance = pd.Series(treatment_model.feature_importances_, index=X_control.columns).sort_values(ascending=False)
    sns.barplot(x=feature_importance.values, y=feature_importance.index)
    plt.title('Feature Importance for Treatment Group')
    plt.xlabel('Importance Score')
    plt.ylabel('Features')
    plt.show()

print(feature_importance)

In [None]:
# Create groups
control_group = data[data['Discount Applied'] == 0]['Total Spend']
test_group = data[data['Discount Applied'] == 1]['Total Spend']

In [None]:
t_stat, p_value = stats.ttest_ind(control_group, test_group, equal_var=False)
print(f'\nA/B Testing Results:')
print(f'Test Group Mean CLV: {test_group.mean():.2f}')
print(f'Control Group Mean CLV: {control_group.mean():.2f}')
print(f'T-statistic: {t_stat:.2f}, P-value: {p_value:.4f}')