In [4]:
import joblib
import pandas as pd
import numpy as np

In [6]:
# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [7]:
# The Evaluation Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [19]:
# Load Data
# Load the pre-split train/test features and targets that were saved with joblib.
# Raw strings (r'...') keep the Windows backslashes literal: sequences such as
# '\c', '\m', '\X' and '\y' are invalid escapes in a normal string literal and
# trigger a warning on current Python versions.
# NOTE: joblib.load unpickles its input, so only load files from a trusted source.
X_train = joblib.load(r'D:\clv\model_data\X_train.pkl')
X_test = joblib.load(r'D:\clv\model_data\X_test.pkl')
y_train = joblib.load(r'D:\clv\model_data\y_train.pkl')
y_test = joblib.load(r'D:\clv\model_data\y_test.pkl')

In [20]:
# Display the feature names the models below are trained on.
X_train.columns

Index(['TotalQty', 'AvgUnitPrice', 'Monetary_Value'], dtype='object')

## The Evaluation Method

In [11]:
# Shared scoring helper used by every model in this notebook
def evaluate_model(y_true, y_pred, model_name):
    """Print and return MAE, RMSE and R² for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    model_name : str
        Label shown in the printed report header.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)`` in that order.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)  # RMSE is the square root of the mean squared error
    r2 = r2_score(y_true, y_pred)

    # One print call; sep="\n" puts each metric on its own line.
    print(
        f"--- {model_name} Performance ---",
        f"MAE:  {mae:.2f}",
        f"RMSE: {rmse:.2f}",
        f"R²:   {r2:.4f}\n",
        sep="\n",
    )

    return mae, rmse, r2

### Baseline 1 - Linear Regression

In [21]:
# Fit a linear-regression baseline on the training split
# (fit() returns the fitted estimator, so it can be chained).
linreg = LinearRegression().fit(X_train, y_train)

# Score the held-out test split with the shared evaluation helper.
linreg_preds = linreg.predict(X_test)
linreg_metrics = evaluate_model(
    y_true=y_test,
    y_pred=linreg_preds,
    model_name="Linear Regression",
)

--- Linear Regression Performance ---
MAE:  746.26
RMSE: 3100.92
R²:   0.7377



### Baseline 2 - Random Forest

In [9]:
# Random-forest baseline: 25 trees capped at depth 10, fixed seed,
# trained using all available CPU cores (n_jobs=-1).
rf = RandomForestRegressor(n_estimators=25, max_depth=10, random_state=23, n_jobs=-1)
rf.fit(X_train, y_train)

# Predict on the held-out test split and report the metrics.
rf_preds = rf.predict(X_test)
rf_metrics = evaluate_model(y_true=y_test, y_pred=rf_preds, model_name="Random Forest")

--- Random Forest Performance ---
MAE:  729.48
RMSE: 3231.34
R²:   0.7152



#### The large gap between MAE and RMSE suggests a few infrequent but very large errors — possibly customers who purchased heavily but only once or twice.

In [10]:
# Which features are most important?
# The values are listed in the same order as the columns the forest was fitted on (X_train.columns).
rf.feature_importances_

array([0.10521548, 0.04575928, 0.84902524])

### The Random Forest indicates that Monetary_Value is by far the biggest driver of CLV (importance ≈ 0.85)

### Model 3 - XGBoost (Log-Transformed Target)

In [11]:
from xgboost import XGBRegressor
import numpy as np

# 1. Log-transform the target with log1p (log(1 + y)) so extreme values have less influence on training
#    NOTE(review): log1p returns NaN for values below -1 — confirm y_train contains no such values.
y_train_log = np.log1p(y_train)

# 2. Initialize the XGBoost regressor (fixed random_state for repeatable runs)
xgb_model_log = XGBRegressor(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=42
)

# 3. Train the model on the LOGGED target
xgb_model_log.fit(X_train, y_train_log)

# 4. Predict on the test set (Warning: these predictions are on the log1p scale!)
xgb_preds_log = xgb_model_log.predict(X_test)

# 5. Invert the transform with expm1 (the inverse of log1p) to return to the target's original units
xgb_preds_dollars = np.expm1(xgb_preds_log)

# 6. Evaluate the back-transformed predictions against the original y_test
xgb_metrics_log = evaluate_model(y_test, xgb_preds_dollars, "XGBoost (Target Transformed)")

--- XGBoost (Target Transformed) Performance ---
MAE:  825.30
RMSE: 5526.11
R²:   0.1669



## LightGBM & CatBoost Baseline

In [None]:
!pip install catboost
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# 1. LightGBM: Fast and memory-efficient
lgbm_model = LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1)
lgbm_model.fit(X_train, y_train)
lgbm_preds = lgbm_model.predict(X_test)
evaluate_model(y_test, lgbm_preds, "LightGBM")

# 2. CatBoost: Robust to outliers and very accurate
# 'silent=True' keeps the output clean since CatBoost likes to print a lot of logs
cat_model = CatBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=42, silent=True)
cat_model.fit(X_train, y_train)
cat_preds = cat_model.predict(X_test)
evaluate_model(y_test, cat_preds, "CatBoost")

## Huber Regression

In [12]:
from sklearn.linear_model import HuberRegressor

# Build the Huber regressor (up to 1000 solver iterations) and fit it in one
# step; fit() returns the fitted estimator.
huber = HuberRegressor(max_iter=1000).fit(X_train, y_train)

# Score the held-out test split; the bare call also displays the returned
# (mae, rmse, r2) tuple as the cell output.
huber_preds = huber.predict(X_test)
evaluate_model(y_true=y_test, y_pred=huber_preds, model_name="Huber Regression")

--- Huber Regression Performance ---
MAE:  681.86
RMSE: 3619.98
R²:   0.6425



(681.863281347642, np.float64(3619.9762503912075), 0.6425234762221268)

# After careful testing of different algorithms
# Winner: Linear Regression (highest R² = 0.7377 and lowest RMSE = 3100.92 among the evaluated models)

# Hyperparameter Tuning

### Plain (ordinary least-squares) Linear Regression has no hyperparameters to tune, so we skip this step

In [14]:
# Saving the model
# Persist the fitted linear-regression model with joblib.
# The path is a raw string so the Windows backslashes stay literal: '\c', '\m'
# and '\l' are invalid escape sequences in a normal string literal and trigger
# a warning on current Python versions.
joblib.dump(linreg, r'D:\clv\models\linreg.pkl')

print('Models Saved ')

Models Saved 
