In [1]:
# LTV Prediction using XGBoost and Random Forest

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score



ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Step 2: Load dataset
# NOTE: the file name below is a placeholder — replace it with the actual file name.
df = pd.read_csv("digital_wallet_ltv_dataset.csv")

# Step 3: Data cleaning
# Rows without a target value cannot be used for supervised training, so drop them.
df = df.dropna(subset=['Customer Lifetime Value'])

# Fill remaining gaps in numeric columns with that column's median.
# The medians are computed with numeric_only=True, so non-numeric columns
# are not filled by this step.
numeric_medians = df.median(numeric_only=True)
df = df.fillna(numeric_medians)

# Step 4: EDA (optional, expand as needed)
# Summary statistics of the frame as it stands at this point.
summary_stats = df.describe()
print(summary_stats)

# Histogram of the target with a kernel-density overlay; the title is set
# on the Axes object that seaborn returns.
ltv_ax = sns.histplot(df['Customer Lifetime Value'], kde=True)
ltv_ax.set_title('Customer Lifetime Value Distribution')
plt.show()

# Step 5: Feature encoding
# Replace every object-dtype column with integer codes.
#
# One encoder is fitted PER COLUMN and stored in `label_encoders`, so each
# mapping can later be inverted (`label_encoders[col].inverse_transform`) or
# applied to new data. Refitting a single shared encoder would discard every
# mapping except the last column's.
#
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels; OrdinalEncoder/OneHotEncoder are the feature-side tools. The integer
# codes below follow sorted class order and carry no real ordinal meaning.
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}
le = LabelEncoder()  # stays defined even when there are no categorical columns
for col in categorical_cols:
    le = LabelEncoder()
    # Cast to str so mixed-type / missing entries are encoded as ordinary labels.
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le
# After the loop `le` is the encoder fitted on the last categorical column.

# Step 6: Prepare features and target
X = df.drop(columns=['Customer Lifetime Value'])
y = df['Customer Lifetime Value']

# Step 7: Train-test split
# Split BEFORE scaling so the held-out rows cannot influence the scaler's
# mean/variance (fitting the scaler on all rows leaks test-set statistics
# into training). The split is driven by the number of rows and random_state,
# neither of which changes when the unscaled matrix is passed instead.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature scaling (optional but often useful): statistics are learned from
# the training rows only, then applied unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with any later cell that reads `X_scaled`:
# the full feature matrix transformed with the train-fitted scaler.
X_scaled = scaler.transform(X)

# Step 8: Train models
# Each regressor is fitted on the training split and then predicts the
# held-out split. `fit` returns the fitted estimator, so construction and
# fitting are chained.

# Random Forest
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
rf_preds = rf.predict(X_test)

# XGBoost
xgb = XGBRegressor(random_state=42).fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)

# 🧾 Step 9: Evaluation
def evaluate_model(name, y_true, y_pred):
    """Print a short regression report for one model.

    Args:
        name: Label printed on the report's first line.
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Prints the model name, the R² score (4 decimals), the RMSE (2 decimals)
    and a 30-character dashed separator. Returns nothing.
    """
    print(f"Model: {name}")

    r2 = r2_score(y_true, y_pred)
    print(f"R² Score: {r2:.4f}")

    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"RMSE: {rmse:.2f}")

    print("-" * 30)

# Report both models against the same held-out targets, in a fixed order.
for model_label, model_preds in (("Random Forest", rf_preds), ("XGBoost", xgb_preds)):
    evaluate_model(model_label, y_test, model_preds)

# Step 10: Feature importance (XGBoost)
# Label each importance score with its feature name: every column of `df`
# except the target.
feature_names = df.drop(columns='Customer Lifetime Value').columns
xgb_feat_imp = pd.Series(xgb.feature_importances_, index=feature_names)

# Horizontal bar chart of the ten highest-scoring features.
top_features = xgb_feat_imp.nlargest(10)
importance_ax = top_features.plot.barh()
importance_ax.set_title("Top 10 Important Features (XGBoost)")
plt.show()

# Step 11: Export predictions
# One row per held-out sample: the true target next to each model's prediction.
prediction_columns = {
    'Actual_LTV': y_test,
    'Predicted_LTV_RF': rf_preds,
    'Predicted_LTV_XGB': xgb_preds,
}
output_df = pd.DataFrame(prediction_columns)

# index=False: the row index is not written to the file.
output_df.to_csv("ltv_predictions.csv", index=False)
print("Predictions saved to ltv_predictions.csv")
