In [18]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime
import uuid
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from collections import Counter
import xgboost as xgb
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
import joblib
import plotly.express as px
import shap
import optuna

In [20]:
# Fetching product data from DummyJSON API
url = "https://dummyjson.com/products?limit=100"
response = requests.get(url)

if response.status_code != 200:
    print("Failed to fetch DummyJSON data")
    exit()

categories = ['beauty', 'fragrances', 'furniture', 'groceries', 'home-decoration', 'kitchen-accessories', 'laptops', 'mens-shirts', 'mens-shoes', 'mens-watches', 'mobile-accessories', 'motorcycle', 'skin-care', 'smartphones', 'sports-accessories', 'sunglasses', 'tablets', 'tops', 'vehicle', 'womens-bags', 'womens-dresses', 'womens-jewellery', 'womens-shoes', 'womens-watches']
brands = ['Apple', 'Samsung', 'Nike', 'Adidas', 'Sony', 'Essence', 'Chanel', 'Dior', 'Gucci', 'Annibale Colombo', 'Unknown', 'Glamour Beauty', 'Velvet Touch', 'Chic Cosmetics', 'Nail Couture', 'Calvin Klein', 'Dolce & Gabbana', 'Asus', 'Huawei', 'Lenovo', 'Dell', 'Fashion Trends', 'Gigabyte', 'Classic Wear', 'Casual Comfort', 'Urban Chic', 'Puma', 'Off White', 'Fashion Timepieces', 'Longines']
df_products = pd.DataFrame({
    'product_id': range(1, 101),
    'category_code': np.random.choice(categories, size=100),
    'brand': np.random.choice(brands, size=100),
    'price': np.round(np.random.uniform(0.99, 1999.99, 100), 2)
})
df_products['category_id'] = [int(f"21444159{str(i).zfill(10)}") for i in range(len(df_products))]

# Parameters
num_records = 100_000
start_date = datetime(2024, 1, 1)
end_date = datetime(2025, 7, 22, 10, 17)
time_diff = (end_date - start_date).total_seconds()

# Event types and probabilities
event_types = ['view', 'cart', 'purchase']
event_probs = [0.7, 0.2, 0.1]

# Simulate countries
countries = ['USA', 'UK', 'India', 'Germany', 'Brazil', 'Unknown']
country_probs = [0.4, 0.2, 0.2, 0.1, 0.05, 0.05]

# Generate data with smaller user_id range for ~10 events/user on average
rng = np.random.default_rng()
user_id = rng.integers(1, 10001, size=num_records, dtype='int64')  # Reduced range for more events per user
random_seconds = np.random.uniform(0, time_diff, num_records)
event_times = [start_date + pd.Timedelta(seconds=sec) for sec in random_seconds]
event_times = pd.to_datetime(event_times).strftime('%Y-%m-%d %H:%M:%S UTC')
event_type = np.random.choice(event_types, size=num_records, p=event_probs)
product_indices = np.random.randint(0, len(df_products), num_records)
product_data = df_products.iloc[product_indices].reset_index(drop=True)
category_code = np.where(np.random.random(num_records) > 0.1, product_data['category_code'], np.nan)
brand = np.where(np.random.random(num_records) > 0.1, product_data['brand'], np.nan)
session_ids = [str(uuid.uuid4())[:12] for _ in range(num_records)]
countries_assigned = np.random.choice(countries, size=num_records, p=country_probs)

# Create DataFrame
df = pd.DataFrame({
    'event_time': event_times,
    'event_type': event_type,
    'product_id': product_data['product_id'],
    'category_id': product_data['category_id'],
    'category_code': category_code,
    'user_id': user_id,
    'brand': brand,
    'price': product_data['price'],
    'user_session': session_ids,
    'country': countries_assigned
})

# Sort by event_time
df['event_time'] = pd.to_datetime(df['event_time'])
df = df.sort_values('event_time').reset_index(drop=True)

In [12]:
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,user_id,brand,price,user_session,country
0,2024-01-01 00:02:05+00:00,view,14,214441590000000013,fragrances,622,Annibale Colombo,269.05,d555be45-f78,UK
1,2024-01-01 00:07:58+00:00,view,48,214441590000000047,mens-shoes,2123,Glamour Beauty,302.16,4d6acd17-47b,Brazil
2,2024-01-01 00:28:13+00:00,cart,53,214441590000000052,mens-watches,9295,Gucci,277.76,ff7a1679-835,USA
3,2024-01-01 00:32:03+00:00,view,10,214441590000000009,tops,9151,Longines,1952.76,8a1e0c30-2e6,USA
4,2024-01-01 00:38:19+00:00,view,2,214441590000000001,laptops,3434,Nail Couture,1490.43,b885ade4-7ad,Brazil


In [21]:
# Deleting outliers in price using IQR method
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1
df = df[df['price'] < Q3 + 1.5 * IQR]

# Handle missing values
df['brand'] = df['brand'].fillna('Unknown')
df = df.dropna(subset=['user_session'])

In [22]:
df['event_time'] = pd.to_datetime(df['event_time'])

train_cutoff = pd.to_datetime('2025-06-30').tz_localize('UTC')

# Split data
df_train = df[df['event_time'] <= train_cutoff]
df_test = df[df['event_time'] > train_cutoff]

# Observation cutoff for features (90 days before train_cutoff)
observation_cutoff = train_cutoff - pd.Timedelta(days=90)

# Reset index to ensure consecutive indices
df_features = df_features.reset_index(drop=True)

# Active users in label period
active_users_label_period = set(df_train[(df_train['event_time'] > observation_cutoff)]['user_id'].unique())

# Users with events before observation
all_users = set(df_features['user_id'].unique())

# Churn label: 1 if no activity in label period
last_event = df_features.groupby('user_id')['event_time'].max().reset_index(name='last_event')
churn = (~last_event['user_id'].isin(active_users_label_period)).astype(int)
last_event['churn'] = churn

In [23]:
df_train.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,user_id,brand,price,user_session,country
0,2024-01-01 00:10:34+00:00,view,51,214441590000000050,furniture,9840,Nike,224.11,8ce07801-634,India
1,2024-01-01 00:44:59+00:00,cart,68,214441590000000067,mobile-accessories,5698,Nike,992.25,eeb2178d-eca,India
2,2024-01-01 00:45:08+00:00,purchase,60,214441590000000059,kitchen-accessories,9459,Classic Wear,1616.24,80ff9ad4-be1,Germany
3,2024-01-01 00:45:57+00:00,view,54,214441590000000053,mobile-accessories,1408,Fashion Trends,1190.9,69d76943-946,India
4,2024-01-01 00:56:58+00:00,view,32,214441590000000031,mens-shoes,6950,Unknown,734.68,e892243a-0dc,USA


In [None]:
df = df_train.copy()
# Features on df_features
churn_features = df_features.groupby('user_id').agg({
    'event_type': [
        ('total_purchases', lambda x: (x == 'purchase').sum()),
        ('total_views', lambda x: (x == 'view').sum()),
        ('total_carts', lambda x: (x == 'cart').sum())
    ],
    'user_session': [('session_counts', 'nunique')],
    'event_time': [
        ('session_duration_minutes', lambda x: (x.max() - x.min()).total_seconds() / 60 if len(x) > 1 else 0),
        ('time_since_last_event', lambda x: (observation_cutoff - x.max()).days)
    ],
    # Compute avg_purchase_value using grouped data
    'price': [
        ('avg_purchase_value', lambda x, event_type=df_features['event_type']: x[event_type[x.index] == 'purchase'].mean() if (event_type[x.index] == 'purchase').any() else 0)
    ],
    'country': [('most_frequent_country', lambda x: x.mode().iloc[0] if not x.empty else 'Unknown')],
    'category_code': [('most_frequent_category', lambda x: x.mode().iloc[0] if not x.empty else 'Unknown')],
    'brand': [('most_frequent_brand', lambda x: x.mode().iloc[0] if not x.empty else 'Unknown')]
}).reset_index()

# Flatten the multi-index columns
churn_features.columns = ['user_id', 'total_purchases', 'total_views', 'total_carts', 'session_counts',
                          'session_duration_minutes', 'time_since_last_event', 'avg_purchase_value',
                          'most_frequent_country', 'most_frequent_category', 'most_frequent_brand']

# Adjusted purchase_frequency (no leakage)
churn_features['purchase_frequency'] = churn_features['total_purchases'] / (churn_features['session_duration_minutes'] + 1e-6)
churn_features['session_decay'] = churn_features['session_counts'] / (churn_features['session_duration_minutes'] + 1e-6)
churn_features['cart_to_purchase_ratio'] = churn_features['total_carts'] / (churn_features['total_purchases'] + 1)

unique_categories_count = df_features.groupby('user_id')['category_code'].nunique().reset_index(name='unique_categories_count')
df_features['month'] = df_features['event_time'].dt.to_period('M')
product_trends = df_features[df_features['event_type'] == 'purchase'].groupby(['user_id', 'month']).size().unstack(fill_value=0)
product_trends = product_trends.mean(axis=1).reset_index(name='avg_monthly_purchases')

churn_features = churn_features.merge(unique_categories_count, on='user_id', how='left')
churn_features = churn_features.merge(product_trends, on='user_id', how='left')
churn_features = churn_features.merge(last_event[['user_id', 'churn']], on='user_id', how='left')

churn_features['avg_monthly_purchases'] = churn_features['avg_monthly_purchases'].fillna(0)
churn_features['avg_purchase_value'] = churn_features['avg_purchase_value'].fillna(0)
churn_features['unique_categories_count'] = churn_features['unique_categories_count'].fillna(0)
churn_features['time_since_last_event'] = churn_features['time_since_last_event'].fillna(90)  # Impute with median-ish value

churn_features.to_csv('churn_features.csv', index=False)
print('Features saved: churn_features.csv')
print(churn_features.head())

IndexError: single positional indexer is out-of-bounds

In [8]:
churn_features.head()

Unnamed: 0,user_id,total_purchases,total_views,total_carts,session_counts,session_duration_minutes,time_since_last_purchase,avg_purchase_value,most_frequent_country,most_frequent_category,most_frequent_brand,purchase_frequency,purchase_decay,session_decay,cart_to_purchase_ratio,unique_categories_count,avg_monthly_purchases,churn
0,1515915625500000665,1,0,0,1,0.0,258,5.99,India,groceries,Unknown,1728913000.0,6701213.0,1000000.0,0.0,1,0.055556,1
1,1515915625500002259,0,0,1,1,0.0,536,0.0,Germany,groceries,Unknown,0.0,0.0,1000000.0,1.0,1,0.0,1
2,1515915625500005095,0,0,1,1,0.0,347,0.0,Unknown,groceries,Unknown,0.0,0.0,1000000.0,1.0,1,0.0,1
3,1515915625500005966,1,0,0,1,0.0,107,39.99,USA,kitchen-accessories,Unknown,1741990000.0,16280280.0,1000000.0,0.0,1,0.055556,1
4,1515915625500008551,0,0,1,1,0.0,61,0.0,USA,beauty,Chic Cosmetics,0.0,0.0,1000000.0,1.0,1,0.0,0


In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from collections import Counter
import xgboost as xgb
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
import joblib
import plotly.express as px
import shap
import optuna

# Assume df_train and last_event are defined from earlier cells
def train_model():
    imputer = SimpleImputer(strategy='median')
    num_cols = churn_features.select_dtypes(include=['float', 'int64']).columns
    churn_features[num_cols] = imputer.fit_transform(churn_features[num_cols])

    str_cols = churn_features.select_dtypes(include=['object']).columns
    encoder = LabelEncoder()
    for col in str_cols:
        churn_features[col] = encoder.fit_transform(churn_features[col].astype(str))

    X = churn_features.drop(columns=['churn', 'user_id'])
    y = churn_features['churn']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define objective function with original scaled data
    def objective(trial):
        params = {
            'max_depth': trial.suggest_int('max_depth', 5, 12),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 200, 1200),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 1)
        }
        model = xgb.XGBClassifier(**params, random_state=42, eval_metric='logloss')
        return cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='f1').mean()

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50)
    best_params = study.best_params

    # Resample after tuning
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
    print(f"Resampled class distribution: {Counter(y_train_resampled)}")

    scale_pos_weight = Counter(y_train)[0] / Counter(y_train)[1] if Counter(y_train)[0] > 0 else 1.0
    xgb_model = xgb.XGBClassifier(**best_params, scale_pos_weight=scale_pos_weight, random_state=42, eval_metric='logloss')
    xgb_model.fit(pd.DataFrame(X_train_resampled, columns=X.columns), y_train_resampled)  # Use DataFrame with columns

    lgb_model = lgb.LGBMClassifier(random_state=42, class_weight='balanced', verbose=-1)
    lgb_model.fit(X_train_resampled, y_train_resampled)

    nn_model = Sequential([
        Input(shape=(X_train.shape[1],)),
        Dense(128, activation='relu', kernel_initializer='he_normal'),
        Dropout(0.3),
        Dense(64, activation='relu', kernel_initializer='he_normal'),
        Dropout(0.3),
        Dense(32, activation='relu', kernel_initializer='he_normal'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])
    nn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    nn_model.fit(X_train_resampled, y_train_resampled, epochs=50, batch_size=64, validation_split=0.2, verbose=1)

    # Evaluate
    print('XGBoost Classification Report:')
    y_pred_xgb = xgb_model.predict(X_test_scaled)
    print(classification_report(y_test, y_pred_xgb, zero_division=0))
    print(f'XGBoost ROC-AUC: {roc_auc_score(y_test, xgb_model.predict_proba(X_test_scaled)[:, 1]):.2f}')

    print('LightGBM Classification Report:')
    y_pred_lgb = lgb_model.predict(X_test_scaled)
    print(classification_report(y_test, y_pred_lgb, zero_division=0))
    print(f'LightGBM ROC-AUC: {roc_auc_score(y_test, lgb_model.predict_proba(X_test_scaled)[:, 1]):.2f}')

    print('Neural Network Classification Report:')
    y_pred_nn = (nn_model.predict(X_test_scaled) > 0.5).astype(int)
    print(classification_report(y_test, y_pred_nn, zero_division=0))
    print(f'Neural Network ROC-AUC: {roc_auc_score(y_test, nn_model.predict(X_test_scaled).flatten()):.2f}')

    # Time-series CV
    tscv = TimeSeriesSplit(n_splits=5)
    xgb_cv_scores = cross_val_score(xgb_model, X_train_resampled, y_train_resampled, cv=tscv, scoring='f1')
    print(f'XGBoost Time-Series Cross-Validation F1 Score: {xgb_cv_scores.mean():.2f} ± {xgb_cv_scores.std():.2f}')

    # SHAP and save
    explainer = shap.TreeExplainer(xgb_model)
    shap_values = explainer.shap_values(X_test_scaled)
    shap.summary_plot(shap_values, X_test, show=False)
    import matplotlib.pyplot as plt
    plt.savefig('shap_summary.png')
    plt.close()

    # Save model and feature names
    joblib.dump(xgb_model, 'churn_model.pkl')
    with open('feature_names.txt', 'w') as f:
        f.write('\n'.join(X.columns))
    fig = px.bar(x=xgb_model.feature_importances_, y=X.columns, title='XGBoost Feature Importance')
    fig.update_layout(xaxis_title='Importance', yaxis_title='Feature')
    fig.write_html('feature_importance.html')
    print('Model, SHAP plot, and feature importance saved: churn_model.pkl, shap_summary.png, feature_importance.html')

if __name__ == '__main__':
    train_model()

[I 2025-08-25 06:56:29,816] A new study created in memory with name: no-name-6bec2e49-7b8e-4e57-adff-8d36af65f04b
[I 2025-08-25 06:57:27,001] Trial 0 finished with value: 1.0 and parameters: {'max_depth': 6, 'learning_rate': 0.13309067227929672, 'n_estimators': 1131, 'subsample': 0.6842345947716565, 'colsample_bytree': 0.8910038630104786, 'reg_alpha': 0.9007974446887231, 'reg_lambda': 0.4640227904400067}. Best is trial 0 with value: 1.0.
[I 2025-08-25 06:57:46,683] Trial 1 finished with value: 1.0 and parameters: {'max_depth': 10, 'learning_rate': 0.05889908213832631, 'n_estimators': 330, 'subsample': 0.7513416148243595, 'colsample_bytree': 0.9307571445694524, 'reg_alpha': 0.6521935743337004, 'reg_lambda': 0.19119210702306977}. Best is trial 0 with value: 1.0.
[I 2025-08-25 06:58:41,650] Trial 2 finished with value: 1.0 and parameters: {'max_depth': 9, 'learning_rate': 0.23908780171940378, 'n_estimators': 1052, 'subsample': 0.7723412292339549, 'colsample_bytree': 0.995989792092185, 're

Resampled class distribution: Counter({1.0: 54583, 0.0: 54583})
Epoch 1/50
[1m1365/1365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9348 - loss: 0.1645 - val_accuracy: 0.9971 - val_loss: 0.0378
Epoch 2/50
[1m1365/1365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9743 - loss: 0.0689 - val_accuracy: 0.9987 - val_loss: 0.0300
Epoch 3/50
[1m1365/1365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9835 - loss: 0.0442 - val_accuracy: 0.9977 - val_loss: 0.0183
Epoch 4/50
[1m1365/1365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9883 - loss: 0.0325 - val_accuracy: 0.9947 - val_loss: 0.0227
Epoch 5/50
[1m1365/1365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9902 - loss: 0.0264 - val_accuracy: 0.9994 - val_loss: 0.0076
Epoch 6/50
[1m1365/1365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9919 - loss: 0


X does not have valid feature names, but LGBMClassifier was fitted with feature names


X does not have valid feature names, but LGBMClassifier was fitted with feature names




LightGBM ROC-AUC: 1.00
Neural Network Classification Report:
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      2674
         1.0       1.00      1.00      1.00     13646

    accuracy                           1.00     16320
   macro avg       0.99      1.00      0.99     16320
weighted avg       1.00      1.00      1.00     16320

[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Neural Network ROC-AUC: 1.00



F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.


F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.



XGBoost Time-Series Cross-Validation F1 Score: 0.60 ± 0.49
Model, SHAP plot, and feature importance saved: churn_model.pkl, shap_summary.png, feature_importance.html


In [9]:
#New Code for model training and evaluation

def train_model():
    # Prepare data for ML
    imputer = SimpleImputer(strategy='median')
    num_cols = churn_features.select_dtypes(include=['float', 'int64']).columns
    churn_features[num_cols] = imputer.fit_transform(churn_features[num_cols])

    str_cols = churn_features.select_dtypes(include=['object']).columns
    encoder = LabelEncoder()
    for col in str_cols:
        churn_features[col] = encoder.fit_transform(churn_features[col].astype(str))

    X = churn_features.drop(columns=['churn', 'user_id'])
    y = churn_features['churn']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define objective function with original scaled data
    def objective(trial):
        params = {
            'max_depth': trial.suggest_int('max_depth', 5, 12),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 200, 1200),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 1)
        }
        model = xgb.XGBClassifier(**params, random_state=42, eval_metric='logloss')
        return cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='f1').mean()

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50)
    best_params = study.best_params

    # Resample after tuning
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
    print(f"Resampled class distribution: {Counter(y_train_resampled)}")

    scale_pos_weight = Counter(y_train)[0] / Counter(y_train)[1] if Counter(y_train)[0] > 0 else 1.0
    xgb_model = xgb.XGBClassifier(**best_params, scale_pos_weight=scale_pos_weight, random_state=42, eval_metric='logloss')
    xgb_model.fit(pd.DataFrame(X_train_resampled, columns=X.columns), y_train_resampled)  # Use DataFrame with columns

    # Validate on test set
    y_pred_test = xgb_model.predict(X_test_scaled)
    print("Test Set Classification Report:")
    print(classification_report(y_test, y_pred_test, zero_division=0))
    print(f"Test Set ROC-AUC: {roc_auc_score(y_test, xgb_model.predict_proba(X_test_scaled)[:, 1]):.2f}")

    lgb_model = lgb.LGBMClassifier(random_state=42, class_weight='balanced', verbose=-1)
    lgb_model.fit(X_train_resampled, y_train_resampled)

    nn_model = Sequential([
        Input(shape=(X_train.shape[1],)),
        Dense(128, activation='relu', kernel_initializer='he_normal'),
        Dropout(0.3),
        Dense(64, activation='relu', kernel_initializer='he_normal'),
        Dropout(0.3),
        Dense(32, activation='relu', kernel_initializer='he_normal'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])
    nn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    nn_model.fit(X_train_resampled, y_train_resampled, epochs=50, batch_size=64, validation_split=0.2, verbose=1)

    # Evaluate
    print('XGBoost Classification Report:')
    y_pred_xgb = xgb_model.predict(X_test_scaled)
    print(classification_report(y_test, y_pred_xgb, zero_division=0))
    print(f'XGBoost ROC-AUC: {roc_auc_score(y_test, xgb_model.predict_proba(X_test_scaled)[:, 1]):.2f}')

    print('LightGBM Classification Report:')
    y_pred_lgb = lgb_model.predict(X_test_scaled)
    print(classification_report(y_test, y_pred_lgb, zero_division=0))
    print(f'LightGBM ROC-AUC: {roc_auc_score(y_test, lgb_model.predict_proba(X_test_scaled)[:, 1]):.2f}')

    print('Neural Network Classification Report:')
    y_pred_nn = (nn_model.predict(X_test_scaled) > 0.5).astype(int)
    print(classification_report(y_test, y_pred_nn, zero_division=0))
    print(f'Neural Network ROC-AUC: {roc_auc_score(y_test, nn_model.predict(X_test_scaled).flatten()):.2f}')

    # Time-series CV
    tscv = TimeSeriesSplit(n_splits=5)
    xgb_cv_scores = cross_val_score(xgb_model, X_train_resampled, y_train_resampled, cv=tscv, scoring='f1')
    print(f'XGBoost Time-Series Cross-Validation F1 Score: {xgb_cv_scores.mean():.2f} ± {xgb_cv_scores.std():.2f}')

    # SHAP and save
    explainer = shap.TreeExplainer(xgb_model)
    shap_values = explainer.shap_values(X_test_scaled)
    shap.summary_plot(shap_values, X_test, show=False)
    import matplotlib.pyplot as plt
    plt.savefig('shap_summary.png')
    plt.close()

    # Save model and feature names
    joblib.dump(xgb_model, 'churn_model.pkl')
    with open('feature_names.txt', 'w') as f:
        f.write('\n'.join(X.columns))
    fig = px.bar(x=xgb_model.feature_importances_, y=X.columns, title='XGBoost Feature Importance')
    fig.update_layout(xaxis_title='Importance', yaxis_title='Feature')
    fig.write_html('feature_importance.html')
    print('Model, SHAP plot, and feature importance saved: churn_model.pkl, shap_summary.png, feature_importance.html')

if __name__ == '__main__':
    train_model()

[I 2025-08-28 09:16:28,443] A new study created in memory with name: no-name-350bbabc-1f3d-49d8-94eb-01a3ab8f0699
[I 2025-08-28 09:16:57,413] Trial 0 finished with value: 0.9989657089500634 and parameters: {'max_depth': 6, 'learning_rate': 0.2640525966550623, 'n_estimators': 804, 'subsample': 0.9862152005443106, 'colsample_bytree': 0.6466231219118656, 'reg_alpha': 0.6585498233862908, 'reg_lambda': 0.1030199230717661}. Best is trial 0 with value: 0.9989657089500634.
[I 2025-08-28 09:17:57,505] Trial 1 finished with value: 0.9989198656801952 and parameters: {'max_depth': 11, 'learning_rate': 0.011390624164262429, 'n_estimators': 1071, 'subsample': 0.9132754197096136, 'colsample_bytree': 0.9208347906861449, 'reg_alpha': 0.3988576460858346, 'reg_lambda': 0.03155760924839901}. Best is trial 0 with value: 0.9989657089500634.
[I 2025-08-28 09:18:29,181] Trial 2 finished with value: 0.9989840499353166 and parameters: {'max_depth': 8, 'learning_rate': 0.023265422997984606, 'n_estimators': 1038,

Resampled class distribution: Counter({1.0: 54594, 0.0: 54594})
Test Set Classification Report:
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99      2706
         1.0       1.00      1.00      1.00     13649

    accuracy                           1.00     16355
   macro avg       0.99      1.00      1.00     16355
weighted avg       1.00      1.00      1.00     16355

Test Set ROC-AUC: 1.00
Epoch 1/50
[1m1365/1365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 9ms/step - accuracy: 0.9380 - loss: 0.1521 - val_accuracy: 0.9997 - val_loss: 0.0260
Epoch 2/50
[1m1365/1365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 5ms/step - accuracy: 0.9776 - loss: 0.0579 - val_accuracy: 0.9939 - val_loss: 0.0470
Epoch 3/50
[1m1365/1365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.9861 - loss: 0.0428 - val_accuracy: 0.9990 - val_loss: 0.0139
Epoch 4/50
[1m1365/1365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[



              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      2706
         1.0       1.00      1.00      1.00     13649

    accuracy                           1.00     16355
   macro avg       1.00      1.00      1.00     16355
weighted avg       1.00      1.00      1.00     16355

LightGBM ROC-AUC: 1.00
Neural Network Classification Report:
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      2706
         1.0       1.00      1.00      1.00     13649

    accuracy                           1.00     16355
   macro avg       0.99      1.00      0.99     16355
weighted avg       1.00      1.00      1.00     16355

[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Neural Network ROC-AUC: 1.00


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


XGBoost Time-Series Cross-Validation F1 Score: 0.60 ± 0.49
Model, SHAP plot, and feature importance saved: churn_model.pkl, shap_summary.png, feature_importance.html
