In [73]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
df = pd.read_parquet('../data/processed/customer_churn_large_dataset.parquet')
df.head()

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn,Avg_Usage_GB,Cost_Per_GB
0,0.865385,1.0,0.5,0.695652,0.619429,0.413333,0.0,0.023697,0.114158
1,0.846154,0.0,1.0,0.0,0.268,0.271111,0.0,0.341255,0.004859
2,0.115385,0.0,0.5,0.173913,0.792429,0.911111,0.0,0.180586,0.018968
3,0.346154,0.0,0.75,0.086957,0.970571,0.548889,1.0,0.194644,0.020285
4,0.538462,0.0,0.75,0.782609,0.402,0.48,0.0,0.023933,0.089427


In [43]:
# Dividing the data into x and y and furthur train and test
y = df['Churn']
X = df.drop(columns=['Churn'],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = True, random_state=42)

In [71]:
lr = LogisticRegression()
model = lr.fit(X_train, y_train)
y_predict_test = model.predict(X_test)
y_predict_train = model.predict(X_train)

train_auc = roc_auc_score(y_predict_train, y_train)
test_auc = roc_auc_score(y_predict_test, y_test)
print(f"Train AUC: {train_auc:.4f}, Test AUC: {test_auc:.4f}")

Train AUC: 0.5029, Test AUC: 0.5007


In [74]:
rf = RandomForestClassifier()
model = rf.fit(X_train, y_train)

y_predict_test = model.predict(X_test)
y_predict_train = model.predict(X_train)

train_auc = roc_auc_score(y_predict_train, y_train)
test_auc = roc_auc_score(y_predict_test, y_test)
print(f"Train AUC: {train_auc:.4f}, Test AUC: {test_auc:.4f}")

Train AUC: 1.0000, Test AUC: 0.5053


In [44]:
# Converting train and test data into dmatrix for xgboost use
dtrain = xgb.DMatrix(data = X_train, label = y_train)
dtest = xgb.DMatrix(data = X_test, label = y_test)

In [None]:
# Training parameters with tweaked values
xgb_params = {
    'objective': 'binary:logistic',
    'max_depth': 6,  
    'learning_rate': 0.03,  
    'eval_metric': 'auc', 
    'subsample': 0.8,  
    'colsample_bytree': 0.8, 
    'seed': 42
}

# Evaluation function
def eval_function(preds, dtrain):
    labels = dtrain.get_label()
    auc_score = roc_auc_score(labels, preds)
    return 'custom_auc', auc_score

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store scores
train_auc_scores = []
val_auc_scores = []

# Perform k-fold cross-validation
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    X_train_fold = X_train.iloc[train_idx]
    y_train_fold = y_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_val_fold = y_train.iloc[val_idx]

    # Create DMatrix for XGBoost
    dtrain_fold = xgb.DMatrix(X_train_fold, label=y_train_fold)
    dval_fold = xgb.DMatrix(X_val_fold, label=y_val_fold)

    # Train XGBoost model
    model = xgb.train(xgb_params, dtrain_fold, num_boost_round=200)

    # Predict on training and validation sets
    y_pred_train = model.predict(dtrain_fold)
    y_pred_val = model.predict(dval_fold)

    # Calculate AUC scores
    train_auc = roc_auc_score(y_train_fold, y_pred_train)
    val_auc = roc_auc_score(y_val_fold, y_pred_val)

    train_auc_scores.append(train_auc)
    val_auc_scores.append(val_auc)

    print(f"Fold {fold+1}, Train AUC: {train_auc:.4f}, Validation AUC: {val_auc:.4f}")

# Train a final model on the entire training set
final_model = xgb.train(xgb_params, dtrain, num_boost_round=200,custom_metric=eval_function,maximize=True)

# Predict on test data and fit on train data
y_pred_test = final_model.predict(dtest)
y_pred_train = final_model.predict(dtrain)

# AUC score for whole data
train_auc = roc_auc_score(y_train, y_pred_train)
test_auc = roc_auc_score(y_test, y_pred_test)
print(f"Train AUC: {train_auc:.4f}, Test AUC: {test_auc:.4f}")

Fold 1, Train AUC: 0.6989, Validation AUC: 0.4993
Fold 2, Train AUC: 0.6924, Validation AUC: 0.5026
Fold 3, Train AUC: 0.6902, Validation AUC: 0.4998
Fold 4, Train AUC: 0.6917, Validation AUC: 0.5011
Fold 5, Train AUC: 0.6932, Validation AUC: 0.5040
Train AUC: 0.6765, Test AUC: 0.5030
