In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Location of the churn CSV. Set the CHURN_DATA_PATH environment variable to run
# this notebook on another machine; when it is unset the original local path is used.
DATA_PATH = os.environ.get(
    "CHURN_DATA_PATH",
    r'C:\Users\allur\OneDrive\Desktop\project\customer_churn_sample_3000.csv',
)
TARGET = 'Churn'

df = pd.read_csv(DATA_PATH)

# Drop the identifier column when present (no-op otherwise); a new frame is
# bound to `df` instead of mutating in place.
df = df.drop(columns=['customerID'], errors='ignore')

# Coerce TotalCharges to numeric; entries that cannot be parsed become NaN.
if 'TotalCharges' in df.columns:
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

if TARGET not in df.columns:
    raise ValueError("Expected 'Churn' in columns.")

# Encode the target as 1/0. The mapping goes into a temporary Series first so
# that any label other than 'Yes'/'No' is still available for the error message.
churn_encoded = df[TARGET].map({'Yes': 1, 'No': 0})
unmapped_rows = churn_encoded.isna()
if unmapped_rows.any():
    bad_labels = df.loc[unmapped_rows, TARGET].unique().tolist()
    raise ValueError(
        "Churn mapping produced NaNs. Check unique in df['Churn']. "
        f"Unmapped values: {bad_labels!r}"
    )
df[TARGET] = churn_encoded

print("shape:", df.shape)
print("Numeric columns:", df.select_dtypes(include=np.number).columns.tolist())
print("Non-numeric columns:", df.select_dtypes(exclude=np.number).columns.tolist())
df.head(3)

shape: (3000, 20)
Numeric columns: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn']
Non-numeric columns: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Male,0,No,Yes,16,Yes,No phone service,Fiber optic,No,No internet service,No internet service,Yes,Yes,No internet service,One year,No,Credit card,94.36,1522.96,0
1,Male,0,No,No,65,Yes,No phone service,No,Yes,No internet service,No internet service,No internet service,No,Yes,Month-to-month,Yes,Mailed check,96.22,6237.08,0
2,Male,1,Yes,No,10,No,No phone service,Fiber optic,No internet service,No,Yes,No,No,No internet service,Month-to-month,No,Bank transfer,79.7,797.02,0


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from xgboost import XGBClassifier
import numpy as np
import joblib
import os
# Separate the label from the features and record which feature columns are
# numeric and which are categorical (everything that is not numeric).
TARGET = 'Churn'
y = df[TARGET]
num_cols = [
    col for col in df.select_dtypes(include=np.number).columns if col != TARGET
]
cat_cols = [col for col in df.columns if col not in set(num_cols) | {TARGET}]
X = df[num_cols + cat_cols]

# Stratified 70/15/15 split: hold out 30% first, then cut that holdout in half.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)
print(
    "Shapes — train/valid/test:",
    *(part.shape for part in (X_train, X_valid, X_test)),
)
# Numeric features: fill missing values with the median, then standardise.
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: fill missing values with the most frequent level, then
# one-hot encode to a dense array. The dense-output keyword is tried as
# `sparse_output` first, falling back to `sparse` when that keyword is rejected.
try:
    dense_ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    dense_ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", dense_ohe),
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols)]
)
# Class-balance weight: number of negatives per positive in the training labels
# (the denominator is floored at 1 so an all-negative split cannot divide by zero).
pos = int(y_train.eq(1).sum())
neg = int(y_train.eq(0).sum())
scale_pos = neg / (pos if pos >= 1 else 1)

# Baseline XGBoost configuration, collected in one place for readability.
xgb_params = dict(
    n_estimators=400,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
    objective="binary:logistic",
    eval_metric="logloss",
    scale_pos_weight=scale_pos,
)
xgb_model = XGBClassifier(**xgb_params)

# Preprocessing and the model are fitted together on the training split only.
xgb_pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", xgb_model)])
xgb_pipe.fit(X_train, y_train)

# Score the validation split: positive-class probability, thresholded at 0.5.
y_val_proba = xgb_pipe.predict_proba(X_valid)[:, 1]
y_val_pred  = (y_val_proba >= 0.5).astype(int)

print("\n=== XGBoost (Validation) ===")
for label, score in (
    ("Accuracy:", accuracy_score(y_valid, y_val_pred)),
    ("F1     :", f1_score(y_valid, y_val_pred)),
    ("ROC-AUC:", roc_auc_score(y_valid, y_val_proba)),
):
    print(label, round(score, 4))
print("\nReport:\n", classification_report(y_valid, y_val_pred))

# Persist the fitted baseline pipeline (preprocessing + model) for later cells.
os.makedirs("models", exist_ok=True)
joblib.dump(xgb_pipe, "models/baseline_xgb_pipeline.joblib")
print("Saved: models/baseline_xgb_pipeline.joblib")





Shapes — train/valid/test: (2100, 19) (450, 19) (450, 19)

=== XGBoost (Validation) ===
Accuracy: 0.54
F1     : 0.539
ROC-AUC: 0.532

Report:
               precision    recall  f1-score   support

           0       0.54      0.54      0.54       225
           1       0.54      0.54      0.54       225

    accuracy                           0.54       450
   macro avg       0.54      0.54      0.54       450
weighted avg       0.54      0.54      0.54       450

Saved: models/baseline_xgb_pipeline.joblib


In [3]:
# === 5-fold CV comparison: Logistic Regression vs. XGBoost (no groups) ===
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import numpy as np, joblib

# 1) Load preprocessor/model from the saved baseline pipeline.
#    NOTE: joblib.load unpickles arbitrary objects; only load files that this
#    notebook (or another trusted source) wrote.
loaded_pipe   = joblib.load("models/baseline_xgb_pipeline.joblib")
preprocessor  = loaded_pipe.named_steps['preprocessor']
xgb_model     = loaded_pipe.named_steps['model']  # reuse same params

# 2) CV setup (no groups): shuffled, stratified, seeded folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = {"roc_auc": "roc_auc", "f1": "f1", "precision": "precision", "recall": "recall"}

# 3) Pipelines for CV. Each pipeline receives its own unfitted clone of the
#    loaded steps, so fitting one pipeline directly can never mutate the loaded
#    baseline or the other pipeline through a shared object.
lr_bal = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("model", LogisticRegression(max_iter=500, class_weight="balanced"))
])

xgb_cv_pipe = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("model", clone(xgb_model))
])

# 4) Run CV on TRAIN only (validation and test splits stay untouched).
lr_cv  = cross_validate(lr_bal,      X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
xgb_cv = cross_validate(xgb_cv_pipe, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)

def summarize(name, res, metrics=None):
    """Print the mean ± standard deviation of each cross-validated test metric.

    Parameters
    ----------
    name : str
        Label shown in the header line.
    res : mapping
        Results keyed as ``"test_<metric>"``, each holding one score per fold.
    metrics : iterable of str, optional
        Metric names to report, in order. When omitted, every ``test_*`` key
        of ``res`` is reported in key order, so the function does not depend
        on a notebook-global ``scoring`` dict.

    Raises
    ------
    ValueError
        If there is no metric to report.
    KeyError
        If a requested metric has no ``"test_<metric>"`` entry in ``res``.
    """
    prefix = "test_"
    if metrics is None:
        metrics = [key[len(prefix):] for key in res if key.startswith(prefix)]
    else:
        metrics = list(metrics)
    if not metrics:
        raise ValueError("summarize: no 'test_*' metrics found in res.")

    # The fold count comes from the scores themselves rather than a constant.
    n_folds = len(res[f"{prefix}{metrics[0]}"])
    print(f"\n{name} — {n_folds}-fold CV (TRAIN)")
    for metric in metrics:
        fold_scores = np.asarray(res[f"{prefix}{metric}"], dtype=float)
        print(f"{metric:>9}: {fold_scores.mean():.4f} ± {fold_scores.std():.4f}")

# Report both models with the same formatter, logistic regression first.
for model_label, cv_result in (
    ("Logistic Regression (balanced)", lr_cv),
    ("XGBoost", xgb_cv),
):
    summarize(model_label, cv_result)



Logistic Regression (balanced) — 5-fold CV (TRAIN)
  roc_auc: 0.5026 ± 0.0185
       f1: 0.4973 ± 0.0210
precision: 0.5013 ± 0.0157
   recall: 0.4938 ± 0.0301

XGBoost — 5-fold CV (TRAIN)
  roc_auc: 0.4994 ± 0.0173
       f1: 0.5027 ± 0.0140
precision: 0.5023 ± 0.0103
   recall: 0.5033 ± 0.0208


In [4]:
# === Cell 4: Quick XGBoost tuning with RandomizedSearchCV ===
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from scipy.stats import randint, uniform
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
import numpy as np, joblib, os

# Preconditions: fail early with a readable message when an earlier cell was skipped.
assert 'preprocessor' in globals(), "Run the cell that creates `preprocessor` first."
assert not [
    v for v in ['X_train', 'y_train', 'X_valid', 'y_valid'] if v not in globals()
], "Missing train/valid splits."
assert 'scale_pos' in globals(), "Compute scale_pos from y_train before running."

# Untuned estimator; the searched hyper-parameters are supplied by the search.
# The class-imbalance weight comes from the training labels.
xgb_base = XGBClassifier(
    random_state=42,
    n_jobs=-1,
    objective="binary:logistic",
    eval_metric="logloss",
    scale_pos_weight=scale_pos,
)

# Same preprocessing as the baseline, wrapped with the untuned model.
xgb_pipe_rs = Pipeline(steps=[("preprocessor", preprocessor), ("model", xgb_base)])

# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
param_dist = dict(
    model__n_estimators=randint(300, 800),
    model__max_depth=randint(3, 7),
    model__learning_rate=uniform(0.03, 0.15),
    model__subsample=uniform(0.7, 0.3),
    model__colsample_bytree=uniform(0.7, 0.3),
    model__reg_lambda=uniform(0.0, 2.0),
)

# 30 random draws from the search space, each scored by 5-fold CV ROC-AUC.
search_settings = dict(
    n_iter=30,
    cv=5,
    scoring="roc_auc",
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
rs = RandomizedSearchCV(xgb_pipe_rs, param_dist, **search_settings)

# The search only ever sees the training split.
rs.fit(X_train, y_train)

print("\nBest CV ROC-AUC:", round(rs.best_score_, 4))
print("Best params:\n", rs.best_params_)

# Pipeline refitted by the search with the winning parameters.
best_xgb = rs.best_estimator_

# Score it on the hold-out validation split (0.5 probability threshold).
val_proba = best_xgb.predict_proba(X_valid)[:, 1]
val_pred  = (val_proba >= 0.5).astype(int)

print("\n=== Tuned XGBoost (Validation) ===")
for label, score in (
    ("Accuracy:", accuracy_score(y_valid, val_pred)),
    ("F1     :", f1_score(y_valid, val_pred)),
    ("ROC-AUC:", roc_auc_score(y_valid, val_proba)),
):
    print(label, round(score, 4))
print("\nReport:\n", classification_report(y_valid, val_pred))

# Persist the tuned pipeline next to the baseline.
os.makedirs("models", exist_ok=True)
joblib.dump(best_xgb, "models/best_xgb_pipeline.joblib")
print("Saved: models/best_xgb_pipeline.joblib")


Fitting 5 folds for each of 30 candidates, totalling 150 fits

Best CV ROC-AUC: 0.5077
Best params:
 {'model__colsample_bytree': np.float64(0.8481386789093172), 'model__learning_rate': np.float64(0.1084099244072991), 'model__max_depth': 5, 'model__n_estimators': 645, 'model__reg_lambda': np.float64(0.05083825348819038), 'model__subsample': np.float64(0.7323674280979913)}

=== Tuned XGBoost (Validation) ===
Accuracy: 0.5111
F1     : 0.5133
ROC-AUC: 0.519

Report:
               precision    recall  f1-score   support

           0       0.51      0.51      0.51       225
           1       0.51      0.52      0.51       225

    accuracy                           0.51       450
   macro avg       0.51      0.51      0.51       450
weighted avg       0.51      0.51      0.51       450

Saved: models/best_xgb_pipeline.joblib


In [5]:
# === Cell 5: Final Evaluation on Test Set ===
import joblib
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

# Reload the tuned pipeline from disk so the evaluated artifact is the saved one.
best_xgb_loaded = joblib.load("models/best_xgb_pipeline.joblib")

# Test-set predictions: positive-class probability, thresholded at 0.5.
test_proba = best_xgb_loaded.predict_proba(X_test)[:, 1]
test_pred = (test_proba >= 0.5).astype(int)

# Same three headline metrics and report as the validation cells.
print("\n=== Final Tuned XGBoost (Test Set) ===")
for label, score in (
    ("Accuracy:", accuracy_score(y_test, test_pred)),
    ("F1     :", f1_score(y_test, test_pred)),
    ("ROC-AUC:", roc_auc_score(y_test, test_proba)),
):
    print(label, round(score, 4))
print("\nReport:\n", classification_report(y_test, test_pred))



=== Final Tuned XGBoost (Test Set) ===
Accuracy: 0.5156
F1     : 0.5281
ROC-AUC: 0.5255

Report:
               precision    recall  f1-score   support

           0       0.51      0.49      0.50       224
           1       0.52      0.54      0.53       226

    accuracy                           0.52       450
   macro avg       0.52      0.52      0.52       450
weighted avg       0.52      0.52      0.52       450

