# In [None]:
#!/usr/bin/env python3
"""
Telco churn training script with data preparation, feature engineering,
two-model training (LogisticRegression, RandomForest), hyperparameter tuning,
evaluation, and best-model export.

Expected CSV columns:
customerID, gender, SeniorCitizen, Partner, Dependents, tenure, PhoneService,
MultipleLines, InternetService, Contract, PaymentMethod, MonthlyCharges, TotalCharges, Churn
"""
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             classification_report, confusion_matrix, roc_auc_score)
import joblib
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

def load_and_clean(path):
    """Load the telco CSV and return a cleaned DataFrame.

    Steps performed:
      1. Verify that every expected column is present.
      2. Strip surrounding whitespace from text cells. Missing cells stay
         missing, and cells that are empty after stripping become missing.
      3. Coerce the numeric columns; unparseable values become NaN.
      4. Drop rows without a target and rows with fewer than 60% of the
         columns populated.

    Args:
        path: Path (or any object accepted by ``pandas.read_csv``) of the CSV.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        ValueError: If any expected column is absent from the CSV.
    """
    df = pd.read_csv(path)
    expected = ["customerID", "gender", "SeniorCitizen", "Partner", "Dependents", "tenure",
                "PhoneService", "MultipleLines", "InternetService", "Contract", "PaymentMethod",
                "MonthlyCharges", "TotalCharges", "Churn"]
    missing = [c for c in expected if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns in CSV: {missing}")

    # Clean whitespace in text columns (object dtype or a dedicated string dtype).
    text_cols = [
        c for c in df.columns
        if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c])
    ]
    for col in text_cols:
        raw = df[col]
        stripped = raw.astype(str).str.strip()
        # astype(str) renders missing cells as the literal "nan"; restore them
        # to real missing values so the dropna calls below can see them, and
        # treat whitespace-only cells as missing as well.
        df[col] = stripped.where(raw.notna() & (stripped != ""))

    # Coerce numeric columns; anything unparseable becomes NaN for the imputer.
    for col in ("TotalCharges", "SeniorCitizen", "tenure", "MonthlyCharges"):
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Drop rows with target missing
    df = df.dropna(subset=["Churn"])

    # Drop extremely sparse rows: keep rows with at least 60% of columns present
    thresh = int(df.shape[1] * 0.6)
    df = df.dropna(thresh=thresh)

    return df

def cap_outliers_iqr(df, cols, factor=1.5):
    """Cap values of the given columns to their IQR fences, mutating ``df``.

    For every column in ``cols`` that exists in ``df``, values below
    ``Q1 - factor * IQR`` are raised to that bound and values above
    ``Q3 + factor * IQR`` are lowered to it. Missing values are left as-is.

    Columns are skipped when they are absent, contain no non-missing values,
    or have an IQR of zero. With a zero IQR both fences collapse onto a single
    value, so capping would overwrite every differing entry (for example it
    would erase a mostly-zero 0/1 flag).

    Args:
        df: DataFrame to modify in place.
        cols: Iterable of column names to cap.
        factor: Multiplier applied to the IQR to obtain the fences.

    Returns:
        The same ``df`` object that was passed in.
    """
    for c in cols:
        if c not in df.columns:
            continue
        series = df[c].dropna()
        if series.empty:
            continue
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        if iqr == 0:
            # Degenerate spread: the fences carry no outlier information.
            continue
        low = q1 - factor * iqr
        high = q3 + factor * iqr
        # NaN compares False on both sides, so missing values pass through.
        df[c] = np.where(df[c] < low, low, df[c])
        df[c] = np.where(df[c] > high, high, df[c])
    return df

def build_preprocessor(numeric_features, categorical_features):
    """Build the ColumnTransformer that prepares raw features for the models.

    Args:
        numeric_features: Column names handled by the numeric branch.
        categorical_features: Column names handled by the categorical branch.

    Returns:
        An unfitted ``ColumnTransformer``; columns not listed in either
        argument are dropped.
    """
    # Numeric: median impute + robust scaling (resistant to outliers)
    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", RobustScaler())
    ])

    # The dense-output keyword was renamed from `sparse` to `sparse_output`
    # and the old name later removed; try the new one first and fall back so
    # the script runs on both older and newer scikit-learn releases.
    try:
        onehot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        onehot = OneHotEncoder(handle_unknown="ignore", sparse=False)

    # Categorical: most frequent impute + one-hot
    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", onehot)
    ])

    preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ], remainder="drop")

    return preprocessor

def main(args):
    """Train, tune, evaluate and export the best churn classifier.

    Loads the CSV at ``args.data``, runs a grid search over a logistic
    regression and a random forest (each with optional feature reduction),
    prints test-set metrics and saves the refitted best pipeline with joblib.

    Args:
        args: Namespace with ``data`` (CSV path) and ``model_out`` (output
            path, or a falsy value to use "best_telco_churn_model.joblib").
    """
    df = load_and_clean(args.data)

    # Map target; labels other than "Yes"/"No" become NaN and are dropped
    df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})
    df = df.dropna(subset=["Churn"])

    # Drop id
    X = df.drop(columns=["customerID", "Churn"])
    y = df["Churn"].astype(int)

    # Numeric / categorical lists
    numeric_features = ["SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"]
    categorical_features = [c for c in X.columns if c not in numeric_features]
    # SeniorCitizen is a 0/1 flag, not a continuous measurement, so it is
    # excluded from outlier capping (IQR fences would flatten it).
    continuous_features = ["tenure", "MonthlyCharges", "TotalCharges"]

    preprocessor = build_preprocessor(numeric_features, categorical_features)

    # Pipeline includes optional dimensionality step (SelectKBest or PCA) and classifier placeholder
    pipe = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("dimred", "passthrough"),      # will be set in GridSearchCV
        ("clf", LogisticRegression(max_iter=1000, solver="lbfgs"))
    ])

    # Nested parameters are applied to whichever object currently occupies the
    # step, so every reducer needs its own grid entry: "passthrough" accepts no
    # parameters, SelectKBest has no n_components and PCA has no k.
    no_reduction = {"dimred": ["passthrough"]}
    kbest_reduction = {
        "dimred": [SelectKBest(score_func=f_classif)],
        "dimred__k": [10, 15, "all"],
    }
    pca_reduction = {
        "dimred": [PCA()],
        "dimred__n_components": [5, 10],
    }
    logreg_params = {
        "clf": [LogisticRegression(max_iter=2000, solver="liblinear")],
        "clf__C": [0.01, 0.1, 1.0, 10.0],
        "clf__penalty": ["l2"],
    }
    forest_params = {
        "clf": [RandomForestClassifier(random_state=42, n_jobs=-1)],
        "clf__n_estimators": [100, 250],
        "clf__max_depth": [None, 10, 20],
    }
    param_grid = [
        # Logistic regression branch
        {**no_reduction, **logreg_params},
        {**kbest_reduction, **logreg_params},
        # Random forest branch
        {**no_reduction, **forest_params},
        {**kbest_reduction, **forest_params},
        {**pca_reduction, **forest_params},
    ]

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, stratify=y, random_state=42
    )

    # Cap outliers on the training rows only, after the split, so the fences
    # are not computed from held-out data. Test rows stay raw because the
    # exported pipeline has no capping step and will see raw data at inference.
    X_train = cap_outliers_iqr(X_train.copy(), continuous_features, factor=1.5)

    # Grid search
    gs = GridSearchCV(pipe, param_grid, cv=StratifiedKFold(n_splits=5), scoring="f1",
                      n_jobs=-1, verbose=1, refit=True)
    gs.fit(X_train, y_train)

    best = gs.best_estimator_
    print(f"Best params: {gs.best_params_}")

    # Evaluate on test set
    y_pred = best.predict(X_test)
    y_proba = best.predict_proba(X_test)[:, 1] if hasattr(best, "predict_proba") else None

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_test, y_proba) if y_proba is not None else None
    }

    print("Evaluation on test set:")
    for k, v in metrics.items():
        print(f"  {k}: {v:.4f}" if v is not None else f"  {k}: None")
    print("\nClassification report:")
    print(classification_report(y_test, y_pred, target_names=["No", "Yes"]))
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Save best model
    model_out = args.model_out or "best_telco_churn_model.joblib"
    joblib.dump(best, model_out)
    print(f"Saved best model to: {model_out}")

    # Short fairness/bias reflection
    print("\nFairness notes:")
    print("- Check model performance across sensitive groups (e.g., gender, SeniorCitizen).")
    print("- Compare precision/recall for subgroups; consider reweighting or separate thresholds if disparities exist.")
    print("- Document dataset collection and known limitations before deployment.")

if __name__ == "__main__":
    # Command-line entry point: parse the CSV path and the optional output
    # path, then hand the parsed namespace to main().
    cli = argparse.ArgumentParser(
        description="Train telco churn models with preprocessing and tuning"
    )
    cli.add_argument(
        "--data",
        required=True,
        help="Path to telco CSV file",
    )
    cli.add_argument(
        "--model-out",
        required=False,
        help="Output path for saved model (joblib)",
    )
    args = cli.parse_args()
    main(args)

# In [None]:
#!/usr/bin/env python3
"""
Train and evaluate a churn model for a telecom dataset.

Expected CSV columns:
customerID, gender, SeniorCitizen, Partner, Dependents, tenure, PhoneService,
MultipleLines, InternetService, Contract, PaymentMethod, MonthlyCharges, TotalCharges, Churn

Usage:
python telco_churn_model.py --data telco_churn.csv
"""

import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import joblib
import sys

def load_and_clean(path):
    """Load the telco CSV and return a cleaned DataFrame.

    Text cells are stripped of surrounding whitespace first, so that blank or
    whitespace-only cells count as missing. The numeric columns are then
    coerced (unparseable values become NaN) and rows without a target are
    dropped.

    Args:
        path: Path (or any object accepted by ``pandas.read_csv``) of the CSV.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        ValueError: If any expected column is absent from the CSV.
    """
    df = pd.read_csv(path)
    expected = ["customerID", "gender", "SeniorCitizen", "Partner", "Dependents", "tenure",
                "PhoneService", "MultipleLines", "InternetService", "Contract", "PaymentMethod",
                "MonthlyCharges", "TotalCharges", "Churn"]
    missing = [c for c in expected if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns in CSV: {missing}")

    # Some datasets have spaces in categorical values; strip strings.
    # Done before any dropna so whitespace-only cells are seen as missing.
    text_cols = [
        c for c in df.columns
        if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c])
    ]
    for col in text_cols:
        raw = df[col]
        # astype(str) keeps non-string objects (e.g. ints in a mixed column)
        # instead of letting .str.strip() turn them into NaN; the mask below
        # restores genuinely missing cells and blanks to NaN.
        stripped = raw.astype(str).str.strip()
        df[col] = stripped.where(raw.notna() & (stripped != ""))

    # Coerce numeric columns (some datasets have blanks in TotalCharges)
    for col in ("TotalCharges", "SeniorCitizen", "tenure", "MonthlyCharges"):
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Drop rows with missing target
    df = df.dropna(subset=["Churn"])

    return df

def build_pipeline(numeric_features, categorical_features):
    """Build the full preprocessing + random-forest pipeline.

    Args:
        numeric_features: Column names that are median-imputed and
            standard-scaled.
        categorical_features: Column names that are mode-imputed and
            one-hot encoded.

    Returns:
        An unfitted ``Pipeline`` with steps "preprocessor" and "clf".
    """
    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    # The dense-output keyword was renamed from `sparse` to `sparse_output`
    # and the old name later removed; try the new one first and fall back so
    # the script runs on both older and newer scikit-learn releases.
    try:
        onehot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        onehot = OneHotEncoder(handle_unknown="ignore", sparse=False)

    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", onehot)
    ])

    preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ])

    clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

    pipe = Pipeline(steps=[("preprocessor", preprocessor),
                           ("clf", clf)])
    return pipe

def main(args):
    """Train a random-forest churn pipeline, report metrics and save it.

    Args:
        args: Namespace with ``data`` (CSV path) and ``model_out`` (output
            path, or a falsy value to use "telco_churn_model.joblib").
    """
    df = load_and_clean(args.data)

    # Map target; any label other than "Yes"/"No" maps to NaN. Such rows
    # cannot be used for stratified splitting or fitting, so drop them.
    target = df["Churn"].map({"Yes": 1, "No": 0})
    labelled = target.notna()
    if not labelled.all():
        print(f"Dropping {int((~labelled).sum())} rows with unrecognized Churn labels",
              file=sys.stderr)
        df = df.loc[labelled]
        target = target.loc[labelled]

    # Drop identifier
    X = df.drop(columns=["customerID", "Churn"])
    y = target.astype(int)

    # Ensure numeric columns are numeric
    # 'SeniorCitizen' sometimes stored as int or object; coerce to numeric
    X["SeniorCitizen"] = pd.to_numeric(X["SeniorCitizen"], errors="coerce")

    numeric_features = ["SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"]
    categorical_features = [c for c in X.columns if c not in numeric_features]

    # Build pipeline
    pipe = build_pipeline(numeric_features, categorical_features)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Fit
    pipe.fit(X_train, y_train)

    # Evaluate
    y_pred = pipe.predict(X_test)
    y_proba = pipe.predict_proba(X_test)[:, 1] if hasattr(pipe.named_steps['clf'], "predict_proba") else None

    print("Accuracy:", accuracy_score(y_test, y_pred))
    if y_proba is not None:
        print("ROC AUC:", roc_auc_score(y_test, y_proba))
    print("Classification report:")
    print(classification_report(y_test, y_pred, target_names=["No", "Yes"]))
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Save model
    model_out = args.model_out or "telco_churn_model.joblib"
    joblib.dump(pipe, model_out)
    print(f"Saved trained pipeline to: {model_out}")

    # Optional: show top feature importances (best-effort; a failure here is
    # reported but must not abort the run, since the model is already saved)
    try:
        clf = pipe.named_steps["clf"]
        pre = pipe.named_steps["preprocessor"]
        # get feature names after one-hot
        ohe = pre.named_transformers_["cat"].named_steps["onehot"]
        cat_cols = pre.transformers_[1][2]  # categorical feature names list
        # feature names for ohe
        ohe_names = list(ohe.get_feature_names_out(cat_cols))
        feature_names = numeric_features + ohe_names
        importances = clf.feature_importances_
        top_idx = np.argsort(importances)[::-1][:10]
        print("Top feature importances:")
        for i in top_idx:
            print(f" {feature_names[i]}: {importances[i]:.4f}")
    except (AttributeError, KeyError, IndexError, ValueError) as exc:
        print(f"Skipping feature importances: {exc}", file=sys.stderr)

    # Show an example prediction for the first test row
    example = X_test.iloc[[0]]
    proba = pipe.predict_proba(example)[0][1] if hasattr(pipe.named_steps['clf'], "predict_proba") else None
    print("\nExample customer (first test row):")
    print(example.to_dict(orient="records")[0])
    if proba is not None:
        print(f"Predicted churn probability: {proba:.3f}")
    print("Predicted class:", "Yes" if pipe.predict(example)[0] == 1 else "No")

if __name__ == "__main__":
    # Command-line entry point: parse the CSV path and the optional output
    # path, then hand the parsed namespace to main().
    cli = argparse.ArgumentParser(description="Train a telco churn model")
    cli.add_argument(
        "--data",
        required=True,
        help="Path to telco CSV file",
    )
    cli.add_argument(
        "--model-out",
        required=False,
        help="Output path for saved model (joblib)",
    )
    args = cli.parse_args()
    main(args)