In [2]:


import os
import shutil
import glob
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from IPython.display import display
from google.colab import files

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import joblib

# Try XGBoost, else fallback to RandomForest
try:
    from xgboost import XGBClassifier
    HAVE_XGB = True
except Exception:
    from sklearn.ensemble import RandomForestClassifier
    HAVE_XGB = False

# Paths
ROOT = '/content'
RAW_PATH = os.path.join(ROOT, 'data', 'raw')
PROCESSED_PATH = os.path.join(ROOT, 'data', 'processed')
MODEL_DIR = os.path.join(ROOT, 'models')
os.makedirs(RAW_PATH, exist_ok=True)
os.makedirs(PROCESSED_PATH, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

print("Data directory:", RAW_PATH)
print("Model directory:", MODEL_DIR)
print("\n--- Upload your Telco CSV now when the browser dialog appears ---")

# Prompt user to upload file(s)
uploaded = files.upload()  # interactive browser upload

# Find uploaded CSV and move to canonical location
csv_path = os.path.join(RAW_PATH, 'telco_churn.csv')
uploaded_csvs = [name for name in uploaded.keys() if name.lower().endswith('.csv')]
if uploaded_csvs:
    # Move the first uploaded CSV to canonical path
    src_name = uploaded_csvs[0]
    try:
        # uploaded files are saved to current working directory in Colab, so move should work
        shutil.move(src_name, csv_path)
    except Exception:
        # Fallback: write bytes into canonical path
        with open(csv_path, 'wb') as f:
            f.write(uploaded[src_name])
    print("Uploaded dataset moved to:", csv_path)
else:
    print("No CSV uploaded. Will create a small synthetic demo dataset at:", csv_path)
    # Create synthetic demo dataset
    rng = np.random.RandomState(42)
    n = 500
    df_demo = pd.DataFrame({
        'customerID':[f'CUST{i:06d}' for i in range(n)],
        'gender': rng.choice(['Male','Female'], n),
        'SeniorCitizen': rng.choice([0,1], n, p=[0.85,0.15]),
        'Partner': rng.choice(['Yes','No'], n),
        'Dependents': rng.choice(['Yes','No'], n),
        'tenure': rng.randint(0,72,n),
        'PhoneService': rng.choice(['Yes','No'], n, p=[0.9,0.1]),
        'MultipleLines': rng.choice(['Yes','No','No phone service'], n),
        'InternetService': rng.choice(['DSL','Fiber optic','No'], n, p=[0.45,0.45,0.10]),
        'OnlineSecurity': rng.choice(['Yes','No','No internet service'], n),
        'OnlineBackup': rng.choice(['Yes','No','No internet service'], n),
        'DeviceProtection': rng.choice(['Yes','No','No internet service'], n),
        'TechSupport': rng.choice(['Yes','No','No internet service'], n),
        'StreamingTV': rng.choice(['Yes','No','No internet service'], n),
        'StreamingMovies': rng.choice(['Yes','No','No internet service'], n),
        'Contract': rng.choice(['Month-to-month','One year','Two year'], n, p=[0.6,0.25,0.15]),
        'PaperlessBilling': rng.choice(['Yes','No'], n),
        'PaymentMethod': rng.choice(['Electronic check','Mailed check','Bank transfer (automatic)','Credit card (automatic)'], n),
        'MonthlyCharges': np.round(rng.uniform(20,120,n),2)
    })
    # Calculate TotalCharges and ensure non-negative using clip(lower=0) — CORRECTED
    df_demo['TotalCharges'] = (df_demo['tenure'] * df_demo['MonthlyCharges']) + rng.normal(0,50,n)
    df_demo['TotalCharges'] = df_demo['TotalCharges'].clip(lower=0).round(2)
    # Generate churn probability and label
    prob = 0.15 + 0.25*(df_demo['Contract']=='Month-to-month').astype(float) + 0.001*(df_demo['MonthlyCharges']-20)
    prob = np.clip(prob, 0, 0.95)
    df_demo['Churn'] = np.where(rng.rand(n) < prob, 'Yes', 'No')
    df_demo.to_csv(csv_path, index=False)
    print("Synthetic dataset created.")

# Load CSV
try:
    df = pd.read_csv(csv_path)
    print("\nLoaded dataset rows:", df.shape[0], "cols:", df.shape[1])
except Exception as e:
    raise RuntimeError(f"Failed to read CSV at {csv_path}: {e}")

print("\nPreview (first 3 rows):")
display(df.head(3))

# Preprocessing function
def preprocess_telco(df_in):
    df = df_in.copy()
    # trim strings
    for c in df.select_dtypes(include=['object']).columns:
        df[c] = df[c].astype(str).str.strip()
    # TotalCharges numeric conversion
    if 'TotalCharges' in df.columns:
        df['TotalCharges'] = df['TotalCharges'].replace(' ', np.nan)
        df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    # Drop customerID
    if 'customerID' in df.columns:
        df = df.drop(columns=['customerID'])
    # Target churn mapping
    if 'Churn' in df.columns:
        # keep numeric if already binary 0/1
        if df['Churn'].dropna().isin([0,1]).all():
            df['churn'] = df['Churn']
        else:
            df['churn'] = df['Churn'].map({'Yes':1, 'No':0})
        df = df.drop(columns=['Churn'])
    # Binary map common
    binary_map = {'Yes':1, 'No':0, 'Male':1, 'Female':0}
    for col in list(df.columns):
        if df[col].dtype == 'object' and df[col].nunique() == 2:
            df[col] = df[col].map(binary_map)
    # One-hot for categorical (small cardinality)
    cat_cols = [c for c in df.select_dtypes(include=['object']).columns if df[c].nunique() <= 20]
    if cat_cols:
        df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
    # Impute numeric
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if 'churn' in num_cols:
        num_cols.remove('churn')
    if num_cols:
        imputer = SimpleImputer(strategy='median')
        df[num_cols] = imputer.fit_transform(df[num_cols])
    return df

# Preprocess and save processed CSV
processed = preprocess_telco(df)
processed_path = os.path.join(PROCESSED_PATH, 'telco_processed.csv')
processed.to_csv(processed_path, index=False)
print("\nProcessed data saved to:", processed_path)
print("Processed columns:", processed.columns.tolist())

# Ensure target exists
if 'churn' not in processed.columns:
    raise ValueError("Processed data does not contain target column 'churn'.")

# Prepare X and y
X = processed.drop(columns=['churn'])
y = processed['churn']
print("\nFeature matrix shape:", X.shape)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# Build pipeline
scaler = StandardScaler()
if HAVE_XGB:
    model = XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.05, use_label_encoder=False, eval_metric='auc', random_state=42)
    print("\nUsing XGBoost model.")
else:
    # Ensure RandomForestClassifier is imported if we fell back earlier
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
    print("\nXGBoost unavailable — using RandomForest.")

pipeline = Pipeline([
    ('scaler', scaler),
    ('model', model)
])

# Train
print("\nTraining model (this may take a short while)...")
pipeline.fit(X_train, y_train)
print("Training complete.")

# Evaluate
probs = pipeline.predict_proba(X_test)[:,1]
auc = roc_auc_score(y_test, probs)
preds = (probs >= 0.5).astype(int)
print(f"\nAUC on test set: {auc:.4f}")
print("\nClassification report:")
print(classification_report(y_test, preds))
print("Confusion matrix:\n", confusion_matrix(y_test, preds))

# Save model
model_path = os.path.join(MODEL_DIR, 'pipeline_model.pkl')
joblib.dump(pipeline, model_path)
print("\nSaved pipeline model to:", model_path)

# Predict helper
def predict_sample(sample_dict, df_reference=df, pipeline_ref=pipeline, X_reference=X):
    """
    sample_dict: raw feature dict (use same feature names as original dataset when possible).
    Returns float probability between 0 and 1.
    """
    sample_df = pd.DataFrame([sample_dict])
    base_df = df_reference.drop(columns=['Churn']) if 'Churn' in df_reference.columns else df_reference
    merged = pd.concat([base_df, sample_df], ignore_index=True, sort=False)
    processed_merged = preprocess_telco(merged)
    # Extract last row (our sample)
    if 'churn' in processed_merged.columns:
        Xs = processed_merged.tail(1).drop(columns=['churn'])
    else:
        Xs = processed_merged.tail(1)
    # Align columns to training features; fill missing with 0
    for col in X_reference.columns:
        if col not in Xs.columns:
            Xs[col] = 0
    Xs = Xs[X_reference.columns]
    prob = pipeline_ref.predict_proba(Xs)[0,1]
    return float(prob)

# Quick median-based sample for smoke test
sample = {}
for c in X.columns:
    if X[c].dtype.kind in 'biufc':
        sample[c] = float(X[c].median())
    else:
        sample[c] = 0

try:
    p = predict_sample(sample)
    print("\nSample prediction probability (median-based sample):", round(p,4))
except Exception as e:
    print("\npredict_sample failed:", e)

# Optional: SHAP explainability (safe-guarded)
try:
    import shap
    print("\nComputing SHAP values (may take time)...")
    wrapped = pipeline.named_steps['model']
    X_train_scaled = pipeline.named_steps['scaler'].transform(X_train)
    if HAVE_XGB:
        explainer = shap.TreeExplainer(wrapped)
        shap_values = explainer.shap_values(X_train_scaled)
        print("SHAP values computed (shape may vary by model).")
    else:
        print("Skipping TreeExplainer for non-XGBoost tree models; you can compute KernelExplainer if desired.")
except Exception as e:
    print("SHAP not run or failed:", e)

# Basic smoke-tests
errors = []
if not os.path.exists(processed_path):
    errors.append("Processed CSV missing.")
if not os.path.exists(model_path):
    errors.append("Model file missing.")
try:
    sp = predict_sample(sample)
    if not (0.0 <= sp <= 1.0):
        errors.append("predict_sample returned value outside [0,1].")
except Exception as e:
    errors.append(f"predict_sample error: {e}")

if errors:
    print("\nSMOKE TESTS: FAILED")
    for err in errors:
        print("-", err)
else:
    print("\nSMOKE TESTS: PASSED — pipeline ran end-to-end and produced a valid model & prediction.")


Data directory: /content/data/raw
Model directory: /content/models

--- Upload your Telco CSV now when the browser dialog appears ---


Saving archive (9).zip to archive (9) (1).zip
No CSV uploaded. Will create a small synthetic demo dataset at: /content/data/raw/telco_churn.csv
Synthetic dataset created.

Loaded dataset rows: 500 cols: 21

Preview (first 3 rows):


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,CUST000000,Male,0,Yes,Yes,45,Yes,Yes,No,Yes,...,No,No,Yes,No,Month-to-month,No,Bank transfer (automatic),82.68,3742.4,No
1,CUST000001,Female,0,No,No,14,Yes,No phone service,DSL,Yes,...,No,No,Yes,No internet service,Month-to-month,No,Credit card (automatic),45.32,676.67,Yes
2,CUST000002,Male,1,No,No,24,Yes,Yes,Fiber optic,Yes,...,No internet service,Yes,No internet service,No internet service,Month-to-month,Yes,Bank transfer (automatic),100.37,2428.51,No



Processed data saved to: /content/data/processed/telco_processed.csv
Processed columns: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'churn', 'MultipleLines_No phone service', 'MultipleLines_Yes', 'InternetService_Fiber optic', 'InternetService_No', 'OnlineSecurity_No internet service', 'OnlineSecurity_Yes', 'OnlineBackup_No internet service', 'OnlineBackup_Yes', 'DeviceProtection_No internet service', 'DeviceProtection_Yes', 'TechSupport_No internet service', 'TechSupport_Yes', 'StreamingTV_No internet service', 'StreamingTV_Yes', 'StreamingMovies_No internet service', 'StreamingMovies_Yes', 'Contract_One year', 'Contract_Two year', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']

Feature matrix shape: (500, 30)
Train shape: (400, 30) Test shape: (100, 30)

Using XGBoost model.

Training model (this may take a short while)...
Training comple