In [1]:
#import libraries 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier

#load the dataset
df = pd.read_csv('dataset_task1.csv')


In [2]:
# 1. Small EDA and Data Cleaning

print("--- Data Head ---")
print(df.head())
print("\n--- Data Info ---")
print(df.info())

print(f"\nDuplicates before dropping: {df.duplicated().sum()}")
df = df.drop_duplicates()

# Coerce 'TotalCharges' to numeric, forcing errors to NaN (missing values)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

print("\n--- Missing Values after Coercion (TotalCharges) ---")
print(df.isnull().sum())

print("\n--- Target Class Imbalance ---")
print(df['Churn'].value_counts(normalize=True))

--- Data Head ---
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies   

In [3]:
# 2. Feature/Target Split and Data Split

# Drop customerID as it's not a feature
X = df.drop(['Churn', 'customerID'], axis=1)  

# Convert 'Yes'/'No' to 1/0
y = df['Churn'].map({'Yes': 1, 'No': 0})      

# Stratified split to maintain class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"X_train shape: {X_train.shape}")

X_train shape: (5634, 19)


In [4]:
# 3 Define Components: Features, Models, Preprocessors

categorical_features = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
                        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                        'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
numerical_features = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

# Define imputation strategies
num_imputer_mean = SimpleImputer(strategy='mean')
num_imputer_median = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Models: Define classifiers (FIXED LDA for shrinkage compatibility)
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, eval_metric='logloss'),
    # 'lsqr' solver supports shrinkage, which is beneficial for LDA
    'LDA': LinearDiscriminantAnalysis(solver='lsqr', shrinkage=0.1), 
    'QDA': QuadraticDiscriminantAnalysis()
}

#  Hyperparameter tuning grid (for tree models only)
param_grid = {'classifier__n_estimators': [50, 100]}
# Define Preprocessing Pipelines (ColumnTransformers)
preprocessors = {
    # Strategy 1: Mean Imputation + Standard Scaling
    'Mean_Standard': ColumnTransformer([
        ('num', Pipeline([('imputer', num_imputer_mean), ('scaler', StandardScaler())]), numerical_features),
        ('cat', Pipeline([('imputer', cat_imputer), ('encoder', OneHotEncoder(handle_unknown='ignore'))]), categorical_features)
    ], remainder='passthrough'),
    
    # Strategy 2: Median Imputation + MinMax Scaling
    'Median_MinMax': ColumnTransformer([
        ('num', Pipeline([('imputer', num_imputer_median), ('scaler', MinMaxScaler())]), numerical_features),
        ('cat', Pipeline([('imputer', cat_imputer), ('encoder', OneHotEncoder(handle_unknown='ignore'))]), categorical_features)
    ], remainder='passthrough')
}

In [5]:
# 4. Training, Hyperparameter Tuning, and Evaluation Loop

results = {}

for prep_name, prep in preprocessors.items():
    for model_name, model in models.items():
        print(f"Training: {prep_name} + {model_name}...")
        
        # Build the full pipeline: Preprocessor -> SMOTE -> Classifier
        pipeline = ImbPipeline([
            ('preprocessor', prep), 
            ('smote', SMOTE(random_state=42)), 
            ('classifier', model)
        ])
        
        # Use GridSearchCV for hyperparameter tuning (only for RandomForest/XGBoost)
        grid = GridSearchCV(
            pipeline, 
            param_grid if model_name in ['RandomForest', 'XGBoost'] else {}, 
            cv=3, 
            scoring='f1',
            n_jobs=-1  # Use all cores
        )
        
        # Train the model
        grid.fit(X_train, y_train)
        
        # Predict on test set
        y_pred = grid.predict(X_test)
        
        # Get probabilities for ROC-AUC
        y_pred_proba = grid.predict_proba(X_test)[:, 1] if hasattr(grid.best_estimator_['classifier'], 'predict_proba') else None
        
        # Calculate metrics
        metrics = {
            'Best Params': grid.best_params_ if model_name in ['RandomForest', 'XGBoost'] else {'classifier': 'Default'},
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1': f1_score(y_test, y_pred),
            'ROC-AUC': roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else 'N/A'
        }
        results[(prep_name, model_name)] = metrics

print("\n--- Model Training Complete ---")

Training: Mean_Standard + RandomForest...
Training: Mean_Standard + XGBoost...
Training: Mean_Standard + LDA...
Training: Mean_Standard + QDA...




Training: Median_MinMax + RandomForest...
Training: Median_MinMax + XGBoost...
Training: Median_MinMax + LDA...
Training: Median_MinMax + QDA...

--- Model Training Complete ---




In [6]:
# 5. Final Results and Analysis

results_df = pd.DataFrame(results).T
print("\n--- Combined Model Performance (Sorted by F1-Score) ---")
print(results_df.sort_values(by='F1', ascending=False))

# Analysis:
# Recall is key for churn to catch at-risk customers, F1 balances precision/recall, 
# AUC handles imbalance well. The model with the highest F1 and ROC-AUC is typically the best choice.


--- Combined Model Performance (Sorted by F1-Score) ---
                                                  Best Params  Accuracy  \
Mean_Standard LDA                   {'classifier': 'Default'}  0.736693   
Median_MinMax LDA                   {'classifier': 'Default'}  0.734564   
              XGBoost        {'classifier__n_estimators': 50}  0.775727   
Mean_Standard XGBoost        {'classifier__n_estimators': 50}  0.775727   
Median_MinMax RandomForest   {'classifier__n_estimators': 50}  0.776437   
Mean_Standard RandomForest  {'classifier__n_estimators': 100}   0.77005   
              QDA                   {'classifier': 'Default'}  0.603974   
Median_MinMax QDA                   {'classifier': 'Default'}  0.452803   

                           Precision    Recall        F1   ROC-AUC  
Mean_Standard LDA           0.502618  0.770053  0.608237  0.835676  
Median_MinMax LDA                0.5  0.775401  0.607966   0.83309  
              XGBoost       0.572139  0.614973  0.592784  0.