In [None]:
# Standard libraries pandas and numpy
import pandas as pd
import numpy as np

# scikit-learn libraries 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# libraries for smote and xgboost
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
# Read the raw CSV from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='dataset_task1.csv')

In [None]:
# 1. Performing appropriate preprocessing and classification on the given dataset.

# Basic EDA: look at the first rows and at the column dtypes / non-null counts.

print("--- Data Head ---")
print(df.head())
print("\n--- Data Info ---")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

# Drop fully duplicated rows.
df = df.drop_duplicates()

# Convert TotalCharges to numeric. errors='coerce' turns every value that cannot
# be parsed as a number (e.g. a blank string) into NaN instead of raising.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Count of missing values per column (includes the NaNs produced by the coercion).
print(df.isnull().sum())

# Updated info after converting TotalCharges to numeric.
print("UPDATED Data Info:")
df.info()

# Relative frequency of each target class, to judge how imbalanced the data is.
print("\n--- Target Class Imbalance ---")
print(df['Churn'].value_counts(normalize=True))

In [None]:
# 2. Identify if the data is imbalanced and apply appropriate measures to handle it like SMOTE and feature extraction.

# Feature matrix: every column except the target and the customerID identifier,
# which is dropped because it is not used as a predictor.
X = df.drop(columns=['Churn', 'customerID'])

# Target vector: 'Yes' -> 1, 'No' -> 0 (any other value would map to NaN).
y = df['Churn'].map(dict(Yes=1, No=0))

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"X_train shape: {X_train.shape}")

In [None]:
# 3. Perform imputation based on the imputers available in scikit-learn.

categorical_features = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
                        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                        'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
numerical_features = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

# Imputation is similar to fillna: it substitutes a statistic for missing values.
# The imputers are placed inside the pipelines built below, so their statistics
# are learned only from the data handed to fit() rather than from the full dataset.
num_imputer_mean = SimpleImputer(strategy='mean')  # mean of the column
num_imputer_median = SimpleImputer(strategy='median')  # median is less sensitive to outliers than the mean
cat_imputer = SimpleImputer(strategy='most_frequent')  # mode of the column

# Candidate classifiers, keyed by the name used in the results table.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),  # bagging: many trees, majority vote
    'XGBoost': XGBClassifier(random_state=42, eval_metric='logloss'),  # boosting: each tree corrects the previous ones
    'LDA': LinearDiscriminantAnalysis(solver='lsqr', shrinkage=0.1),  # linear boundary, one covariance shared by all classes
    # Quadratic boundary, one covariance per class. The full one-hot encoding
    # produced below is perfectly collinear (the indicator columns of each
    # feature sum to 1), which makes the per-class covariance singular, so the
    # estimate is regularized in the same spirit as the LDA shrinkage above.
    'QDA': QuadraticDiscriminantAnalysis(reg_param=0.1)
}

# Hyperparameter grid: number of trees (only meaningful for the tree ensembles).
param_grid = {'classifier__n_estimators': [50, 100]}


def _make_preprocessor(num_imputer, scaler):
    """Build a ColumnTransformer for the numerical and categorical feature lists.

    Numerical columns are imputed with ``num_imputer`` and then scaled with
    ``scaler``; categorical columns are imputed with the most frequent value and
    one-hot encoded (categories unseen during fit are encoded as all zeros).

    ``sparse_threshold=0`` forces a dense output array. The one-hot encoder emits
    a sparse matrix, and without this setting the transformer decides between
    sparse and dense output from the overall density of the data; the
    discriminant-analysis models need dense input.
    """
    numeric_steps = Pipeline([('imputer', num_imputer), ('scaler', scaler)])
    categorical_steps = Pipeline([
        ('imputer', cat_imputer),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ])
    return ColumnTransformer(
        [
            ('num', numeric_steps, numerical_features),
            ('cat', categorical_steps, categorical_features),
        ],
        remainder='passthrough',
        sparse_threshold=0,
    )


# Two preprocessing strategies are compared.
preprocessors = {
    # Strategy 1: mean imputation, then standardize numeric data to mean 0 / std dev 1.
    'Mean_Standard': _make_preprocessor(num_imputer_mean, StandardScaler()),

    # Strategy 2: median imputation (more robust to outliers), then scale numeric data to [0, 1].
    'Median_MinMax': _make_preprocessor(num_imputer_median, MinMaxScaler()),
}

In [None]:
# 4. Training, Hyperparameter Tuning, and Evaluation Loop

# Maps (preprocessor name, model name) -> dict of metrics for that combination.
results = {}

# Only these models are tuned with param_grid; the others are fitted once
# with their configured settings (an empty grid yields a single candidate).
tuned_models = ('RandomForest', 'XGBoost')

for prep_name, prep in preprocessors.items():
    for model_name, model in models.items():
        print(f"Training: {prep_name} + {model_name}...")
        is_tuned = model_name in tuned_models

        # Build the full pipeline: Preprocessor -> SMOTE -> Classifier
        pipeline = ImbPipeline([
            ('preprocessor', prep),
            ('smote', SMOTE(random_state=42)),
            ('classifier', model)
        ])

        # 3-fold cross-validated search, scored by F1 of the positive class.
        grid = GridSearchCV(
            pipeline,
            param_grid if is_tuned else {},
            cv=3,
            scoring='f1',
            n_jobs=-1  # Use all cores
        )

        # Train the model (the best candidate is refitted on all of X_train)
        grid.fit(X_train, y_train)

        # Predict on test set
        y_pred = grid.predict(X_test)

        # Positive-class probabilities for ROC-AUC, when the classifier provides them
        y_pred_proba = (
            grid.predict_proba(X_test)[:, 1]
            if hasattr(grid.best_estimator_['classifier'], 'predict_proba')
            else None
        )

        # Calculate metrics
        metrics = {
            'Best Params': grid.best_params_ if is_tuned else {'classifier': 'Default'},
            # Mean cross-validated F1 of the selected candidate: lets models be
            # compared without using the test set for the selection itself.
            'CV F1': grid.best_score_,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1': f1_score(y_test, y_pred),
            # NaN (not a string) when unavailable, so the column never mixes
            # floats with text and can still be sorted or aggregated.
            'ROC-AUC': roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else np.nan
        }
        results[(prep_name, model_name)] = metrics

print("\n--- Model Training Complete ---")

In [None]:
# 5. Final Results and Analysis

# One row per (preprocessor, model) pair, one column per recorded metric.
results_df = pd.DataFrame(results).transpose()

# Show the table ranked from best to worst F1.
print(
    "\n--- Combined Model Performance (Sorted by F1-Score) ---",
    results_df.sort_values('F1', ascending=False),
    sep='\n',
)

# Analysis:
# Recall matters most for churn because it measures how many at-risk customers are caught,
# F1 balances precision against recall, and ROC-AUC is independent of the decision threshold.
# The model with the highest F1 and ROC-AUC is typically the best choice.