In [18]:
# Standard libraries pandas and numpy
import pandas as pd
import numpy as np

# scikit-learn libraries 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# libraries for smote and xgboost
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
# Load the dataset
df = pd.read_csv('dataset_task1.csv')

In [19]:
# 1. Performing appropriate preprocessing and classification on the given dataset.  

#basic eda od reading the data

print("--- Data Head ---")
print(df.head())
print("\n--- Data Info ---")
print(df.info())

#we drop the duplicate rows
df = df.drop_duplicates()

# Convert totalcharges to numeric as some can be empty string vals, and we "coerce"/force the errors to NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

#print the null values
print(df.isnull().sum())

#updated info after converting totalcharges to numeric
print("UPDATED Data Info:")
print(df.info())

print("\n--- Target Class Imbalance ---")
print(df['Churn'].value_counts(normalize=True))

--- Data Head ---
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies   

In [20]:
# 2. Identify if the data is imbalanced and apply appropriate measures to handle it like SMOTE and feature extraction.  

# Drop customerID as it's not a feature that would contribute to predict churn
X = df.drop(['Churn', 'customerID'], axis=1)  

# Convert 'Yes'/'No' to 1/0
y = df['Churn'].map({'Yes': 1, 'No': 0})      

# Train test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"X_train shape: {X_train.shape}")

X_train shape: (5634, 19)


In [21]:
# 3 Perform Imputation based on Imputers available on Scikit Learn.Use bagging and boosting techniques. Perform LDA and QDA on the dataset.

categorical_features = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
                        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                        'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
numerical_features = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

#Imputation is similar to fillna - its a subsitution technique for missing values and prevents data leakage
num_imputer_mean = SimpleImputer(strategy='mean') # Use the mean for missing numbers
num_imputer_median = SimpleImputer(strategy='median') # Use the median(meadians are less response to outliers so better when we have many outliers)
cat_imputer = SimpleImputer(strategy='most_frequent') # Use mode

#defining the models
models = {
    'RandomForest': RandomForestClassifier(random_state=42),  # bagging- tree-based multiple trees and voting technique
    'XGBoost': XGBClassifier(random_state=42, eval_metric='logloss'), # boosting tree based, next tree better than the previous
    'LDA': LinearDiscriminantAnalysis(solver='lsqr', shrinkage=0.1), #straight line boundary to separate classes, assuming all classes share the same covariance
    'QDA': QuadraticDiscriminantAnalysis() # curved decision boundary and  own unique covariance.
}

# number of trees 
param_grid = {'classifier__n_estimators': [50, 100]}


# Now, we are using two methods, one is using mean and standard scaler, the other is using median and minmax scaler
preprocessors = {
    # Strategy 1: Using the average for missing numbers, and making sure the numeric data has a mean of 0 and std dev of 1.
    'Mean_Standard': ColumnTransformer([
        ('num', Pipeline([('imputer', num_imputer_mean), ('scaler', StandardScaler())]), numerical_features),
        ('cat', Pipeline([('imputer', cat_imputer), ('encoder', OneHotEncoder(handle_unknown='ignore'))]), categorical_features)
    ], remainder='passthrough'),

    # Strategy 2: Using the median for missing numbers (better against outliers), and scaling data between 0 and 1.
    'Median_MinMax': ColumnTransformer([
        ('num', Pipeline([('imputer', num_imputer_median), ('scaler', MinMaxScaler())]), numerical_features),
        ('cat', Pipeline([('imputer', cat_imputer), ('encoder', OneHotEncoder(handle_unknown='ignore'))]), categorical_features)
    ], remainder='passthrough')
}

In [22]:
# 4. Compare different approaches to scaling and imputing and analyze how they affect the model performance. Â 
# Evaluate the model using multiple metrics and justify which ones are most appropriate to use in the given scenario.

results = {}
for prep_name, prep in preprocessors.items():
    # Looping through the models
    for model_name, model in models.items():
        
        # Build the full pipeline: Preprocessor -> SMOTE -> Classifier
        pipeline = ImbPipeline([
            ('preprocessor', prep),
            ('smote', SMOTE(random_state=42)), #smote to handle class imbalance
            ('classifier', model)
        ])
        
        # Use GridSearchCV for hyperparameter tuning 
        # We use F1 score as the primary metric for tuning because the classes are imbalanced.
        grid = GridSearchCV(
            pipeline, 
            param_grid if model_name in ['RandomForest', 'XGBoost'] else {}, 
            cv=3, 
            scoring='f1', # Optimize for F1 score, which balances Precision and Recall
            n_jobs=-1 
        )
        
        # Train the model 
        grid.fit(X_train, y_train)
        
        # Predict on test set
        y_pred = grid.predict(X_test)

        # Receiver Operating Characteristic - Area Under the Curve (ROC-AUC) is a performance metric for binary classification -Yes churn no Chrun
        y_pred_proba = grid.predict_proba(X_test)[:, 1] if hasattr(grid.best_estimator_['classifier'], 'predict_proba') else None

        # performance metrics 
        metrics = {
            'Best Params': grid.best_params_ if model_name in ['RandomForest', 'XGBoost'] else {'classifier': 'Default'},
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1': f1_score(y_test, y_pred),
            'ROC-AUC': roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else 'N/A'
        }
        results[(prep_name, model_name)] = metrics

print("\n--- Model Training Complete ---")




--- Model Training Complete ---




In [23]:
# 5. Final Results and Analysis

# Converting to dataframe for ease of viewing
results_df = pd.DataFrame(results).T
print("\n--- Combined Model Performance (Sorted by F1-Score) ---")
# Displaying the results, sorted by F1-Score to find the best balance of Precision and Recall.
print(results_df.sort_values(by='F1', ascending=False))

# Analysis:-
# Recall is critical in this model because a False Negative -a customer predicted as No, but the customer infact does then churn, means losing a customer
# Precision- A False Positive is unnecessary
# F1-Score is the central optimization metric-balance between precision and recall
# ROC-AUC is used to reliably compare models across all thresholds, as it is robust to the dataset's inherent class imbalance, confirming the overall discriminating power of the model.

#mean_standard lda is the best model as it has highest f1 score and roc 


--- Combined Model Performance (Sorted by F1-Score) ---
                                                  Best Params  Accuracy  \
Mean_Standard LDA                   {'classifier': 'Default'}  0.736693   
Median_MinMax LDA                   {'classifier': 'Default'}  0.734564   
              XGBoost        {'classifier__n_estimators': 50}  0.775727   
Mean_Standard XGBoost        {'classifier__n_estimators': 50}  0.775727   
Median_MinMax RandomForest   {'classifier__n_estimators': 50}  0.776437   
Mean_Standard RandomForest  {'classifier__n_estimators': 100}   0.77005   
              QDA                   {'classifier': 'Default'}  0.603974   
Median_MinMax QDA                   {'classifier': 'Default'}  0.452803   

                           Precision    Recall        F1   ROC-AUC  
Mean_Standard LDA           0.502618  0.770053  0.608237  0.835676  
Median_MinMax LDA                0.5  0.775401  0.607966   0.83309  
              XGBoost       0.572139  0.614973  0.592784  0.