Importing libraries

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, precision_recall_curve, recall_score
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

Loading and cleaning

In [22]:
# Step 1: Load and Prepare Data
data = pd.read_csv('/content/drive/MyDrive/ds_assignment_2/Untitled folder/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Clean column names
data.columns = data.columns.str.strip()
print("Columns after cleaning:", data.columns)



Columns after cleaning: Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')


features and target

In [23]:
# Verify if 'Churn' exists
if 'Churn' not in data.columns:
    raise KeyError("Target column 'Churn' not found in the dataset!")
else:
    print("'Churn' column is present.")

# Separate features and target variable
X = data.drop(['Churn', 'customerID'], axis=1)
y = data['Churn']

'Churn' column is present.


Preprocessing

In [24]:
# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns

# Encode categorical variables
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded = encoder.fit_transform(X[categorical_features])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(categorical_features))
X = pd.concat([X.drop(columns=categorical_features), encoded_df], axis=1)
# Standardize numerical features
scaler = StandardScaler()
X[numerical_features] = scaler.fit_transform(X[numerical_features])

# Handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Feature selection
selector = SelectKBest(score_func=f_classif, k='all')
X_selected = selector.fit_transform(X_resampled, y_resampled)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_resampled, test_size=0.2, random_state=42)

# Encode target variable
y_train = y_train.map({'No': 0, 'Yes': 1})
y_test = y_test.map({'No': 0, 'Yes': 1})

Training

In [25]:

model = XGBClassifier(random_state=42, eval_metric='logloss')
model.fit(X_train, y_train)

Cross validation

In [26]:

cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
print(f"Cross-validation ROC AUC scores: {cv_scores}")
print(f"Mean ROC AUC: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")


Cross-validation ROC AUC scores: [0.93650221 0.91955147 0.94185465 0.93150931 0.93987441]
Mean ROC AUC: 0.934 (+/- 0.016)


Evaluation

In [27]:

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, y_pred_proba))


Accuracy: 0.8594202898550725
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.86      0.86      1021
           1       0.86      0.86      0.86      1049

    accuracy                           0.86      2070
   macro avg       0.86      0.86      0.86      2070
weighted avg       0.86      0.86      0.86      2070

AUC-ROC Score: 0.9409180330317852


Adjusting threshold

In [28]:

precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
desired_recall = 0.7
optimal_threshold = thresholds[np.argmax(recall >= desired_recall)]
y_pred_adjusted = (y_pred_proba >= optimal_threshold).astype(int)
print(f"Adjusted Recall (Threshold={optimal_threshold:.2f}):", recall_score(y_test, y_pred_adjusted))

# Feature importance
importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)
print(importances)



Adjusted Recall (Threshold=0.00): 1.0
                          Feature  Importance
24              Contract_Two year    0.204464
23              Contract_One year    0.078338
9     InternetService_Fiber optic    0.042964
5816         TotalCharges_7532.15    0.032837
10             InternetService_No    0.032248
...                           ...         ...
2217         TotalCharges_2536.55    0.000000
2216         TotalCharges_2535.55    0.000000
2215          TotalCharges_2531.8    0.000000
2214          TotalCharges_2531.4    0.000000
6558           TotalCharges_999.9    0.000000

[6559 rows x 2 columns]
