In [5]:
#Importing Necessary Modules
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [7]:
#Load Dataset
df = pd.read_csv("Telco-Customer-Churn.csv")
df.head()

Unnamed: 0,customer_id,gender,age,tenure,monthly_charges,total_charges,contract_type,payment_method,churn
0,1,Male,38,1,32.96,35.280731,Two-Year,Mailed Check,1
1,2,Female,49,51,87.32,4821.395573,Annual,Mailed Check,0
2,3,Male,40,45,87.71,4120.62751,Two-Year,Mailed Check,1
3,4,Male,50,4,96.42,356.81685,Annual,Credit Card,0
4,5,Male,20,62,32.37,1823.181566,Annual,Credit Card,0


In [11]:
#Display dataset info and Preview
print("Dataset Info: \n")
print(df.info())
print("\nClass Distribution: \n")
print(df['churn'].value_counts())
print("\n Sample Data:\n",df.head())

Dataset Info: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customer_id      500 non-null    int64  
 1   gender           500 non-null    object 
 2   age              500 non-null    int64  
 3   tenure           500 non-null    int64  
 4   monthly_charges  500 non-null    float64
 5   total_charges    500 non-null    float64
 6   contract_type    500 non-null    object 
 7   payment_method   500 non-null    object 
 8   churn            500 non-null    int64  
dtypes: float64(2), int64(4), object(3)
memory usage: 35.3+ KB
None

Class Distribution: 

churn
0    344
1    156
Name: count, dtype: int64

 Sample Data:
    customer_id  gender  age  tenure  monthly_charges  total_charges  \
0            1    Male   38       1            32.96      35.280731   
1            2  Female   49      51            87.32    4821.395573   
2   

## Insights:
The class distribution of churn in the dataset shows 344 instances of class 0 and 156 instances of class 1, indicating an imbalance. To ensure better model performance, it is important to balance the data before training.

In [18]:
#Handle Missing Data & Modifying the data type of "total_charges" column
df['total_charges'] = pd.to_numeric(df['total_charges'],errors='coerce')
df.fillna({'total_charges':df['total_charges'].median()},inplace=True)

In [22]:
#Encode Categorical Variables
label_encoder = LabelEncoder()
for column in df.select_dtypes(include=['object']).columns:
    if column!='churn':
        df[column]=label_encoder.fit_transform(df[column])
        

In [24]:
#Encode target Variable
df['churn']=label_encoder.fit_transform(df['churn'])

In [26]:
#Scale Numerical Features
scaler = StandardScaler()
numerical_features = ['tenure','monthly_charges','total_charges']
df[numerical_features]=scaler.fit_transform(df[numerical_features])


In [28]:
#Features & Target
X = df.drop(columns=['churn'])
y = df['churn']

In [30]:
#Split Dataset
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)


In [60]:
#Applying SMOTE
smote = SMOTE(random_state=42)
X_train_resampled,y_train_resampled = smote.fit_resample(X_train,y_train)


In [62]:
# Display class distribution after SMOTE
print("\n Class Distribution after SMOTE: \n")
print(pd.Series(y_train_resampled).value_counts())


 Class Distribution after SMOTE: 

churn
0    274
1    274
Name: count, dtype: int64


In [44]:
#Train Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_resampled,y_train_resampled)
y_pred_rf = rf_model.predict(X_test)
roc_auc_rf = roc_auc_score(y_test,rf_model.predict_proba(X_test)[:,1])

#Classification Report 
print("Random Forest Model Performance: \nROC AUC Scores: ",roc_auc_rf)
print("Random Forest Report: \n",classification_report(y_test,y_pred_rf))

Random Forest Model Performance: 
ROC AUC Scores:  0.4747619047619048
Random Forest Report: 
               precision    recall  f1-score   support

           0       0.68      0.77      0.72        70
           1       0.20      0.13      0.16        30

    accuracy                           0.58       100
   macro avg       0.44      0.45      0.44       100
weighted avg       0.53      0.58      0.55       100



In [46]:
#Train XGBoost
xgb_model = XGBClassifier(eval_metric='logloss',random_state=42)
xgb_model.fit(X_train_resampled,y_train_resampled)
y_pred_xgb = xgb_model.predict(X_test)
roc_auc_xgb = roc_auc_score(y_test,xgb_model.predict_proba(X_test)[:,1])

#Classification Report 
print("XGBoost Model Performance: \nROC AUC Scores: ",roc_auc_xgb)
print("XGBoost Model  Report: \n",classification_report(y_test,y_pred_xgb))

XGBoost Model Performance: 
ROC AUC Scores:  0.4542857142857143
XGBoost Model  Report: 
               precision    recall  f1-score   support

           0       0.68      0.77      0.72        70
           1       0.20      0.13      0.16        30

    accuracy                           0.58       100
   macro avg       0.44      0.45      0.44       100
weighted avg       0.53      0.58      0.55       100



In [64]:
#Train LightGBM
lgb_model = LGBMClassifier(random_state=42)
lgb_model.fit(X_train_resampled,y_train_resampled)
y_pred_lgb = lgb_model.predict(X_test)
roc_auc_lgb = roc_auc_score(y_test,lgb_model.predict_proba(X_test)[:,1])

#Classification Report 
print("LightGBM Model Performance: \nROC AUC Scores: ",roc_auc_lgb)
print("XGBoost Model  Report: \n",classification_report(y_test,y_pred_lgb))


[LightGBM] [Info] Number of positive: 274, number of negative: 274
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000377 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 714
[LightGBM] [Info] Number of data points in the train set: 548, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LightGBM Model Performance: 
ROC AUC Scores:  0.49619047619047624
XGBoost Model  Report: 
               precision    recall  f1-score   support

           0       0.68      0.70      0.69        70
           1       0.25      0.23      0.24        30

    accuracy                           0.56       100
   macro avg       0.47      0.47      0.47       100
weighted avg       0.55      0.56      0.56       100

