In [1]:
import kagglehub,os
from kagglehub import KaggleDatasetAdapter

file_path = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
path = kagglehub.dataset_download("blastchar/telco-customer-churn")

df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "blastchar/telco-customer-churn",
  file_path)

print(df.shape) #7043 rows, 21 columns 

(7043, 21)


In [2]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


<h2>Checking Missing Data</h2>

In [3]:
df.isnull().sum() #no null

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [37]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure(in months)', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [5]:
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


<h2>Identifying categorical vs numerical data</h2>

In [6]:
#categorical_cols = df.select_dtypes(include=['object']).columns
#numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

In [7]:
#preferred ML practice(cardinality encoding]
#categorical_cols = [c for c in df.columns if df[c].nunique() < 20]
#numerical_cols = [c for c in df.columns if df[c].nunique() >= 20]

In [8]:
#working correctly
categorical_cols = []
numerical_cols = []

for col in df.columns:
    if df[col].nunique() > 15:
        numerical_cols.append(col)
    elif df[col].nunique()<15:
        categorical_cols.append(col)
    else:
        categorical_cols.append(col)

In [9]:
categorical_cols

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'Churn']

In [10]:
numerical_cols

['customerID', 'tenure', 'MonthlyCharges', 'TotalCharges']

<h2>Removing Outliers</h2>

In [11]:
df = df.rename(columns={'tenure': 'tenure(in months)'})


In [12]:
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure(in months),PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


<h2>One Hot Encoding</h2>

In [13]:
from sklearn.preprocessing import OneHotEncoder

import pandas as pd
encoder = OneHotEncoder(handle_unknown='ignore',sparse_output=False)
encoded_values = encoder.fit_transform(df[categorical_cols])


In [14]:
encoded_df = pd.DataFrame(
    encoded_values,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index
)

In [15]:
encoded_df.head(5)

Unnamed: 0,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,...,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_No,Churn_Yes
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


def collapse_binary_ohe(df):
    df = df.copy()
    
    prefixes = (
        df.columns
        .str.split('_')
        .str[0]
        .value_counts()
    )
        # Only keep prefixes that appear exactly twice
    binary_prefixes = prefixes[prefixes == 2].index
    
    for prefix in binary_prefixes:
        cols = [c for c in df.columns if c.startswith(prefix + '_')]
        
        # Create single binary column
        df[prefix] = (
            df[cols]
            .idxmax(axis=1)
            .apply(lambda x: 1 if x.endswith(('Yes', 'Male', '1')) else 0)
        )
        
        # Drop old one-hot columns
        df.drop(columns=cols, inplace=True)
    
    return df


encoded_df_clean = collapse_binary_ohe(encoded_df)
encoded_df_clean

In [16]:
encoded_df.shape #21 columns -> 45 columns

(7043, 45)

<h1>Feature Selection and Metrics Comparsion</h1>

<div>
Random Forest:
    <ul>
    <li>Is robust to noise</li>
    <li>Handles nonlinear relationships</li>
    <li>Gives stable feature importance</li>
    <li>Works well after one-hot encoding</li>
    </ul>
</div>

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.feature_selection import SelectFromModel


rf_before=RandomForestClassifier(n_estimators=300,n_jobs=-1,max_depth=None,random_state=45)

In [29]:
X = encoded_df.drop(columns=['Churn_Yes','Churn_No'])
y = encoded_df['Churn_Yes']
rf_before.fit(X,y)


X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=45,
    stratify=y
)

pred_before=rf_before.predict(X_test)
pred_before_train=rf_before.predict(X_train)

prob_before=rf_before.predict_proba(X_test)[:, 1]
prob_before_train=rf_before.predict_proba(X_train)[:, 1]


acc_before=accuracy_score(y_test,pred_before)
acc_before_train=accuracy_score(y_train,pred_before_train)

auc_before=roc_auc_score(y_test,prob_before)
auc_before_train=roc_auc_score(y_train,prob_before_train)

f1_before=f1_score(y_test,pred_before)
f1_before_train=f1_score(y_train,pred_before_train)

In [20]:
selector = SelectFromModel(
    rf_before,
    threshold='median',
    prefit=True
)

X_train_sel = selector.transform(X_train)
X_test_sel  = selector.transform(X_test)



In [24]:
rf_after = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

rf_after.fit(X_train_sel, y_train)

pred_after = rf_after.predict(X_test_sel)
pred_after_train = rf_after.predict(X_train_sel)
prob_after = rf_after.predict_proba(X_test_sel)[:, 1]
prob_after_train = rf_after.predict_proba(X_train_sel)[:, 1]

acc_after = accuracy_score(y_test, pred_after)
acc_after_train = accuracy_score(y_train, pred_after_train)
auc_after = roc_auc_score(y_test, prob_after)
f1_after  = f1_score(y_test, pred_after)

In [31]:
results = pd.DataFrame({
    'Metric': ['Accuracy', 'ROC-AUC', 'F1-score'],
    'Test': [acc_before, auc_before, f1_before],
    'Train':  [acc_before_train, auc_before_train, f1_before_train]
})

results

Unnamed: 0,Metric,Test,Train
0,Accuracy,0.918382,0.918175
1,ROC-AUC,0.973599,0.97427
2,F1-score,0.844804,0.844938
