In [37]:
import numpy as np
import pandas as pd
import sys
import os
from sklearn.preprocessing import OrdinalEncoder

# Add project root to sys.path
sys.path.append(os.path.abspath("..")) # go one level up

from src.utils.paths import CLEAN_DATA_DIR
from src.utils.io import load_csv

df = load_csv(CLEAN_DATA_DIR / "cleaned_telco_churn.csv")
df_fe = df.copy()

In [16]:
pd.set_option('display.max_columns', None)
df_fe.sample(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_zscore,tenure_outlier_z,MonthlyCharges_zscore,MonthlyCharges_outlier_z,TotalCharges_zscore,TotalCharges_outlier_z,tenure_outlier_iqr,MonthlyCharges_outlier_iqr,TotalCharges_outlier_iqr,outlier_iso
6511,5135-RDDQL,Female,0,Yes,Yes,1.247218,Yes,No,DSL,Yes,No,No,No,No,No,Two year,Yes,Bank transfer (automatic),-0.469015,0.414697,No,1.247218,0,-0.469015,0,0.414697,0,0,0,0,0
1899,8563-OYMQY,Male,0,No,No,-0.625919,Yes,No,Fiber optic,No,Yes,Yes,No,No,Yes,Month-to-month,Yes,Credit card (automatic),0.870393,-0.295666,No,-0.625919,0,0.870393,0,-0.295666,0,0,0,0,0
6505,7018-FPXHH,Male,0,Yes,Yes,0.962175,Yes,No,DSL,Yes,Yes,Yes,No,No,No,Two year,Yes,Bank transfer (automatic),-0.164907,0.518974,No,0.962175,0,-0.164907,0,0.518974,0,0,0,0,0
1091,5832-TRLPB,Male,0,No,No,-0.137274,Yes,No,DSL,No,No,Yes,Yes,Yes,Yes,One year,Yes,Bank transfer (automatic),0.351912,-0.016784,No,-0.137274,0,0.351912,0,-0.016784,0,0,0,0,0
4377,8212-CRQXP,Female,0,Yes,No,-0.422317,Yes,No,Fiber optic,No,No,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),0.506459,-0.254057,No,-0.422317,0,0.506459,0,-0.254057,0,0,0,0,0


#### **1. Feature Construction**

**1.1 Total Service Count:**

In [19]:
service_cols = [
    'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies'
]

df_fe['service_count'] = df_fe[service_cols].eq('Yes').sum(axis=1)

**1.2 Monthly Charge Buckets**

In [21]:
df_fe['charges_bucket'] = pd.cut(
    df_fe['MonthlyCharges'],
    bins=[0,35, 70, 100, 150],
    labels=['Low', 'Medium', 'High', 'Very High']
)

**1.3 Total charges per tenure ratio**

In [23]:
# For each customer, if tenure is not 0, divide TotalCharges by tenure and store the result in charge_per_month.
# If tenure is 0, set charge_per_month to 0.

df_fe['charges_per_month'] = np.where(
    df_fe['tenure'] !=0,
    df_fe['TotalCharges'] / df_fe['tenure'],
    0
)

**1.4 Active User (Internet + Streaming)**

In [24]:
# 5. Is active user (internet + streaming)
# Mark 1 if customer has InternetService and subscribes to StreamingTV, else 0
df_fe['is_active_user'] = ((df_fe['InternetService'] != 'No') & (df_fe['StreamingTV'] == 'Yes')).astype(int)

**1.5 Senior Citizen**

In [26]:
# Senior citizen as binary
# Convert SeniorCitizen column to 1 if senior (1), else 0
df_fe['is_senior'] = df_fe['SeniorCitizen'].astype(int)


**1.6 Tenure Squared**

In [27]:
# Tenure squared (classic interaction feature)
# Create a new feature as the square of tenure
df_fe['tenure_sqr'] = df_fe['tenure']**2

**1.7  High monthly charge flag**

In [28]:
# Flag high bills: 1 if MonthlyCharges > median, else 0
df_fe['high_bill_flag'] = (df_fe['MonthlyCharges'] > df_fe['MonthlyCharges'].median()).astype(int)

In [30]:
df_fe.sample(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_zscore,tenure_outlier_z,MonthlyCharges_zscore,MonthlyCharges_outlier_z,TotalCharges_zscore,TotalCharges_outlier_z,tenure_outlier_iqr,MonthlyCharges_outlier_iqr,TotalCharges_outlier_iqr,outlier_iso,charges_per_month,is_active_user,is_senior,tenure_sqr,high_bill_flag
107,9750-BOOHV,Female,0,No,No,-0.015113,No,No phone service,DSL,Yes,No,No,No,No,No,One year,No,Mailed check,-1.150352,-0.597881,No,-0.015113,0,-1.150352,0,-0.597881,0,0,0,0,0,39.55993,0,0,0.000228,0
1319,1725-MIMXW,Male,0,No,Yes,-1.277445,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Mailed check,-1.505977,-0.998833,Yes,-1.277445,0,-1.505977,0,-0.998833,0,0,0,0,0,0.7819,0,0,1.631865,0
5393,5376-DEQCP,Female,0,No,No,-1.277445,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,0.194042,-0.976252,Yes,-1.277445,0,0.194042,0,-0.976252,0,0,0,0,0,0.764222,0,0,1.631865,1
762,7379-POKDZ,Male,0,Yes,No,-1.196004,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,0.184071,-0.915217,Yes,-1.196004,0,0.184071,0,-0.915217,0,0,0,0,0,0.765229,0,0,1.430425,0
250,4075-JFPGR,Female,0,Yes,No,0.758574,Yes,No,Fiber optic,Yes,Yes,Yes,No,Yes,No,One year,Yes,Electronic check,0.955145,1.032018,No,0.758574,0,0.955145,0,1.032018,0,0,0,0,0,1.360471,1,0,0.575434,1


##### **2. Dropping and Mapping Columns**

In [42]:
# Dropping column = customerID not required for modeling.
df_fe = df_fe.drop(columns=['customerID'], errors='ignore')

In [43]:
# Mapping target Variable
# Churn: No -> 0,  Yes -> 1
df_fe['Churn'] = df_fe['Churn'].map({'No':0, 'Yes':1})

##### **3. Identifying Categorical columns**

In [49]:
# identifying categorical columns in dataset.
# categorical_cols = df_fe.select_dtypes(include=['object']).columns.tolist()
# print("Categorical columns:", categorical_cols)

# Ordinal columns (have natural order)
ordinal_cols = ['Contract']

# Nominal columns (no order, remaining object type columns)
nominal_cols = [c for c in df_fe.select_dtypes(include=['object']).columns if c not in ordinal_cols + ['Churn']]
print("Ordinal columns:", ordinal_cols)
print("Nominal columns:", nominal_cols)

Ordinal columns: ['Contract']
Nominal columns: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'PaymentMethod']


##### **4. Ordinal Encoding**

In [None]:
# Ordinal column order
ordinal_mapping = {
    'Contract': ['Month-to-month', 'One year', 'Two year']
}

# Encode ordinal columns to numbers using their defined order
# Resulting numeric values are stored in a new column named '<original_column>_encoded'
for col, cat_order in ordinal_mapping.items():
    encoder = OrdinalEncoder(categories=[cat_order])
    df_fe[col + '_encoded'] = encoder.fit_transform(df_fe[[col]])




# Dropping original ordinal column after encoding
df_fe = df_fe.drop(columns=ordinal_cols)

##### **5. Nominal / One-Hot Encoding**

In [51]:
# One-Hot Encode all nominal columns, drop first category to avoid redundancy
df_fe = pd.get_dummies(df_fe, columns=nominal_cols, drop_first=True)


print("Columns after encoding:")
print(df_fe.columns)

print("\nSample of first 5 rows:")
display(df_fe.head())

print("\nCheck for missing values:")
print(df_fe.isna().sum())

Columns after encoding:
Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn',
       'tenure_zscore', 'tenure_outlier_z', 'MonthlyCharges_zscore',
       'MonthlyCharges_outlier_z', 'TotalCharges_zscore',
       'TotalCharges_outlier_z', 'tenure_outlier_iqr',
       'MonthlyCharges_outlier_iqr', 'TotalCharges_outlier_iqr', 'outlier_iso',
       'Contract_encoded', 'gender_Male', 'Partner_Yes', 'Dependents_Yes',
       'PhoneService_Yes', 'MultipleLines_No phone service',
       'MultipleLines_Yes', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No internet service',
       'OnlineSecurity_Yes', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'DeviceProtection_No internet service',
       'DeviceProtection_Yes', 'TechSupport_No internet service',
       'TechSupport_Yes', 'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No internet service', 'StreamingMovies_Yes',
       'PaperlessBilling_Yes

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,tenure_zscore,tenure_outlier_z,MonthlyCharges_zscore,MonthlyCharges_outlier_z,TotalCharges_zscore,TotalCharges_outlier_z,tenure_outlier_iqr,MonthlyCharges_outlier_iqr,TotalCharges_outlier_iqr,outlier_iso,Contract_encoded,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,-1.277445,-1.160323,-0.994242,,-1.277445,0,-1.160323,0,-0.994242,0,0,0,0,0,0.0,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False
1,0,0.066327,-0.259629,-0.173244,,0.066327,0,-0.259629,0,-0.173244,0,0,0,0,0,1.0,True,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,True
2,0,-1.236724,-0.36266,-0.959674,,-1.236724,0,-0.36266,0,-0.959674,0,0,0,0,0,0.0,True,False,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,True,False,False,True
3,0,0.514251,-0.746535,-0.194766,,0.514251,0,-0.746535,0,-0.194766,0,0,0,0,0,1.0,True,False,False,False,True,False,False,False,False,True,False,False,False,True,False,True,False,False,False,False,False,False,False,False
4,0,-1.236724,0.197365,-0.94047,,-1.236724,0,0.197365,0,-0.94047,0,0,0,0,0,0.0,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False



Check for missing values:
SeniorCitizen                               0
tenure                                      0
MonthlyCharges                              0
TotalCharges                                0
Churn                                    7043
tenure_zscore                               0
tenure_outlier_z                            0
MonthlyCharges_zscore                       0
MonthlyCharges_outlier_z                    0
TotalCharges_zscore                         0
TotalCharges_outlier_z                      0
tenure_outlier_iqr                          0
MonthlyCharges_outlier_iqr                  0
TotalCharges_outlier_iqr                    0
outlier_iso                                 0
Contract_encoded                            0
gender_Male                                 0
Partner_Yes                                 0
Dependents_Yes                              0
PhoneService_Yes                            0
MultipleLines_No phone service              0
Multipl