In [218]:
#import the required packages.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline



from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.model_selection import train_test_split


# Exploratory Data Analysis & Feature Engineering

In [219]:
#Data Size : How many total data entries (observations) are present? How many features (variables) are there, including the target column
df = pd.read_csv("../telco_train.csv")
#data_frame.shape retruns the tuple and [0] returns the element of the tuples correspondingly.
print("Rows:", df.shape[0])
print("Columns:", df.shape[1])
df.head()

Rows: 7043
Columns: 21


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [220]:
#Separate Features & Target
X = df.drop(columns=['Churn'])
#Target Encoding . Convert target variable Churn.
print("Before Target Encoding : ", df['Churn'].value_counts())
y = df['Churn'].map({'Yes': 1, 'No': 0})
print("After Target Encoding : ",y.value_counts())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

#Split Training Set and Test Set
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Before Target Encoding :  Churn
No     5174
Yes    1869
Name: count, dtype: int64
After Target Encoding :  Churn
0    5174
1    1869
Name: count, dtype: int64
X_train shape: (5634, 20)
X_test shape: (1409, 20)
y_train shape: (5634,)
y_test shape: (1409,)


In [221]:
#df = X_train

In [222]:
#Examine the missing values in the dataset size.A cell with no data is called “missing.”
print("Missing values per column:")
print(X_train.isnull().sum())
#There are no columns with Missing Values as per the recieved output

Missing values per column:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
dtype: int64


In [223]:
#Feature Types: What kind of data does each column have?
print("Column types:")
print(X_train.dtypes)
#Feature Types: More detailed view to know what kind of data does each column have? Data --> Numerical or Categorical
categorical_cols = []
numerical_cols = []
numerical_disguised_as_object = []
id_cols = []

n_rows = len(X_train)

for col in X_train.columns:

    # Detect ID-like columns
    if X_train[col].dtype == 'object' and X_train[col].nunique() / n_rows > 0.95:
        id_cols.append(col)
        continue

    # Proper numeric columns
    if X_train[col].dtype in ['int64', 'float64']:
        if X_train[col].nunique() < 10:
            categorical_cols.append(col)
        else:
            numerical_cols.append(col)

    # Object but numeric
    elif X_train[col].dtype == 'object':
        converted = pd.to_numeric(X_train[col], errors='coerce')
        if converted.notna().mean() > 0.9:
            numerical_cols.append(col)
            numerical_disguised_as_object.append(col)
        else:
            categorical_cols.append(col)

    # Other
    else:
        categorical_cols.append(col)

print("ID Columns:", id_cols)
print("Numerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)
print("Numerical but stored as object:", numerical_disguised_as_object)




Column types:
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
dtype: object
ID Columns: ['customerID']
Numerical Columns: ['tenure', 'MonthlyCharges', 'TotalCharges']
Categorical Columns: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
Numerical but stored as object: ['TotalChar

In [224]:
#Detect Semantic Missing Values. These are the missing values which cannot be detected using df.isnull().sum()
semantic_missing = ["", " ", "NA", "N/A", "null", "None", "unknown", "?"]
X_train = X_train.replace(semantic_missing, np.nan)
print("Missing values per column:")
print(X_train.isnull().sum())

Missing values per column:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        8
dtype: int64


In [225]:
# Based on Data Analysis,  we found the ID Columns: ['customerID']. Remove Irrelevant Columns
print("Number of Features Before:", X_train.shape[1])
X_train.drop(columns=['customerID'], inplace=True)
print("Number of Features After:", X_train.shape[1])

Number of Features Before: 20
Number of Features After: 19


In [226]:
# Based on Data Analysis, Total Charges Column was not detected as Numerical but stored as object: ['TotalCharges'].
X_train['TotalCharges'] = pd.to_numeric(X_train['TotalCharges'])
print("Column types:")
print(X_train.dtypes)

Column types:
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
dtype: object


In [227]:
#Detect duplicate rows. 
X_train.duplicated().sum()

np.int64(26)

In [228]:
#Keeping the duplicated can lead to overfitting, as the model might give "extra weight" to these specific profiles just because they appear more than once
X_train.drop_duplicates(inplace=True)
X_train.duplicated().sum()

np.int64(0)

In [229]:
X_train.describe()
#Columns with large scales will required standardization.

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,5608.0,5608.0,5608.0,5600.0
mean,0.164051,32.631063,65.084353,2313.14808
std,0.370355,24.531696,30.09182,2279.186379
min,0.0,0.0,18.4,18.85
25%,0.0,9.0,36.0,415.8125
50%,0.0,29.0,70.6,1411.775
75%,0.0,56.0,90.05,3849.4625
max,1.0,72.0,118.75,8684.8


In [230]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5608 entries, 3738 to 5639
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            5608 non-null   object 
 1   SeniorCitizen     5608 non-null   int64  
 2   Partner           5608 non-null   object 
 3   Dependents        5608 non-null   object 
 4   tenure            5608 non-null   int64  
 5   PhoneService      5608 non-null   object 
 6   MultipleLines     5608 non-null   object 
 7   InternetService   5608 non-null   object 
 8   OnlineSecurity    5608 non-null   object 
 9   OnlineBackup      5608 non-null   object 
 10  DeviceProtection  5608 non-null   object 
 11  TechSupport       5608 non-null   object 
 12  StreamingTV       5608 non-null   object 
 13  StreamingMovies   5608 non-null   object 
 14  Contract          5608 non-null   object 
 15  PaperlessBilling  5608 non-null   object 
 16  PaymentMethod     5608 non-null   object 
 1

In [231]:
#Handle Missing Values Properly. With the detection of Semantic Missing Values mechanism, Total Charges has 11 missing column. Filling the missing value with the median value
X_train['TotalCharges'] = X_train['TotalCharges'].fillna(X_train['TotalCharges'].median())
print("Missing values per column:")
print(X_train.isnull().sum())

Missing values per column:
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
dtype: int64


In [232]:
print ("Categorical Columns : ", categorical_cols)
print ("Numerical Columns : ", numerical_cols)

Categorical Columns :  ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
Numerical Columns :  ['tenure', 'MonthlyCharges', 'TotalCharges']


In [233]:
#Identify Binary Categorical Columns for Label Encoding and Multiclass Categorical Columns for One Hot Encoding.
binary_categorical_cols = []
multiclass_categorical_cols = []
for col in categorical_cols:
    unique_vals = X[col].nunique()    
    if unique_vals == 2:
        binary_categorical_cols.append(col)
    elif unique_vals > 2:
        multiclass_categorical_cols.append(col)
print ("Binary Categorical Columns : ", binary_categorical_cols)
print ("Multiclass Categorical Columns : ", multiclass_categorical_cols)

Binary Categorical Columns :  ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
Multiclass Categorical Columns :  ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']


In [234]:
#Encode Binary Categorical Features
label_encoders = {}
for col in binary_categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    label_encoders[col] = le
print ("Completed Label Encoding for the Binary Categorical Columns.")
#One Hot Encoding
X_train = pd.get_dummies(X_train, columns=multiclass_categorical_cols, drop_first=True, dtype=int)
print ("Completed OHE Encoding for the Multiclass Categorical Columns.")



Completed Label Encoding for the Binary Categorical Columns.
Completed OHE Encoding for the Multiclass Categorical Columns.


In [235]:

print("Rows:", X_train.shape[0])
print("Columns:", X_train.shape[1])
X_train.head()

Rows: 5608
Columns: 30


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,MultipleLines_No phone service,...,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
3738,1,0,0,0,35,0,0,49.2,1701.65,1,...,0,0,1,0,1,0,0,0,1,0
3151,1,0,1,1,15,1,0,75.1,1151.55,0,...,0,0,0,0,0,0,0,0,0,1
4860,1,0,1,1,13,0,0,40.55,590.35,1,...,1,0,0,0,0,0,1,0,0,1
3867,0,0,1,0,26,1,1,73.5,1905.7,0,...,0,0,1,0,1,0,1,1,0,0
3810,1,0,1,1,1,1,0,44.55,44.55,0,...,0,0,0,0,0,0,0,0,1,0


In [237]:
#remove high correlated features
corr_matrix = X_train.corr().abs()

upper_triangle = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
)

high_corr_features = [
    col for col in upper_triangle.columns
    if any(upper_triangle[col] > 0.8)
]

print ("High Correlated features are ", high_corr_features)

X_train.drop(columns=high_corr_features, inplace=True)


High Correlated features are  ['TotalCharges', 'MultipleLines_No phone service', 'OnlineSecurity_No internet service', 'OnlineBackup_No internet service', 'DeviceProtection_No internet service', 'TechSupport_No internet service', 'StreamingTV_No internet service', 'StreamingMovies_No internet service']


In [238]:
X_train.info()
print ("Binary Categorical Columns : ", binary_categorical_cols)
print ("Multiclass Categorical Columns : ", multiclass_categorical_cols)

<class 'pandas.core.frame.DataFrame'>
Index: 5608 entries, 3738 to 5639
Data columns (total 22 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   gender                                 5608 non-null   int64  
 1   SeniorCitizen                          5608 non-null   int64  
 2   Partner                                5608 non-null   int64  
 3   Dependents                             5608 non-null   int64  
 4   tenure                                 5608 non-null   int64  
 5   PhoneService                           5608 non-null   int64  
 6   PaperlessBilling                       5608 non-null   int64  
 7   MonthlyCharges                         5608 non-null   float64
 8   MultipleLines_Yes                      5608 non-null   int64  
 9   InternetService_Fiber optic            5608 non-null   int64  
 10  InternetService_No                     5608 non-null   int64  
 11  Online

In [242]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score
)
# ===============================
# 3. Stratified Train-Test Split
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)

# ===============================
# 4. Feature Scaling (ONLY continuous)
# ===============================
continuous_cols = ['tenure', 'MonthlyCharges']

scaler = StandardScaler()
X_train[continuous_cols] = scaler.fit_transform(X_train[continuous_cols])
X_test[continuous_cols] = scaler.transform(X_test[continuous_cols])

# ===============================
# 5. Train Baseline Model
# ===============================
log_reg = LogisticRegression(
    max_iter=1000,
    random_state=42
)

log_reg.fit(X_train, y_train)

# ===============================
# 6. Predictions
# ===============================
y_pred = log_reg.predict(X_test)
y_pred_prob = log_reg.predict_proba(X_test)[:, 1]

# ===============================
# 7. Evaluation
# ===============================
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

roc_auc = roc_auc_score(y_test, y_pred_prob)
print("ROC-AUC Score:", roc_auc)

Train shape: (5634, 20)
Test shape : (1409, 20)


ValueError: could not convert string to float: '4950-BDEUX'