# Imports

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [61]:
pd.set_option('display.max_columns', None)

# Load Dataset & Quick Scan

In [62]:
import kagglehub
path = kagglehub.dataset_download("blastchar/telco-customer-churn")

In [63]:
filename = os.listdir(path)[0]
fp = os.path.join(path, filename)
df = pd.read_csv(fp)

In [64]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [65]:
# df.shape
# df.columns.tolist()
# df.dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [66]:
df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
customerID,7043.0,7043.0,7590-VHVEG,1.0,,,,,,,
gender,7043.0,2.0,Male,3555.0,,,,,,,
SeniorCitizen,7043.0,,,,0.162147,0.368612,0.0,0.0,0.0,0.0,1.0
Partner,7043.0,2.0,No,3641.0,,,,,,,
Dependents,7043.0,2.0,No,4933.0,,,,,,,
tenure,7043.0,,,,32.371149,24.559481,0.0,9.0,29.0,55.0,72.0
PhoneService,7043.0,2.0,Yes,6361.0,,,,,,,
MultipleLines,7043.0,3.0,No,3390.0,,,,,,,
InternetService,7043.0,3.0,Fiber optic,3096.0,,,,,,,
OnlineSecurity,7043.0,3.0,No,3498.0,,,,,,,


# Fix TotalCharges
- convert to numeric
- remove NaN rows

In [67]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [68]:
df['TotalCharges'].isna().sum() # 11

np.int64(11)

In [69]:
# df = df[~df['TotalCharges'].isna()].copy()
df = df[df['TotalCharges'].notna()].copy()

In [70]:
df['TotalCharges'].isna().sum() # 0

np.int64(0)

In [71]:
df.shape

(7032, 21)

# Convert Categorical Column

In [72]:
cat_cols = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod', 'Churn'
]

In [73]:
for col in cat_cols:
    df[col] = df[col].astype('category')

In [74]:
df.dtypes

customerID            object
gender              category
SeniorCitizen          int64
Partner             category
Dependents          category
tenure                 int64
PhoneService        category
MultipleLines       category
InternetService     category
OnlineSecurity      category
OnlineBackup        category
DeviceProtection    category
TechSupport         category
StreamingTV         category
StreamingMovies     category
Contract            category
PaperlessBilling    category
PaymentMethod       category
MonthlyCharges       float64
TotalCharges         float64
Churn               category
dtype: object

# Standardized Categorical Values

Replace "No internet service" & "No phone service" with "No"

In [75]:
internet_related_cols = [
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies'
]

In [76]:
for col in cat_cols:
    df[col] = df[col].replace({'No internet service': 'No'})

  df[col] = df[col].replace({'No internet service': 'No'})


In [77]:
df['MultipleLines'] = df['MultipleLines'].replace({'No phone service': 'No'})

  df['MultipleLines'] = df['MultipleLines'].replace({'No phone service': 'No'})


In [78]:
# validation check
for col in internet_related_cols + ['MultipleLines']:
    print(col, df[col].unique())

OnlineSecurity ['No', 'Yes']
Categories (2, object): ['No', 'Yes']
OnlineBackup ['Yes', 'No']
Categories (2, object): ['No', 'Yes']
DeviceProtection ['No', 'Yes']
Categories (2, object): ['No', 'Yes']
TechSupport ['No', 'Yes']
Categories (2, object): ['No', 'Yes']
StreamingTV ['No', 'Yes']
Categories (2, object): ['No', 'Yes']
StreamingMovies ['No', 'Yes']
Categories (2, object): ['No', 'Yes']
MultipleLines ['No', 'Yes']
Categories (2, object): ['No', 'Yes']


# Final Validation Checks

In [84]:
df.shape

(7032, 21)

In [79]:
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [80]:
df.dtypes

customerID            object
gender              category
SeniorCitizen          int64
Partner             category
Dependents          category
tenure                 int64
PhoneService        category
MultipleLines       category
InternetService     category
OnlineSecurity      category
OnlineBackup        category
DeviceProtection    category
TechSupport         category
StreamingTV         category
StreamingMovies     category
Contract            category
PaperlessBilling    category
PaymentMethod       category
MonthlyCharges       float64
TotalCharges         float64
Churn               category
dtype: object

In [81]:
df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
customerID,7032.0,7032.0,7590-VHVEG,1.0,,,,,,,
gender,7032.0,2.0,Male,3549.0,,,,,,,
SeniorCitizen,7032.0,,,,0.1624,0.368844,0.0,0.0,0.0,0.0,1.0
Partner,7032.0,2.0,No,3639.0,,,,,,,
Dependents,7032.0,2.0,No,4933.0,,,,,,,
tenure,7032.0,,,,32.421786,24.54526,1.0,9.0,29.0,55.0,72.0
PhoneService,7032.0,2.0,Yes,6352.0,,,,,,,
MultipleLines,7032.0,2.0,No,4065.0,,,,,,,
InternetService,7032.0,3.0,Fiber optic,3096.0,,,,,,,
OnlineSecurity,7032.0,2.0,No,5017.0,,,,,,,


# Export Cleaned Dataset

In [82]:
os.makedirs("data", exist_ok=True)

In [83]:
df.to_csv("data/cleaned_dataset_v1.csv", index=False)