# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Lift the column display limit so wide DataFrames are shown without elided columns.
pd.options.display.max_columns = None

# Load Dataset & Quick Scan

In [None]:
# kagglehub is imported in the cell where it is used.
import kagglehub
# Download the dataset identified by this handle. The returned `path` is treated
# as a directory by the loading cell below (it is passed to os.listdir).
path = kagglehub.dataset_download("blastchar/telco-customer-churn")

In [None]:
# Locate the CSV inside the download directory and load it.
# os.listdir returns entries in arbitrary order and may include non-CSV files,
# so filter by extension and sort to make the choice deterministic.
csv_files = sorted(
    entry for entry in os.listdir(path) if entry.lower().endswith('.csv')
)
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in {path!r}")

filename = csv_files[0]
fp = os.path.join(path, filename)
df = pd.read_csv(fp)

In [None]:
# Preview the first rows to eyeball the column names and raw values.
df.head()

In [None]:
# df.info() prints the column names, non-null counts and dtypes in one call,
# so the narrower individual checks below are left disabled.
# df.shape
# df.columns.tolist()
# df.dtypes
df.info()

In [None]:
# Summary statistics for every column (numeric and non-numeric), transposed so
# that each original column becomes one row of the summary.
df.describe(include='all').transpose()

# Fix TotalCharges
- convert to numeric
- remove NaN rows

In [None]:
# errors='coerce' turns values that cannot be parsed as numbers into NaN instead
# of raising; the resulting NaN rows are counted and dropped in the next cells.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [None]:
# Count the values that are NaN after the numeric conversion.
df['TotalCharges'].isna().sum() # 11 (value noted by the author for this dataset)

In [None]:
# Keep only the rows whose TotalCharges is present. The explicit .copy() makes
# the result an independent frame for the column assignments that follow.
df = df.dropna(subset=['TotalCharges']).copy()

In [None]:
# Re-check after dropping: no missing TotalCharges values should remain.
df['TotalCharges'].isna().sum() # 0

In [None]:
# Row and column counts after removing the rows with missing TotalCharges.
df.shape

# Convert Categorical Columns

In [None]:
# Columns that are cast to the pandas category dtype in the next cell.
cat_cols = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'Churn',
]

In [None]:
# Cast every column listed in cat_cols to the category dtype with a single
# astype call driven by a column -> dtype mapping.
df = df.astype(dict.fromkeys(cat_cols, 'category'))

In [None]:
# Confirm that the columns in cat_cols now report the category dtype.
df.dtypes

# Standardize Categorical Values

Replace "No internet service" & "No phone service" with "No"

In [None]:
# Columns whose "No internet service" value is folded into "No" below.
internet_related_cols = [
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]

In [None]:
# Fold "No internet service" into "No" for the internet add-on columns.
# Only internet_related_cols is processed (the list defined for this step),
# rather than every categorical column.
# The replacement runs on plain object values and the result is re-cast to
# category: calling Series.replace directly on a categorical column is
# deprecated since pandas 2.2, and a category cannot be renamed to a label
# that already exists ("No").
for col in internet_related_cols:
    df[col] = (
        df[col]
        .astype(object)
        .replace({'No internet service': 'No'})
        .astype('category')
    )

In [None]:
# Fold "No phone service" into "No" for MultipleLines.
# The replacement runs on plain object values and the result is re-cast to
# category: calling Series.replace directly on a categorical column is
# deprecated since pandas 2.2, and a category cannot be renamed to a label
# that already exists ("No").
df['MultipleLines'] = (
    df['MultipleLines']
    .astype(object)
    .replace({'No phone service': 'No'})
    .astype('category')
)

In [None]:
# validation check
for col in internet_related_cols + ['MultipleLines']:
    print(col, df[col].unique())

# Final Validation Checks

In [None]:
# Final row and column counts of the cleaned frame.
df.shape

In [None]:
# Missing-value count per column of the cleaned frame.
df.isna().sum()

In [None]:
# Final dtype of every column before export.
df.dtypes

In [None]:
# Summary statistics of the cleaned frame for all columns, transposed so that
# each column becomes one row of the summary.
df.describe(include='all').transpose()

# Export Cleaned Dataset

In [None]:
# Create the output directory (relative to the current working directory);
# exist_ok=True makes re-running this cell a no-op when it already exists.
os.makedirs("data", exist_ok=True)

In [None]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
# NOTE: CSV is plain text, so the category dtypes are not stored in the file
# and need to be re-applied after reloading.
df.to_csv("data/cleaned_dataset_v1.csv", index=False)