# Day 1 to 5:
1. Load Dataset & Quick Scan / Exploration
2. Data Clean
3. Data Analysis
4. Feature Engineering & Outlier Detection
5. Full EDA + Visualization + Insights

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

## Data Load & Quick Scan / Exploration

In [None]:
import kagglehub

# Download the dataset; the returned value is used below as a local directory path.
path = kagglehub.dataset_download('blastchar/telco-customer-churn')

# os.listdir() returns entries in arbitrary order, so taking element [0] blindly
# could pick up a non-data file. Select the CSV explicitly and deterministically.
csv_files = sorted(name for name in os.listdir(path) if name.lower().endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in downloaded dataset directory: {path}")

filename = csv_files[0]
fp = os.path.join(path, filename)

In [None]:
# Load the downloaded CSV into the working DataFrame used by the rest of the notebook.
df = pd.read_csv(fp)

# Optional quick-scan helpers; uncomment one at a time to inspect the raw frame.
# df.head(3)
# df.shape
# df.dtypes
# df.columns.tolist()
# df.info()
# df.describe(include='all').T

## Data Clean

In [None]:
# Per-column NaN count and its share of all rows, in percent.
missing_count = df.isna().sum()
missing = pd.DataFrame({
    'missing_count': missing_count,
    'missing_pct': missing_count / len(df) * 100,
})
missing

In [None]:
# Number of rows that are exact repeats of an earlier row.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
duplicates

In [None]:
# Parse TotalCharges as numbers; values that cannot be parsed become NaN.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges

print(f"Missing count before: {total_charges.isna().sum()}")

# Keep only the rows whose TotalCharges parsed successfully.
df = df.dropna(subset=['TotalCharges']).copy()

print(f"Missing count after: {df['TotalCharges'].isna().sum()}")

In [None]:
# Distinct non-null values per column (defaults spelled out for clarity).
unique_vals = df.nunique(axis=0, dropna=True)
unique_vals

In [None]:
# Columns with at most this many distinct values are treated as categorical.
cat_threshold = 4

is_low_cardinality = unique_vals.le(cat_threshold)
low_cardinality_cols = unique_vals.loc[is_low_cardinality].index

print(f"Columns to be converted to category: {len(low_cardinality_cols)}\n")
low_cardinality_cols

In [None]:
# Cast all low-cardinality columns to 'category' in a single astype call
# (a per-column loop would work too, but one call avoids repeated assignment).
category_dtypes = {name: 'category' for name in low_cardinality_cols}
df = df.astype(category_dtypes)

df.dtypes

In [None]:
# Add-on service columns where 'No internet service' is folded into a plain 'No'.
internet_related_col = [
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies'
]

# These columns may already be 'category' dtype (low-cardinality columns are cast
# during cleaning). Series.replace on a categorical is deprecated in pandas 2.2 and
# can leave the old label behind as an unused category, so do the replacement on
# plain objects and re-cast afterwards.
for col in internet_related_col:
    df[col] = (
        df[col]
        .astype('object')
        .replace({'No internet service': 'No'})
        .astype('category')
    )

df['MultipleLines'] = (
    df['MultipleLines']
    .astype('object')
    .replace({'No phone service': 'No'})
    .astype('category')
)

# validation: print the number of distinct labels left in each cleaned column
for col in internet_related_col + ['MultipleLines']:
    print(f"{col}: {df[col].nunique()}")

In [None]:
# df.isna().sum()

# df.dtypes

# df.describe(include='all').T

In [None]:
# Persist the cleaned frame; create the output folder first if it is missing.
cleaned_path = "data/cleaned_dataset_v1.csv"
os.makedirs(os.path.dirname(cleaned_path), exist_ok=True)

df.to_csv(cleaned_path, index=False)

## Data Analysis

In [None]:
# Row count per Churn label and its share of all rows, in percent.
churn_counts = df['Churn'].value_counts()
churn_summary = churn_counts.to_frame('count')
churn_summary['percent'] = (churn_summary['count'] / len(df) * 100).round(2)
churn_summary

In [None]:
demographic_cols = ['gender', 'SeniorCitizen', 'Dependents']

# Row-normalised Churn split (in %) for each demographic column, keyed by column name.
demographic_churn = {}

for col in demographic_cols:
    churn_share = pd.crosstab(df[col], df['Churn'], normalize='index')
    table = churn_share.mul(100).round(2)
    demographic_churn[col] = table
    print(f"--- {col} vs Churn (%) ---\n{table}\n")

In [None]:
service_cols = [
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]

# Row-normalised Churn split (in %) for each service column, keyed by column name.
service_churn = {}

for col in service_cols:
    churn_share = pd.crosstab(df[col], df['Churn'], normalize='index')
    table = churn_share.mul(100).round(2)
    service_churn[col] = table
    print(f"--- {col} vs Churn (%) ---\n{table}\n")

In [None]:
account_cols = ['Contract', 'PaymentMethod']

# Row-normalised Churn split (in %) for each account column, keyed by column name.
account_churn = {}

for col in account_cols:
    churn_share = pd.crosstab(df[col], df['Churn'], normalize='index')
    table = churn_share.mul(100).round(2)
    account_churn[col] = table
    print(f"--- {col} vs Churn (%) ---\n{table}\n")

In [None]:
# Bin edges every 12 units of tenure: 0, 12, ..., 72.
bins = list(range(0, 73, 12))

# One label per bin: '0-12' for the first, then '<low + 1>-<high>' for the rest.
labels = [
    f"{low + 1 if low else low}-{high}"
    for low, high in zip(bins[:-1], bins[1:])
]

# include_lowest=True makes the first bin closed on its left edge as well.
df['tenure_group'] = pd.cut(df['tenure'], bins=bins, labels=labels, include_lowest=True)

tenure_churn = pd.crosstab(df['tenure_group'], df['Churn'], normalize='index').mul(100).round(2)
tenure_churn

In [None]:
# Mean MonthlyCharges per Churn label, rounded to 2 decimals.
monthly_by_churn = df.groupby('Churn')['MonthlyCharges']
arpu_summary = monthly_by_churn.mean().round(2)
arpu_summary

In [None]:
# LTV feature: current monthly charge multiplied by tenure.
df['LTV'] = df['MonthlyCharges'] * df['tenure']

# Summarise the LTV column that was just created. The original aggregated
# TotalCharges here, so the new feature never reached the summary.
ltv_summary = round(df.groupby('Churn')['LTV'].mean(), 2)
ltv_summary

Notes: Why Recalculate LTV (feature creation)
- The new `LTV` feature (`MonthlyCharges` × `tenure`) is created to capture the **monetary expectation**, or **current customer valuation**, at the customer's present monthly rate. It is used instead of relying on `TotalCharges`, which is typically slightly less than LTV because customers often receive introductory discounts or special promotions.

In [None]:
# Mean MonthlyCharges per Contract type, rounded to 2 decimals.
monthly_by_contract = df.groupby('Contract')['MonthlyCharges']
arpu_by_contract = monthly_by_contract.mean().round(2)
arpu_by_contract

## Feature Engineering & Outlier Detection

In [None]:
num_cols = ['MonthlyCharges', 'TotalCharges', 'LTV']

for col in num_cols:
    values = df[col]

    # Visual check: one boxplot per numeric column.
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=values)
    plt.title(f"Boxplot - {col}")
    plt.show()

    # IQR fences: 1.5 * IQR below the first / above the third quartile.
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - (1.5 * iqr)
    upper = q3 + (1.5 * iqr)

    # Rows falling strictly outside either fence.
    is_outlier = values.lt(lower) | values.gt(upper)

    print(f"{col}: Lower={lower}, Upper={upper}")
    print(f"Outlier Count: {int(is_outlier.sum())}")

In [None]:
# Make tenure_group an ordered categorical with an explicit low-to-high order.
tenure_order = ['0-12', '13-24', '25-36', '37-48', '49-60', '61-72']
tenure_as_category = df['tenure_group'].astype('category')
df['tenure_group'] = tenure_as_category.cat.reorder_categories(tenure_order, ordered=True)
df['tenure_group'].value_counts()

In [None]:
# Split MonthlyCharges into three quantile-based tiers.
arpu_tier_labels = ['Low', 'Medium', 'High']
arpu_tier = pd.qcut(df['MonthlyCharges'], q=3, labels=arpu_tier_labels)
df['ARPU_tier'] = arpu_tier.astype('category')
df['ARPU_tier'].value_counts()

In [None]:
# 1 when the payment method is 'Electronic check', else 0.
pays_by_echeck = df['PaymentMethod'] == 'Electronic check'
df['is_electronic_check'] = pays_by_echeck.astype(int)
df['is_electronic_check'].value_counts()

In [None]:
# 1 when the contract is 'Month-to-month', else 0.
on_monthly_contract = df['Contract'] == 'Month-to-month'
df['is_monthly_contract'] = on_monthly_contract.astype(int)
df['is_monthly_contract'].value_counts()

In [None]:
# 1 when the internet service is 'Fiber optic', else 0.
has_fiber = df['InternetService'] == 'Fiber optic'
df['is_fiber'] = has_fiber.astype(int)
df['is_fiber'].value_counts()

In [None]:
# New 0/1 flag column -> source column; the flag is 1 where the source value is 'No'.
missing_service_flags = {
    'no_tech_support': 'TechSupport',
    'no_online_security': 'OnlineSecurity',
    'no_device_protection': 'DeviceProtection',
}

for flag_name, service_col in missing_service_flags.items():
    df[flag_name] = (df[service_col] == 'No').astype('int')

# Print the three value counts separated by " \n ", in creation order.
print(*(df[flag_name].value_counts() for flag_name in missing_service_flags), sep=" \n ")

In [None]:
# 1 when SeniorCitizen equals 1, else 0.
senior_mask = df['SeniorCitizen'] == 1
df['is_senior'] = senior_mask.astype(int)
df['is_senior'].value_counts()

In [None]:
# Treat a tenure of 0 as 1 so the division below never divides by zero.
billable_months = df['tenure'].replace(0, 1)
df['avg_cost_per_month'] = df['TotalCharges'].div(billable_months)

# The no-online-security flag (0/1) divided by the monthly charge.
df['security_to_cost_ratio'] = df['no_online_security'].div(df['MonthlyCharges'])

In [None]:
# Contract label -> numeric contract length used for the value proxy.
contract_map = dict(zip(
    ['Month-to-month', 'One year', 'Two year'],
    [1, 12, 24],
))

# Cast the mapped values to int so they can be multiplied with MonthlyCharges.
contract_months = df['Contract'].map(contract_map).astype('int')
df['contract_length'] = contract_months
df['contract_value_proxy'] = df['MonthlyCharges'] * contract_months

In [None]:
# final validation
# Prints, separated by dashed rules: the frame's shape, its first three rows,
# the number of columns, and the full list of column names.
print(f"Shape: {df.shape}\n\n{'--- ' * 10}\n\n{df.head(3)}\n\n{'--- ' * 10}\n\nColumns: {len(df.columns.tolist())}\n\n{df.columns.tolist()}")

In [None]:
# Persist the feature-enriched frame; create the output folder first if it is missing.
featured_path = "data/cleaned_featured_dataset_v2.csv"
os.makedirs(os.path.dirname(featured_path), exist_ok=True)
df.to_csv(featured_path, index=False)

## Full EDA & Insights
- Basic Visual Validation
- Churn % for Engineered Features

### STEP 1 — Basic Visual Validation

In [None]:
# Churn rate across contract types

# Churn may be 'category' dtype here (low-cardinality columns are cast during
# cleaning). Mapping a categorical yields another categorical, which seaborn does
# not treat as numeric, so cast to int to get a real 0/1 target whose bar height
# is the churn rate.
churn_flag = df['Churn'].map({'No': 0, 'Yes': 1}).astype(int)

sns.barplot(data=df, x='Contract', y=churn_flag)
plt.title("Churn Rate by Contract Type")
plt.ylabel("Churn rate")
plt.show()

In [None]:
# Churn vs tenure groups

# Churn may be 'category' dtype here (low-cardinality columns are cast during
# cleaning). Mapping a categorical yields another categorical, which seaborn does
# not treat as numeric, so cast to int to get a real 0/1 target whose bar height
# is the churn rate.
churn_flag = df['Churn'].map({'No': 0, 'Yes': 1}).astype(int)

# Pass the frame via data= like the sibling plots, and use a title that matches
# the x-axis (the original reused the contract-type title).
sns.barplot(data=df, x='tenure_group', y=churn_flag)
plt.title("Churn Rate by Tenure Group")
plt.ylabel("Churn rate")
plt.show()

In [None]:
# Churn vs ARPU tier

# Churn may be 'category' dtype here (low-cardinality columns are cast during
# cleaning). Mapping a categorical yields another categorical, which seaborn does
# not treat as numeric, so cast to int to get a real 0/1 target whose bar height
# is the churn rate.
churn_flag = df['Churn'].map({'No': 0, 'Yes': 1}).astype(int)

sns.barplot(data=df, x='ARPU_tier', y=churn_flag)
plt.title("Churn Rate by Revenue Tier")
plt.ylabel("Churn rate")
plt.show()

In [None]:
# Churn vs risk flags

risk_cols = ['is_electronic_check','is_monthly_contract','is_fiber','no_online_security','no_tech_support']

# Build the numeric 0/1 churn target once, outside the loop. Churn may be
# 'category' dtype here, and mapping a categorical yields another categorical;
# with a numeric x and a categorical y seaborn would flip the bar orientation,
# so cast to int explicitly.
churn_flag = df['Churn'].map({'No': 0, 'Yes': 1}).astype(int)

for col in risk_cols:
    sns.barplot(x=df[col], y=churn_flag)
    plt.title(f"Churn Rate by {col}")
    plt.ylabel("Churn rate")
    plt.show()

### STEP 2 — Churn % for Engineered Features

In [None]:
# Churn by ARPU tier: row-normalised split in percent, 2 decimals
arpu_tier_share = pd.crosstab(df['ARPU_tier'], df['Churn'], normalize='index')
arpu_tier_share.mul(100).round(2)

In [None]:
# Churn by risk flags
# Report row-normalised percentages rounded to 2 decimals, matching the other
# churn tables in this notebook (the original printed raw 0-1 fractions).
for col in risk_cols + ['is_senior']:
    print(f"\n--- {col} ---")
    print(round(pd.crosstab(df[col], df['Churn'], normalize='index') * 100, 2))

In [None]:
# Churn by contract value proxy (bucketed into four quantile-based groups)

value_bucket_labels = ['Q1_Low', 'Q2', 'Q3', 'Q4_High']
df['contract_value_bucket'] = pd.qcut(df['contract_value_proxy'], q=4, labels=value_bucket_labels)

bucket_share = pd.crosstab(df['contract_value_bucket'], df['Churn'], normalize='index')
bucket_share.mul(100)

In [None]:
# final checks
# Display the frame's (rows, columns) after all cleaning and feature steps above.
df.shape