In [None]:
# churn_eda.ipynb

# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# 1️⃣ Load dataset
df = pd.read_csv("data/Telco-Customer-Churn.csv")
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; print it so the preview is shown wherever this code runs.
print(df.head())

# 2️⃣ Check info & missing values
# DataFrame.info() writes its report directly and returns None, so wrapping
# it in print() would emit a stray "None" line after the report.
df.info()
# Per-column count of missing values.
print(df.isnull().sum())
# Class balance of the target column.
print(df['Churn'].value_counts())

# 3️⃣ Drop customerID
df = df.drop(columns=['customerID'])

# 4️⃣ Convert 'TotalCharges' to numeric & fill missing
# Values that cannot be parsed as numbers become NaN (errors='coerce') and
# are then replaced by the median of the values that did parse.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges.fillna(total_charges.median())

# 5️⃣ Encode categorical features
# Every object-dtype column except the target is replaced by integer codes
# from a fresh LabelEncoder; the fitted encoders are not kept.
for col in df.select_dtypes(include='object').columns:
    if col == 'Churn':
        # The target is mapped explicitly below instead.
        continue
    df[col] = LabelEncoder().fit_transform(df[col])

# Encode target
# Explicit mapping pins "No" -> 0 and "Yes" -> 1; any other value becomes NaN.
df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})

# 6️⃣ Feature Engineering
# Bucket `tenure` into five ordinal groups; labels=False yields the integer
# bin index (0-4) instead of Interval labels.
# pd.cut intervals are right-closed by default, so the first bin would be
# (0, 12] and any row with tenure == 0 would fall outside every bin and get
# NaN. include_lowest=True closes the first bin on the left so those rows
# land in group 0. Values above 72 still fall outside the bins (NaN).
df['tenure_group'] = pd.cut(
    df['tenure'],
    bins=[0, 12, 24, 48, 60, 72],
    labels=False,
    include_lowest=True,
)
# Interaction term: monthly charge multiplied by tenure.
df['MonthlyCharges_x_Tenure'] = df['MonthlyCharges'] * df['tenure']

# 7️⃣ Visualize churn distribution
# Bar chart of how many rows fall into each Churn class.
sns.countplot(data=df, x='Churn')
plt.show()

# 8️⃣ Correlation heatmap
# Pairwise correlations of all columns, annotated to two decimals.
plt.figure(figsize=(12, 8))
sns.heatmap(
    df.corr(),
    fmt=".2f",
    annot=True,
)
plt.show()

# 9️⃣ Split data
# Target vector first, then the feature matrix (everything except Churn).
y = df['Churn']
X = df.drop(columns=['Churn'])

# 80/20 split, stratified on the target so both parts keep the same class
# proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print("EDA, preprocessing, feature engineering complete!")
