In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import joblib
import os

In [6]:
# Load the raw churn dataset from the working directory.
raw_csv_path = "telco_customer_churn.csv"
df = pd.read_csv(raw_csv_path)

In [7]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,cust_0,Male,0,Yes,Yes,63,Yes,No,No,No internet service,...,Yes,No internet service,No,Yes,One year,No,Mailed check,53.57,8131.42,No
1,cust_1,Female,1,Yes,Yes,12,Yes,No,No,Yes,...,No internet service,No internet service,No,No internet service,Month-to-month,Yes,Mailed check,99.39,655.05,No
2,cust_2,Male,0,No,No,11,Yes,No,No,No internet service,...,Yes,Yes,Yes,No,One year,Yes,Electronic check,56.0,7201.27,No
3,cust_3,Male,0,Yes,Yes,13,Yes,No,No,Yes,...,Yes,Yes,No internet service,Yes,One year,No,Credit card (automatic),104.86,3069.05,No
4,cust_4,Male,0,Yes,Yes,35,Yes,Yes,No,Yes,...,No internet service,Yes,No,No internet service,Month-to-month,No,Credit card (automatic),64.44,7534.5,No


In [8]:
# Fix TotalCharges type
# Coerce to numeric (entries that cannot be parsed become NaN), then fill
# the resulting gaps with the median of the parsed values.
# NOTE(review): the median is computed over the whole frame, i.e. before any
# train/test split -- confirm that is acceptable for this project.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric.fillna(total_charges_numeric.median())


In [None]:
# Drop customerID
# The identifier column is removed before the feature/target split.
# errors="ignore" keeps this cell idempotent: because `df` is rebound in
# place, a second run would otherwise raise KeyError on the missing column.
df = df.drop(columns=["customerID"], errors="ignore")


In [None]:
# 2. Split features and target
# The target is the "Churn" column; every remaining column is a feature.
y = df["Churn"]
X = df.drop(columns=["Churn"])


In [None]:
# Encode target
# Fit on the original string labels in df["Churn"] rather than on `y`.
# `y` is overwritten by this cell, so fitting on `y` would, on a re-run,
# re-fit the encoder on already-encoded integers and overwrite the saved
# pickle with an encoder that can no longer map back to the string labels.
le = LabelEncoder()
y = le.fit_transform(df["Churn"])  # classes are sorted, so No -> 0, Yes -> 1
os.makedirs("models", exist_ok=True)
joblib.dump(le, "models/label_encoder.pkl")


In [None]:
# 3. Separate numeric & categorical
# ------------------------
# Numeric columns are picked by dtype; every other feature column is treated
# as categorical. Both lists keep the column order of X.
numeric_cols = list(X.select_dtypes(include=np.number).columns)
_numeric_lookup = set(numeric_cols)
categorical_cols = [col for col in X.columns if col not in _numeric_lookup]


In [None]:
# 4. Encode categorical
# ------------------------
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_categorical_encoded = ohe.fit_transform(X[categorical_cols])
joblib.dump(ohe, "models/ohe_encoder.pkl")


In [None]:
# 5. Scale numeric
# ------------------------
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X[numeric_cols])
joblib.dump(scaler, "models/scaler.pkl")

In [9]:
# 6. Combine numeric + categorical
# ------------------------
X_processed = np.hstack([X_numeric_scaled, X_categorical_encoded])

In [10]:
# 7. Train-Test Split (80-20)
# ------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# 8. Handle imbalance with SMOTE (ONLY on training data)
# ------------------------
# Only the training split is resampled; the test split is left untouched.
# NOTE(review): SMOTE synthesises rows by interpolating between neighbours,
# so one-hot columns in synthetic rows may take fractional values -- confirm
# this is acceptable for the downstream model.
sm = SMOTE(random_state=42)
resampled_train = sm.fit_resample(X_train, y_train)
X_train_balanced, y_train_balanced = resampled_train


In [12]:
# 9. Save processed arrays
# ------------------------
# Persist the balanced training split and the untouched test split as .npy
# files, creating the output directory first if needed.
os.makedirs("data/processed", exist_ok=True)

processed_outputs = [
    ("data/processed/X_train_balanced.npy", X_train_balanced),
    ("data/processed/y_train_balanced.npy", y_train_balanced),
    ("data/processed/X_test.npy", X_test),
    ("data/processed/y_test.npy", y_test),
]
for out_path, out_array in processed_outputs:
    np.save(out_path, out_array)