In [2]:
# 📌 Step 1: Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# 📌 Step 2: Load Cleaned Dataset
# Read the cleaned Telco CSV into `df` and report its (rows, columns) shape.
df = pd.read_csv("../data/processed/telco_cleaned.csv")
print(f"Dataset Shape: {df.shape}")

# 📌 Step 3: Handle Missing Values
# Per-column null counts as loaded, before anything is filled or dropped.
print("\nMissing Values Before Handling:")
missing_before = df.isna().sum()
print(missing_before)

# Numeric imputation: every int64/float64 column has its nulls replaced by that
# column's median.
# NOTE: the medians are taken over the whole frame, i.e. before any train/test
# split is made further down.
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
column_medians = df[num_cols].median()
df[num_cols] = df[num_cols].fillna(column_medians)

# Rows that still contain a null after the numeric fill (e.g. in categorical
# columns) are removed from `df` in place.
df.dropna(inplace=True)

# Null counts again, to confirm the result of the two steps above.
print("\nMissing Values After Handling:")
missing_after = df.isna().sum()
print(missing_after)


# 📌 Step 4: Encode Categorical Features
# The target is pulled out first and encoded on its own, so it stays as integer
# class labels and never passes through the feature scaler.
target_encoder = LabelEncoder()
y = pd.Series(target_encoder.fit_transform(df["Churn"]), index=df.index, name="Churn")

# customerID is presumably a unique per-customer identifier (verify against the
# data); label-encoding it would add a meaningless ordinal feature, so it is
# dropped when present. `df` itself is left unmodified from here on.
X = df.drop(columns=["Churn", "customerID"], errors="ignore").copy()

categorical_cols = X.select_dtypes(include=["object"]).columns

# One fitted encoder per column, kept by column name, so every mapping can be
# inspected or inverted later (encoders[col].classes_ / .inverse_transform).
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    X[col] = encoders[col].fit_transform(X[col])

print("\nCategorical Features Encoded Successfully!")

# 📌 Step 5: Split Data into Train & Test Sets
# The split happens BEFORE scaling so that no statistic of the test rows can
# influence the fitted scaler. Same arguments as before (20% test, seed 42,
# stratified on the target).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Work on copies so the column assignments below do not write into slices of X.
X_train = X_train.copy()
X_test = X_test.copy()
assert len(X_train) + len(X_test) == len(X), "train/test split lost or duplicated rows"

# 📌 Step 6: Normalize Numerical Features
# Min/max are learned from the training features only and then applied to the
# test features; test values may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler()
num_cols = X_train.select_dtypes(include="number").columns

if len(num_cols) > 0:
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])

print("\nNumerical Features Scaled Successfully!")

# 📌 Step 7: Save Processed Data
X_train.to_csv("../data/processed/X_train.csv", index=False)
X_test.to_csv("../data/processed/X_test.csv", index=False)
y_train.to_csv("../data/processed/y_train.csv", index=False)
y_test.to_csv("../data/processed/y_test.csv", index=False)

print("✅ Data Preprocessing Completed & Files Saved!")


Dataset Shape: (7043, 21)

Missing Values Before Handling:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

Missing Values After Handling:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges    