In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

df = pd.read_csv("../data/raw/telco_churn.csv")

X = df.drop(columns=["Churn"])
y = df["Churn"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(X_train.shape, X_test.shape)


(5634, 20) (1409, 20)


In [2]:
y_train = y_train.map({"Yes": 1, "No": 0})
y_test = y_test.map({"Yes": 1, "No": 0})


In [3]:
def fix_total_charges(df):
    df = df.copy()
    df["TotalCharges"] = pd.to_numeric(
        df["TotalCharges"], errors="coerce"
    )
    return df


In [4]:

X_train = fix_total_charges(X_train)
X_test = fix_total_charges(X_test)


In [5]:
X_train.isnull().sum()


customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        8
dtype: int64

In [6]:
from sklearn.impute import SimpleImputer

num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns

num_imputer = SimpleImputer(strategy="median")

X_train[num_cols] = num_imputer.fit_transform(X_train[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])


In [7]:
X_train = X_train.drop(columns=["customerID"])
X_test = X_test.drop(columns=["customerID"])


In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


In [12]:
cat_cols = X_train.select_dtypes(include=["object"]).columns
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns

preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ]
)


In [13]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


In [15]:
import joblib

joblib.dump(preprocessor, "../models/preprocessor.pkl")


['../models/preprocessor.pkl']

In [16]:
import numpy as np

np.save("../data/processed/X_train.npy", X_train_processed)
np.save("../data/processed/X_test.npy", X_test_processed)
np.save("../data/processed/y_train.npy", y_train.values)
np.save("../data/processed/y_test.npy", y_test.values)


In [17]:
X_train_processed.shape
X_test_processed.shape


(1409, 45)