In [1]:
import sys
sys.path.append("..")

import pandas as pd
from sklearn.model_selection import train_test_split

from src.preprocessing import (
    fix_total_charges,
    drop_identifier_columns,
    build_preprocessor,
    save_preprocessor
)


In [2]:
# Load the raw Telco churn data and separate the features from the target.
df = pd.read_csv("../data/raw/telco_churn.csv")

X = df.drop(columns=["Churn"])

# Series.map leaves every label that is not a key of the dict as NaN, so a
# missing or unexpectedly spelled "Churn" value would silently yield a NaN
# target. Fail loudly here rather than let it leak into the split/model.
y = df["Churn"].map({"Yes": 1, "No": 0})
if y.isna().any():
    unexpected_labels = df.loc[y.isna(), "Churn"].unique().tolist()
    raise ValueError(
        f"Unexpected or missing values in 'Churn' column: {unexpected_labels}"
    )


In [3]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# target and uses a fixed seed so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)


In [4]:
# Apply the same cleaning helpers (imported from src.preprocessing) to both
# splits, one step at a time: train first, then test, for each step.
X_train, X_test = [fix_total_charges(split) for split in (X_train, X_test)]
X_train, X_test = [drop_identifier_columns(split) for split in (X_train, X_test)]


In [5]:
# Build the preprocessing object from the training features only
# (build_preprocessor comes from src.preprocessing; its internals are not
# visible in this notebook).
preprocessor = build_preprocessor(X_train)

# Order matters: fit on the training split first, then reuse the already
# fitted preprocessor on the test split so test rows take no part in fitting.
# NOTE(review): presumably a scikit-learn-style transformer — confirm in
# src.preprocessing.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


In [6]:
# Hand the preprocessor and its destination path to save_preprocessor
# (presumably serialises it to that file — confirm in src.preprocessing).
preprocessor_path = "../models/preprocessor.pkl"
save_preprocessor(preprocessor, preprocessor_path)