<a href="https://colab.research.google.com/github/simonklaren/telco-churn-pipeline/blob/main/notebooks/telco_customer_churn_sklearn_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
#imports

import pandas as pd

#sklearn imports
from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report, confusion_matrix

#imports voor export model
import joblib
import os

# Data importeren

In [21]:
# Load the Telco customer churn data from the CSV next to the notebook.
df = pd.read_csv("telco_customer_churn.csv")

# Convert TotalCharges to a numeric column; any value that cannot be parsed
# becomes NaN because of errors='coerce'.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Rows with tenure == 0 and a missing TotalCharges get TotalCharges = 0.
mask_na = df['TotalCharges'].isna()
mask_tenure0 = df['tenure'] == 0
df.loc[mask_na & mask_tenure0, 'TotalCharges'] = 0

# Drop any rows whose TotalCharges is still NaN (the original author notes
# that none are present in this data set).
df = df.dropna(subset=['TotalCharges'])

# Preview the cleaned frame. This has to be the last expression of the cell:
# a bare expression in the middle of a cell is evaluated and then discarded,
# so the preview was never displayed in its previous position.
df.head()

# Test-split maken

In [22]:
# Name of the label column (its values are "Yes" / "No").
target_col = "Churn"

# Everything except the label is a candidate feature; the label is kept apart.
X = df.drop(columns=[target_col])
y = df[target_col]

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# kolommen splitsen (numeriek en categoriaal)

In [23]:
# Columns treated as numeric by the preprocessing step.
numeric_features = ["tenure", "MonthlyCharges", "TotalCharges"]

# Columns treated as categorical by the preprocessing step.
# The order is significant: it determines the order of the encoded columns,
# so it is kept exactly as it was.
categorical_features = [
    "gender", "SeniorCitizen", "Partner", "Dependents",
    "PhoneService", "MultipleLines", "InternetService", "OnlineSecurity",
    "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV",
    "StreamingMovies", "Contract", "PaperlessBilling", "PaymentMethod",
]


# data ml-ready maken

In [24]:
# Column-wise preprocessing: each transformer only sees the columns listed
# for it. Columns named in neither list are dropped; remainder="drop" is
# ColumnTransformer's default and is spelled out here for clarity.
preprocessor = ColumnTransformer(
    [
        # Standardise every numeric feature.
        ("num", StandardScaler(), numeric_features),
        # One-hot encode every categorical feature; categories not seen
        # during fit are ignored instead of raising an error.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    remainder="drop",
)

# pipeline

In [25]:
# Random forest with 200 trees, a fixed seed for reproducible results, and
# n_jobs=-1 to use all available CPU cores.
clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)

# Chain preprocessing and classifier so they are fitted and applied as one
# estimator.
model = Pipeline([("preprocessor", preprocessor), ("clf", clf)])

# testen en evalueren

In [26]:
# Fit the pipeline on the training split and predict the held-out split.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Print each heading followed by its metric on the next line.
print("Classification report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Classification report:
              precision    recall  f1-score   support

          No       0.83      0.89      0.86      1035
         Yes       0.62      0.48      0.54       374

    accuracy                           0.78      1409
   macro avg       0.72      0.69      0.70      1409
weighted avg       0.77      0.78      0.77      1409

Confusion matrix:
[[923 112]
 [194 180]]


In [27]:
# map maken voor het model
os.makedirs("models", exist_ok=True)

# pipeline opslaan
joblib.dump(model, "models/churn_pipeline.pkl")

['models/churn_pipeline.pkl']

In [28]:

# Confirm the export by listing the contents of the models folder.
exported_files = os.listdir("models")
print(exported_files)

['churn_pipeline.pkl']
