In [None]:
import pandas as pd
import numpy as np
import pickle
import os

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier



# Load the semicolon-delimited dataset into a DataFrame.
df = pd.read_csv("../data/bank-additional-full.csv", sep=";")

# Encode the target as integers: "yes" -> 1, "no" -> 0.
target_codes = {"yes": 1, "no": 0}
df["y"] = df["y"].map(target_codes)

# Separate the target column from the remaining feature columns.
y = df["y"]
X = df.drop(columns="y")

# Train/test split: hold out 20% of the rows for testing. The split is
# stratified on the target and seeded so it is reproducible across runs.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Defining preprocessor
#
# Columns are routed by dtype: object-dtype columns are treated as
# categorical, everything else as numerical.
categorical_cols = X.select_dtypes(include=["object"]).columns
numerical_cols = X.select_dtypes(exclude=["object"]).columns

# Numerical columns are standardized; categorical columns are one-hot
# encoded, with categories unseen during fit ignored at transform time.
numeric_step = ("num", StandardScaler(), numerical_cols)
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore"),
    categorical_cols,
)

preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step]
)

# Building pipelines
#
# Every model gets the same two-step layout: the shared preprocessing step
# followed by the classifier, so each pipeline accepts the raw feature frame.


def _make_pipeline(classifier):
    """Return a Pipeline that runs ``preprocessor`` and then *classifier*."""
    # NOTE(review): the same ``preprocessor`` object is passed to every
    # pipeline, exactly as before. If the pipelines are ever fit on
    # different data, consider giving each one its own copy
    # (``sklearn.base.clone``) -- confirm before changing.
    return Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", classifier),
    ])


log_reg_pipeline = _make_pipeline(
    LogisticRegression(max_iter=1000, random_state=42)
)

dt_pipeline = _make_pipeline(DecisionTreeClassifier(random_state=42))

knn_pipeline = _make_pipeline(KNeighborsClassifier())

nb_pipeline = _make_pipeline(GaussianNB())

rf_pipeline = _make_pipeline(RandomForestClassifier(random_state=42))

# ``use_label_encoder`` is intentionally not passed: xgboost reported it as
# an unused parameter at fit time, so it only produced a warning.
xgb_pipeline = _make_pipeline(
    XGBClassifier(
        eval_metric="logloss",
        random_state=42,
    )
)

# Training models on raw data
#
# Each pipeline performs its own preprocessing, so the unprocessed training
# frame is passed straight to ``fit``. Pipelines are fit in a fixed order.
pipelines_to_fit = (
    log_reg_pipeline,
    dt_pipeline,
    knn_pipeline,
    nb_pipeline,
    rf_pipeline,
    xgb_pipeline,
)
for model_pipeline in pipelines_to_fit:
    model_pipeline.fit(X_train, y_train)

# Saving pipelines
#
# Each fitted pipeline is pickled to its own file under ``models/``.

os.makedirs("models", exist_ok=True)

pipelines_by_path = {
    "models/logistic_regression.pkl": log_reg_pipeline,
    "models/decision_tree.pkl": dt_pipeline,
    "models/knn.pkl": knn_pipeline,
    "models/naive_bayes.pkl": nb_pipeline,
    "models/random_forest.pkl": rf_pipeline,
    "models/xgboost.pkl": xgb_pipeline,
}

for model_path, fitted_pipeline in pipelines_by_path.items():
    # The context manager guarantees the file is flushed and closed even if
    # pickling fails; a bare ``open(...)`` argument leaves the handle open.
    with open(model_path, "wb") as model_file:
        pickle.dump(fitted_pipeline, model_file)

print("All pipelines saved successfully!")


Parameters: { "use_label_encoder" } are not used.



✅ All production-ready pipelines saved successfully!
