In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
)
import mlflow
import mlflow.sklearn
import dagshub

In [None]:
# Load the raw diabetes dataset (relative path; resolved against the kernel's
# working directory).
data = pd.read_csv("../data/raw/diabetes.csv")

# ---------> preprocess

# Encode gender as 0/1. Any value other than "Female"/"Male" is absent from the
# mapping and therefore becomes NaN; those gaps are then filled with the most
# frequent encoded value.
# NOTE(review): the mode is computed on the full dataset, i.e. before the
# train/test split — confirm this is acceptable or move the imputation into
# the fitted pipeline.
data["gender"] = data["gender"].map({"Female": 0, "Male": 1})
data["gender"] = data["gender"].fillna(data["gender"].mode()[0])

# One-hot encode smoking_history (first category dropped) and pass every other
# column through unchanged.
transformer = ColumnTransformer(
    transformers=[
        (
            "cat-ohe",
            # handle_unknown="ignore": a category that appears only at
            # transform time is encoded as all zeros (with a warning) instead
            # of raising, so a rare category missing from the training split
            # cannot break transform() on the test split.
            OneHotEncoder(drop="first", handle_unknown="ignore"),
            ["smoking_history"],
        )
    ],
    remainder="passthrough",
    # Always return a dense array. The result is wrapped in pd.DataFrame
    # later, which must not depend on the transformer's density heuristic
    # deciding between a sparse matrix and a dense array.
    sparse_threshold=0,
)

In [None]:
# ----------------> train/test split and feature encoding

# Separate the target column from the feature columns.
y = data["diabetes"]
X = data.drop(columns="diabetes")

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified on the target — confirm that is
# intended.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the transformer on the training split only, then reuse the fitted
# transformer (no re-fitting) on the test split.
train_encoded = transformer.fit_transform(x_train)
test_encoded = transformer.transform(x_test)
encoded_columns = transformer.get_feature_names_out()

# Wrap the encoded arrays in DataFrames that keep the original row labels, so
# encoded rows can be traced back to rows of `data`.
X_train_transformed = pd.DataFrame(
    train_encoded, columns=encoded_columns, index=x_train.index
)
X_test_transformed = pd.DataFrame(
    test_encoded, columns=encoded_columns, index=x_test.index
)

In [10]:
# Preview the first five rows of the encoded training features.
X_train_transformed.head(n=5)

Unnamed: 0,cat-ohe__smoking_history_current,cat-ohe__smoking_history_ever,cat-ohe__smoking_history_former,cat-ohe__smoking_history_never,cat-ohe__smoking_history_not current,remainder__gender,remainder__age,remainder__hypertension,remainder__heart_disease,remainder__bmi,remainder__HbA1c_level,remainder__blood_glucose_level
75220,0.0,0.0,1.0,0.0,0.0,1.0,73.0,0.0,0.0,24.77,3.5,80.0
48955,0.0,0.0,0.0,1.0,0.0,1.0,80.0,0.0,0.0,24.6,5.7,145.0
44966,0.0,1.0,0.0,0.0,0.0,0.0,38.0,0.0,0.0,24.33,4.0,158.0
13568,0.0,0.0,0.0,0.0,1.0,0.0,26.0,0.0,0.0,18.87,5.0,100.0
92727,1.0,0.0,0.0,0.0,0.0,0.0,61.0,1.0,0.0,22.11,4.5,85.0


In [None]:
# Sanity check: show the raw row behind the first encoded training row.
# The encoded frame carries x_train's index *labels*, so the lookup has to be
# label-based (.loc). A positional .iloc lookup only matches while `data` keeps
# its default 0..n-1 index. Deriving the label from the encoded frame, instead
# of hard-coding it, keeps this check valid if the split changes.
data.loc[X_train_transformed.index[0]]

gender                    1.0
age                      73.0
hypertension                0
heart_disease               0
smoking_history        former
bmi                     24.77
HbA1c_level               3.5
blood_glucose_level        80
diabetes                    0
Name: 75220, dtype: object