# Machine Learning Pipeline


In [3]:
# Mount Google Drive at /content/drive so that later cells can read the CSV files
# stored under that path. The google.colab module is provided by the Colab runtime.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from lightgbm import LGBMClassifier
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, confusion_matrix, accuracy_score,
    precision_score, recall_score, f1_score, classification_report
)
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [5]:
# Folder on the mounted Drive that holds the train/test CSV files.
# The trailing slash matters: file names are appended to this string directly.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/Multicampus-8/프로젝트/'

# Name of the label column that the models predict.
target = "diagnosed_diabetes"

# Read the training split first, then the test split.
train_df, test_df = (
    pd.read_csv(f"{DATA_PATH}{file_name}")
    for file_name in ("train.csv", "test.csv")
)

In [1]:

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from lightgbm import LGBMClassifier


In [7]:
# Labels: the column named by `target`.
y = train_df[target]

# Features: every remaining column except the row identifier and the label.
# errors="ignore" keeps the drop from failing when one of the two columns is absent.
X = train_df.drop(columns=["id", target], errors="ignore")

# Stratified 80/20 hold-out split; the fixed seed makes the split repeatable.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [8]:

# Feature Groups
# Columns handed to StandardScaler by the preprocessor. The order is significant:
# it determines the column order of the transformed matrix.
numeric_features = [
    "waist_to_hip_ratio",
    "alcohol_consumption_per_week",
    "cholesterol_total",
    "cardiovascular_history",
    "ldl_cholesterol",
    "triglycerides",
    "family_history_diabetes",
    "sleep_hours_per_day",
    "diet_score",
    "systolic_bp",
    "age",
    "screen_time_hours_per_day",
    "physical_activity_minutes_per_week",
    "hypertension_history",
    "heart_rate",
    "diastolic_bp",
    "hdl_cholesterol",
]

# Columns that the preprocessor one-hot encodes.
categorical_features = [
    "smoking_status",
    "employment_status",
    "ethnicity",
    "gender",
    "education_level",
]

# Column that is ordinal-encoded, together with its category order
# (position in income_cats becomes the encoded integer).
target_ordinal = ["income_level"]
income_cats = ["Low", "Lower-Middle", "Middle", "Upper-Middle", "High"]


In [9]:

# Train / Validation Split
# The stratified 80/20 split (random_state=42) was already computed when X and y were
# built, producing X_train_raw, X_val_raw, y_train and y_val. Running the very same
# train_test_split call again only repeats the work, so the existing split is reused.
X_train, X_val = X_train_raw, X_val_raw

# Guard against hidden notebook state: features and labels must describe the same rows.
assert X_train.index.equals(y_train.index), "X_train / y_train rows are misaligned"
assert X_val.index.equals(y_val.index), "X_val / y_val rows are misaligned"


In [10]:

# Preprocessor
# One-hot encoder for the nominal columns; categories not seen during fit are
# encoded as all-zero rows instead of raising, and the output is a dense array.
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Ordinal encoder for income_level: codes follow the order of income_cats and any
# value outside that list is mapped to -1.
income_encoder = OrdinalEncoder(
    categories=[income_cats],
    unknown_value=-1,
    handle_unknown="use_encoded_value",
)

# Columns not named in any of the three groups are not passed on
# (ColumnTransformer's default remainder setting is used).
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),
        ("cat", onehot_encoder, categorical_features),
        ("ord", income_encoder, target_ordinal),
    ]
)


In [None]:

# Model + Random Search
# subsample_freq=1 is required for the `subsample` values searched below to have any
# effect: LightGBM only performs row bagging when the bagging frequency is non-zero,
# and the default is 0. Without it, configurations that differ only in `subsample`
# train the same model and waste search iterations.
# NOTE(review): both the estimator and the search use n_jobs=-1; confirm that this
# nested parallelism does not oversubscribe the CPU on the target machine.
lgbm = LGBMClassifier(random_state=42, n_jobs=-1, subsample_freq=1)

# Preprocessing lives inside the pipeline so that it is re-fitted on each CV training fold.
pipeline = Pipeline([
    ("prep", preprocessor),
    ("model", lgbm)
])

# Candidate hyperparameters; the "model__" prefix routes each one to the LightGBM step.
param_dist = {
    "model__n_estimators": [200, 300, 400],
    "model__max_depth": [4, 6, 8],
    "model__learning_rate": [0.05, 0.1],
    "model__subsample": [0.8, 0.9]
}

# Stratified, shuffled 5-fold CV with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Sample 10 of the 36 possible combinations and rank them by ROC AUC.
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=10,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
    random_state=42
)

# Only the training portion is used for the search; best_estimator_ is the pipeline
# refitted on all of X_train with the best parameters found.
search.fit(X_train, y_train)
best_model = search.best_estimator_


In [None]:

# Feature Importance
# Pair every output column of the fitted preprocessor with the importance value the
# fitted LightGBM step reports for it.
fitted_steps = best_model.named_steps
feature_names = fitted_steps["prep"].get_feature_names_out()
importances = fitted_steps["model"].feature_importances_

# Most important features first.
importance_df = pd.DataFrame(
    {"feature": feature_names, "importance": importances}
).sort_values(by="importance", ascending=False)

importance_df.head()


In [None]:

# Feature Removal
# Numeric columns to leave out of the final model.
remove_features = [
    "alcohol_consumption_per_week",
    "sleep_hours_per_day",
    "diastolic_bp",
]

# Remaining numeric columns, in their original order.
numeric_features_reduced = list(
    filter(lambda name: name not in remove_features, numeric_features)
)


In [None]:

# Reduced Preprocessor
# Same three transformer groups as the full preprocessor; only the numeric column
# list differs (numeric_features_reduced instead of numeric_features).
reduced_transformers = [
    # Standardise the retained numeric columns.
    ("num", StandardScaler(), numeric_features_reduced),
    # Dense one-hot encoding; unseen categories become all-zero rows.
    (
        "cat",
        OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
        categorical_features,
    ),
    # income_level is encoded by its position in income_cats; unknown values become -1.
    (
        "ord",
        OrdinalEncoder(
            categories=[income_cats],
            unknown_value=-1,
            handle_unknown="use_encoded_value",
        ),
        target_ordinal,
    ),
]

preprocessor_reduced = ColumnTransformer(transformers=reduced_transformers)


In [None]:

# Final Model Training
# clone() creates a fresh, unfitted LightGBM estimator that carries the tuned
# hyperparameters. Passing the fitted estimator object itself would make
# final_pipeline.fit() refit it in place, so best_model would end up holding a model
# trained on the reduced feature set while its own "prep" step still emits the full
# set (breaking best_model.predict and the feature-importance cell on re-run).
final_pipeline = Pipeline([
    ("prep", preprocessor_reduced),
    ("model", clone(best_model.named_steps["model"]))
])

# Fit on all labelled rows (train + validation) before predicting on the test set.
final_pipeline.fit(X, y)


In [None]:

# Test Prediction
# Name of the row-identifier column. It was referenced below without ever being
# defined, which raised NameError; "id" matches the column dropped from the training
# features and the key used in the submission frame.
id_col = "id"

# The identifier is not a feature; the pipeline's ColumnTransformer picks the columns
# it needs by name from what is left.
test_X = test_df.drop(columns=[id_col])

# Column 1 of predict_proba is the probability of the second class in classes_
# (the positive class for a 0/1 label).
test_proba = final_pipeline.predict_proba(test_X)[:, 1]

submission = pd.DataFrame({
    "id": test_df[id_col],
    "diagnosed_diabetes": test_proba
})

# Written to the current working directory, without the pandas index column.
submission.to_csv("submission_final.csv", index=False)
submission.head()
