In [1]:
import argparse
import json
import sys
from pathlib import Path

import optuna
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    accuracy_score,
    roc_curve,
    precision_recall_curve,
    auc,
)

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import lightgbm as lgb
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline
from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.common._registration import get_shape_calculator
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import (
    convert_lightgbm,
)
from skl2onnx.common.data_types import (
    FloatTensorType,
    Int64TensorType,
    StringTensorType,
)
from skl2onnx.common.shape_calculator import (
    calculate_linear_classifier_output_shapes,
    calculate_linear_regressor_output_shapes,
)

import matplotlib

# Pin a non-interactive backend: figures are only saved to files, so this
# also works in environments without an X server.
# NOTE(review): the `%matplotlib inline` magic below runs after this call and
# presumably replaces the Agg backend in a live kernel — confirm which backend
# is actually intended and drop the other.
matplotlib.use("Agg")
import matplotlib.pyplot as plt

%matplotlib inline


from pathlib import Path
# from model_export import export_pipeline_to_onnx, export_pipeline_to_pmml

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Input / output locations, relative to the notebook's working directory ---
# Every location is a pathlib.Path from the start, so no name is ever bound to
# a plain string first.
DEFAULT_DATA = Path("../data/Titanic-Dataset.csv")
# NOTE(review): this is the same file as DEFAULT_DATA, so any "test" metrics
# computed from it are not an independent hold-out — confirm the intended file.
DEFAULT_TEST_DATA = Path("../data/Titanic-Dataset.csv")
# NOTE(review): the directory is named "random_forest" while the exported
# model files below are named "titanic_lightgbm" — confirm the intended name.
DEFAULT_REPORT_DIR = Path("../reports/titanic/random_forest")
DEFAULT_PMML_PATH = Path("../model/titanic_lightgbm.pmml")
DEFAULT_ONNX_PATH = Path("../model/titanic_lightgbm.onnx")

# --- Column roles ---
TARGET = "Survived"
NUMERIC_FEATURES = ["Age", "SibSp", "Parch", "Fare"]
CATEGORICAL_FEATURES = ["Pclass", "Sex", "Embarked"]
# Model input columns, in the order they are selected from the data frame.
FEATURES = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# Independent copy of the categorical column list.
STRING_FEATURES = CATEGORICAL_FEATURES.copy()

In [3]:
# Load the CSV at DEFAULT_DATA into a DataFrame and preview its first rows.
df = pd.read_csv(DEFAULT_DATA)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Build the training/validation features and target from `df`.
# Fail fast when the CSV lacks a column the model depends on: this set used to
# be computed but never inspected, so a missing column only surfaced later as
# an opaque KeyError.
missing_cols = set(FEATURES + [TARGET]) - set(df.columns)
if missing_cols:
    raise ValueError(f"Dataset is missing required columns: {sorted(missing_cols)}")

# Rows without a label cannot be used for supervised training.
df = df.dropna(subset=[TARGET])
X_train_val = df[FEATURES].copy()
y_train_val = df[TARGET].astype(int)
# Turn every categorical column into plain `str` values, replacing missing
# entries with the literal "missing" so they become a category of their own.
for col in CATEGORICAL_FEATURES:
    X_train_val[col] = X_train_val[col].astype("string").fillna("missing").astype(str)

In [5]:
# Build the evaluation set from DEFAULT_TEST_DATA.
# This cell used to re-slice the training frame `df`, so DEFAULT_TEST_DATA was
# never read and the "external test" metrics were computed on the very rows
# the model had been fitted on.
# NOTE(review): as long as DEFAULT_TEST_DATA points at the same file as
# DEFAULT_DATA (the current configuration), the evaluation is still not an
# independent hold-out — point it at a separate file.
test_df = pd.read_csv(DEFAULT_TEST_DATA)

missing_cols = set(FEATURES + [TARGET]) - set(test_df.columns)
if missing_cols:
    raise ValueError(f"Test dataset is missing required columns: {sorted(missing_cols)}")

# Rows without a label cannot be scored.
test_df = test_df.dropna(subset=[TARGET])
X_test = test_df[FEATURES].copy()
y_test = test_df[TARGET].astype(int)
# Same categorical handling as for the training data: plain `str` values with
# missing entries replaced by the literal "missing".
for col in CATEGORICAL_FEATURES:
    X_test[col] = X_test[col].astype("string").fillna("missing").astype(str)

In [6]:
# Numeric columns: fill missing values with the column median.
# No scaling step is applied in this pipeline.
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median"))]
)
# Categorical columns: one-hot encode. Missing values are expected to have
# been replaced before the data reaches this transformer. Categories not seen
# during fit are ignored instead of raising (handle_unknown="ignore").
categorical_transformer = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each column group through its own transformer. Keeping preprocessing
# inside the pipeline means it can be included when exporting to PMML / ONNX.
# (This object used to be constructed twice with identical arguments; the
# redundant first definition has been removed.)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, NUMERIC_FEATURES),
        ("cat", categorical_transformer, CATEGORICAL_FEATURES),
    ]
)

In [7]:
# Tuning configuration. One seed drives the train/validation split, the
# LightGBM models and the Optuna sampler. (This cell used to set
# random_state = 0 here and rebind it to 42 before the study ran; because the
# objective reads the global at call time, 0 was never actually used.)
test_size = 0.2
random_state = 42
# NOTE(review): a single trial evaluates only one sampled configuration, so no
# real search takes place — raise this for an actual hyperparameter search.
n_trials = 1

# The split does not depend on the trial, so compute it once instead of
# repeating identical work inside every objective call.
X_tune_train, X_tune_valid, y_tune_train, y_tune_valid = train_test_split(
    X_train_val,
    y_train_val,
    test_size=test_size,
    random_state=random_state,
    stratify=y_train_val,
)


def objective(trial: optuna.trial.Trial) -> float:
    """Score one sampled LightGBM configuration on the fixed validation split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC of the positive-class probabilities on the validation split.

    NOTE(review): LightGBM documents that bagging (`subsample`) only takes
    effect when `subsample_freq` is non-zero; it is left at its default here,
    so the sampled `subsample` value may have no effect — confirm.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 1200),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 256),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 200),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
    }

    # Every sampled key is a LGBMClassifier keyword argument, so the dict is
    # passed straight through.
    model = lgb.LGBMClassifier(
        **params,
        objective="binary",
        random_state=random_state,
        n_jobs=-1,
    )

    pipeline = Pipeline(steps=[("preprocess", preprocessor), ("model", model)])
    pipeline.fit(X_tune_train, y_tune_train)
    proba = pipeline.predict_proba(X_tune_valid)[:, 1]
    return roc_auc_score(y_tune_valid, proba)


study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=random_state),
)
study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
print(f"Best trial ROC AUC: {study.best_value:.4f}")
print(f"Best params: {study.best_params}")
best_params = study.best_params

[I 2025-11-10 01:11:44,183] A new study created in memory with name: no-name-b2ac979b-c558-46f3-be2f-c2e51b9167d4


[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000255 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 213
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028


[I 2025-11-10 01:11:44,769] Trial 0 finished with value: 0.8391963109354414 and parameters: {'n_estimators': 637, 'learning_rate': 0.17254716573280354, 'num_leaves': 192, 'subsample': 0.8394633936788146, 'colsample_bytree': 0.6624074561769746, 'min_child_samples': 39, 'reg_alpha': 3.3323645788192616e-08, 'reg_lambda': 0.6245760287469887}. Best is trial 0 with value: 0.8391963109354414.


Best trial ROC AUC: 0.8392
Best params: {'n_estimators': 637, 'learning_rate': 0.17254716573280354, 'num_leaves': 192, 'subsample': 0.8394633936788146, 'colsample_bytree': 0.6624074561769746, 'min_child_samples': 39, 'reg_alpha': 3.3323645788192616e-08, 'reg_lambda': 0.6245760287469887}


In [8]:
# Display the `n_estimators` value selected by the study.
best_params["n_estimators"]

637

In [9]:
# Final model: the tuned hyperparameters plus the fixed settings.
# `best_params` contains exactly the eight keys sampled in the objective
# (n_estimators, learning_rate, num_leaves, subsample, colsample_bytree,
# min_child_samples, reg_alpha, reg_lambda), all of which are LGBMClassifier
# keyword arguments, so the dict is unpacked directly.
model = lgb.LGBMClassifier(
    **best_params,
    objective="binary",
    random_state=random_state,
    n_jobs=-1,
)

# Preprocessing and estimator are bundled into one pipeline and fitted on the
# complete training/validation data.
pipeline = Pipeline([("preprocess", preprocessor), ("model", model)])
pipeline.fit(X_train_val, y_train_val)

[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000269 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 228
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,boosting_type,'gbdt'
,num_leaves,192
,max_depth,-1
,learning_rate,0.17254716573280354
,n_estimators,637
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [10]:
label = "external test"

# Positive-class probabilities and hard class predictions for the test set.
proba = pipeline.predict_proba(X_test)[:, 1]
preds = pipeline.predict(X_test)

# Metrics based on the hard predictions.
acc = accuracy_score(y_test, preds)
report = classification_report(y_test, preds)

# Metrics based on the predicted probabilities: ROC curve and its area.
roc_auc = roc_auc_score(y_test, proba)
fpr, tpr, _ = roc_curve(y_test, proba)

# Precision-recall curve and the area under it.
precision, recall, _ = precision_recall_curve(y_test, proba)
pr_auc = auc(recall, precision)

# Emit the summary, one entry per print call.
for line in (
    f"\n=== Evaluation on {label} set ===",
    report,
    f"Accuracy: {acc:.4f}",
    f"ROC AUC: {roc_auc:.4f}",
    f"PR AUC:  {pr_auc:.4f}",
):
    print(line)


=== Evaluation on external test set ===
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       549
           1       0.97      0.94      0.96       342

    accuracy                           0.97       891
   macro avg       0.97      0.96      0.96       891
weighted avg       0.97      0.97      0.97       891

Accuracy: 0.9663
ROC AUC: 0.9958
PR AUC:  0.9938




In [11]:
label_key = "external_test"

# The report directory is not guaranteed to exist (e.g. on a fresh checkout);
# without this, opening the metrics file below raises FileNotFoundError.
DEFAULT_REPORT_DIR.mkdir(parents=True, exist_ok=True)

# === Save metrics (JSON) ===
metrics = {
    "roc_auc": float(roc_auc),
    "pr_auc": float(pr_auc),
    "accuracy": float(acc),
    "support": len(y_test),
}
metrics_path = DEFAULT_REPORT_DIR / f"{label_key}_metrics.json"
# Explicit encoding so the file does not depend on the platform default.
with metrics_path.open("w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2)

# === Plot ROC curve ===
# Explicit figure/axes objects keep each plot independent of pyplot's
# implicit "current figure" state.
roc_fig, roc_ax = plt.subplots(figsize=(6, 6))
roc_ax.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.4f})")
roc_ax.plot([0, 1], [0, 1], "k--", label="Random")  # chance-level diagonal
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title(f"ROC Curve - {label}")
roc_ax.legend(loc="lower right")
roc_fig.tight_layout()
roc_path = DEFAULT_REPORT_DIR / f"{label_key}_roc_curve.png"
roc_fig.savefig(roc_path, dpi=120)
plt.close(roc_fig)  # release the figure once it is on disk

# === Plot precision-recall curve ===
pr_fig, pr_ax = plt.subplots(figsize=(6, 6))
pr_ax.plot(recall, precision, label=f"PR curve (AUC = {pr_auc:.4f})")
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title(f"Precision–Recall Curve - {label}")
pr_ax.legend(loc="lower left")
pr_ax.grid(True)
pr_fig.tight_layout()
pr_path = DEFAULT_REPORT_DIR / f"{label_key}_pr_curve.png"
pr_fig.savefig(pr_path, dpi=120)
plt.close(pr_fig)

# === Print results to console ===
print(f"\n=== Evaluation on {label} set ===")
print(report)
print(f"Accuracy: {acc:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print(f"PR AUC:  {pr_auc:.4f}")
print(f"Saved reports to: {DEFAULT_REPORT_DIR.resolve()}")


=== Evaluation on external test set ===
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       549
           1       0.97      0.94      0.96       342

    accuracy                           0.97       891
   macro avg       0.97      0.96      0.96       891
weighted avg       0.97      0.97      0.97       891

Accuracy: 0.9663
ROC AUC: 0.9958
PR AUC:  0.9938
Saved reports to: /Users/masatosasaki/Desktop/real-time-inference/reports/titanic/random_forest


In [12]:
# Export the fitted pipeline (preprocessing + model) to PMML.
# Create the output directory first so the export does not fail just because
# the folder is missing.
# NOTE(review): PMMLPipeline is imported in this notebook but a plain sklearn
# Pipeline is passed here — confirm the installed sklearn2pmml version accepts
# that (and `with_repr=True`) for non-PMMLPipeline objects.
DEFAULT_PMML_PATH.parent.mkdir(parents=True, exist_ok=True)
sklearn2pmml(pipeline, DEFAULT_PMML_PATH, with_repr=True)

In [14]:
# Register a converter for lgb.LGBMClassifier with skl2onnx, using the
# LightGBM converter from onnxmltools and skl2onnx's linear-classifier shape
# calculator. Presumably this is what lets convert_sklearn handle the LightGBM
# step of the pipeline — confirm against the skl2onnx documentation.
update_registered_converter(
    lgb.LGBMClassifier,
    "LightGbmLGBMClassifier",
    calculate_linear_classifier_output_shapes,
    convert_lightgbm,
    options={"nocl": [True, False], "zipmap": [True, False, "columns"]},
)

In [15]:
# Declare one ONNX input per raw feature column, each with shape [None, 1]:
# numeric columns first as float tensors, then categorical columns as string
# tensors.
numeric_inputs = [(col, FloatTensorType([None, 1])) for col in NUMERIC_FEATURES]
categorical_inputs = [
    (col, StringTensorType([None, 1])) for col in CATEGORICAL_FEATURES
]
initial_type = numeric_inputs + categorical_inputs

# Convert the whole fitted pipeline with the pinned opset versions.
model_onnx = convert_sklearn(
    pipeline,
    name="pipeline_lightgbm",
    initial_types=initial_type,
    target_opset={"": 12, "ai.onnx.ml": 2},
)

# Persist the serialized ONNX model.
with DEFAULT_ONNX_PATH.open("wb") as onnx_file:
    onnx_file.write(model_onnx.SerializeToString())
