# モデル調査

In [1]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import pandas as pd
import json
from datetime import datetime
from pathlib import Path
import logging

# Logging setup: configure the root logger at INFO level and create a
# logger named after this module for the notebook's own messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
from enum import Enum, auto
import sys
# Extend the import search path two directories up so that the `src.*`
# imports below can resolve.
# NOTE(review): presumably '../..' is the repository root relative to the
# notebook's working directory — confirm.
sys.path.append('../..')

from src.utils.io import load_month_data
from src.utils.preprocess import preprocess_pipeline


In [3]:
def _log_json_artifact(payload, local_path: Path, artifact_dir: str) -> None:
    """Write *payload* as JSON to *local_path* and attach it to the active MLflow run."""
    local_path.parent.mkdir(exist_ok=True, parents=True)
    with open(local_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)
    mlflow.log_artifact(str(local_path), artifact_dir)


def train_and_log_experiment(
    data_info: list[int],
    model_name: str = "logistic_regression",
    params: dict | None = None,
    test_size: float = 0.2,
    random_state: int = 42,
    experiment_name: str = "citibike_membership",
) -> dict[str, float]:
    """
    Train a classifier on one month of data and log the run to MLflow.

    The data is loaded with ``load_month_data(*data_info)``, passed through
    ``preprocess_pipeline``, and split into train/test sets. The column
    ``is_member`` is the target; every other column is used as a feature.

    Parameters
    ----------
    data_info : list[int]
        Positional arguments forwarded to ``load_month_data``
        (e.g. ``[2014, 1]``).
    model_name : str
        Model identifier; also used as the MLflow run-name prefix and the
        ``model_type`` tag. Only ``"logistic_regression"`` is supported.
    params : dict, optional
        Hyperparameters passed to the model constructor. Defaults to
        ``{"max_iter": 500, "random_state": random_state}``.
    test_size : float
        Fraction of the rows held out as the test set.
    random_state : int
        Seed used for the train/test split (and for the default ``params``).
    experiment_name : str
        Name of the MLflow experiment the run is recorded under.

    Returns
    -------
    dict
        Test accuracy/precision/recall/F1 and train accuracy/F1.

    Raises
    ------
    ValueError
        If ``model_name`` is not supported.
    Exception
        Any other error is logged and re-raised unchanged.

    Notes
    -----
    JSON artifacts are first written under ``interim/`` relative to the
    current working directory and are left there after being uploaded.
    """
    try:
        # Build the estimator first so that an unsupported model name or
        # invalid hyperparameters fail before the data is loaded.
        if params is None:
            params = {"max_iter": 500, "random_state": random_state}

        if model_name == "logistic_regression":
            model = LogisticRegression(**params)
        else:
            raise ValueError(f"Unsupported model: {model_name}")

        logger.info("Loading data from %s-%s", data_info[0], data_info[1])

        # Load and preprocess the data
        df_org = load_month_data(*data_info)
        df = preprocess_pipeline(df_org)

        X = df.drop("is_member", axis=1)
        y = df["is_member"]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        logger.info("Data split: train=%d, test=%d", len(X_train), len(X_test))

        # MLflow setup
        mlflow.set_experiment(experiment_name)

        with mlflow.start_run(
            run_name=f"{model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        ) as run:
            logger.info("MLflow run started: %s", run.info.run_id)

            # Scalar facts about the dataset/split go in as run parameters.
            split_params = {
                "data_info": data_info,
                "test_size": test_size,
                "random_state": random_state,
                "train_samples": len(X_train),
                "test_samples": len(X_test),
                "n_features": X_train.shape[1],
            }
            mlflow.log_params(split_params)

            # Feature names and class distributions are stored as JSON artifacts.
            _log_json_artifact(
                X_train.columns.tolist(),
                Path("interim/features.json"),
                "dataset_info",
            )
            _log_json_artifact(
                {
                    "train": y_train.value_counts().to_dict(),
                    "test": y_test.value_counts().to_dict(),
                },
                Path("interim/class_distribution.json"),
                "dataset_info",
            )

            # Dataset tracking is best-effort: a failure here only warns.
            try:
                mlflow.log_input(
                    mlflow.data.from_pandas(            # type: ignore
                        df,
                        name=f"citibike_data_{datetime.now().strftime('%Y%m%d')}",
                    ),
                    context="training",
                )
                logger.info("Dataset logged to MLflow")
            except Exception as e:
                logger.warning(f"Failed to log dataset: {e}")

            # Log the model hyperparameters. MLflow refuses to re-log an
            # existing key with a different value, so a hyperparameter that
            # clashes with a split parameter (typically ``random_state``)
            # is stored under a ``model_`` prefix instead of aborting the run.
            model_params = {
                (
                    f"model_{key}"
                    if key in split_params and str(split_params[key]) != str(value)
                    else key
                ): value
                for key, value in params.items()
            }
            mlflow.log_params(model_params)  # type: ignore

            logger.info("Training model...")
            model.fit(X_train, y_train)

            y_pred_train = model.predict(X_train)
            y_pred_test = model.predict(X_test)

            metrics = {
                # Test-set metrics
                "test_accuracy": accuracy_score(y_test, y_pred_test),
                "test_precision": precision_score(y_test, y_pred_test, zero_division=0),
                "test_recall": recall_score(y_test, y_pred_test, zero_division=0),
                "test_f1_score": f1_score(y_test, y_pred_test, zero_division=0),
                # Train-set metrics (to spot overfitting)
                "train_accuracy": accuracy_score(y_train, y_pred_train),
                "train_f1_score": f1_score(y_train, y_pred_train, zero_division=0),
            }

            mlflow.log_metrics(metrics)

            logger.info("Test Accuracy: %.4f", metrics["test_accuracy"])
            logger.info("Test F1-Score: %.4f", metrics["test_f1_score"])

            # Save the confusion matrix.
            # NOTE(review): the label list assumes both classes occur in the
            # test split and sort as non-member, then member — confirm.
            cm = confusion_matrix(y_test, y_pred_test)
            _log_json_artifact(
                {
                    "matrix": cm.tolist(),
                    "labels": ["Non-Member", "Member"],
                },
                Path("interim/confusion_matrix.json"),
                "evaluation",
            )

            # Save the model; the train predictions computed above are
            # reused for the signature instead of predicting a second time.
            signature = mlflow.models.infer_signature(      # type: ignore
                X_train,
                y_pred_train,
            )

            mlflow.sklearn.log_model(                       # type: ignore
                model,
                name="model",
                signature=signature,
                input_example=X_train.iloc[:5],  # sample input rows
            )

            mlflow.set_tags({
                "model_type": model_name,
                "framework": "sklearn",
                "dataset": data_info,
                "best_metric": "f1_score",
            })

            logger.info("Experiment logged successfully (run_id=%s)", run.info.run_id)

        return metrics

    except Exception as e:
        logger.error("Error during experiment: %s", e)
        # Close any run that is still active so it is not left open.
        if mlflow.active_run():
            mlflow.end_run(status="FAILED")
        raise

In [4]:
# Run the notebook-local experiment function with logistic regression on
# the data selected by [2014, 1], holding out 20% of the rows for testing,
# then print the returned metrics dict.
# NOTE(review): the recorded output below reports test accuracy 1.0 — worth
# checking whether a preprocessed feature leaks the `is_member` target.
metrics = train_and_log_experiment(
    data_info=[2014, 1],
    model_name="logistic_regression",
    params={
        "max_iter": 1000,
        "C": 1.0,
        "solver": "lbfgs",
        "random_state": 42,
    },
    test_size=0.2,
    random_state=42
)

print(metrics)

INFO:__main__:Loading data from 2014-1


Loading: [PosixPath('/app/data/raw/2014-citibike-tripdata/1_January/201401-citibike-tripdata_1.csv')]


INFO:__main__:Data split: train=240077, test=60020
INFO:__main__:MLflow run started: 22b3106ba26b46f79534e789ffa92df2
INFO:__main__:Dataset logged to MLflow
INFO:__main__:Training model...
INFO:__main__:Test Accuracy: 1.0000
INFO:__main__:Test F1-Score: 1.0000
INFO:__main__:Experiment logged successfully (run_id=22b3106ba26b46f79534e789ffa92df2)


🏃 View run logistic_regression_20251022_122023 at: http://mlflow:5000/#/experiments/1/runs/22b3106ba26b46f79534e789ffa92df2
🧪 View experiment at: http://mlflow:5000/#/experiments/1
{'test_accuracy': 1.0, 'test_precision': 1.0, 'test_recall': 1.0, 'test_f1_score': 1.0, 'train_accuracy': 0.9999375200456521, 'train_f1_score': 0.9999679880488715}


In [5]:
# Run logistic regression through `run_experiment` imported from the
# project package instead of the notebook-local function.
# NOTE(review): presumably this is the packaged counterpart of
# `train_and_log_experiment` — confirm against src/train/experiment.
from src.train.experiment import run_experiment

metrics = run_experiment(
    data_info=[2014, 1],
    model_name="logistic_regression",
    params={"max_iter": 500, "random_state": 42},
    experiment_name="citibike_membership"
)

print(metrics)

Loading: [PosixPath('/app/data/raw/2014-citibike-tripdata/1_January/201401-citibike-tripdata_1.csv')]




🏃 View run logistic_regression_20251022_122051 at: http://mlflow:5000/#/experiments/1/runs/aff718a4185e46b1a8843d8b3343800f
🧪 View experiment at: http://mlflow:5000/#/experiments/1
{'train_accuracy': 0.9999375200456521, 'train_f1_score': 0.9999679880488715, 'test_accuracy': 1.0, 'test_precision': 1.0, 'test_recall': 1.0, 'test_f1_score': 1.0, 'confusion_matrix': [[1444, 0], [0, 58576]]}


In [6]:
# Decision tree (max_depth=5) on the same [2014, 1] data, logged to the
# "citibike_membership" experiment.
run_experiment(
    data_info=[2014, 1],
    model_name="decision_tree",
    params={"max_depth": 5},
    experiment_name="citibike_membership"
)


Loading: [PosixPath('/app/data/raw/2014-citibike-tripdata/1_January/201401-citibike-tripdata_1.csv')]




🏃 View run decision_tree_20251022_122059 at: http://mlflow:5000/#/experiments/1/runs/441fde457cdc427dad211185c3d84bdb
🧪 View experiment at: http://mlflow:5000/#/experiments/1


{'train_accuracy': 0.9999666773576811,
 'train_f1_score': 0.9999829272873166,
 'test_accuracy': 0.9999666777740753,
 'test_precision': 0.9999658574891598,
 'test_recall': 1.0,
 'test_f1_score': 0.9999829284531472,
 'confusion_matrix': [[1442, 2], [0, 58576]]}

In [7]:
# LightGBM (100 estimators, learning rate 0.1, 31 leaves) on the same
# [2014, 1] data, logged to the "citibike_membership" experiment.
run_experiment(
    data_info=[2014, 1],
    model_name="lgbm",
    params={"n_estimators": 100, "learning_rate": 0.1, "num_leaves": 31},
    experiment_name="citibike_membership"
)

Loading: [PosixPath('/app/data/raw/2014-citibike-tripdata/1_January/201401-citibike-tripdata_1.csv')]
[LightGBM] [Info] Number of positive: 234295, number of negative: 5782
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014506 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 631
[LightGBM] [Info] Number of data points in the train set: 240077, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.975916 -> initscore=3.701831
[LightGBM] [Info] Start training from score 3.701831




🏃 View run lgbm_20251022_122112 at: http://mlflow:5000/#/experiments/1/runs/89f6bd9003b54f24898ac6e74df146e2
🧪 View experiment at: http://mlflow:5000/#/experiments/1


{'train_accuracy': 1.0,
 'train_f1_score': 1.0,
 'test_accuracy': 0.9999666777740753,
 'test_precision': 0.9999658574891598,
 'test_recall': 1.0,
 'test_f1_score': 0.9999829284531472,
 'confusion_matrix': [[1442, 2], [0, 58576]]}

In [8]:
# XGBoost (100 estimators, learning rate 0.1, max depth 5) on the same
# [2014, 1] data, logged to the "citibike_membership" experiment.
run_experiment(
    data_info=[2014, 1],
    model_name="xgboost",
    params={"n_estimators": 100, "learning_rate": 0.1, "max_depth": 5},
    experiment_name="citibike_membership"
)

Loading: [PosixPath('/app/data/raw/2014-citibike-tripdata/1_January/201401-citibike-tripdata_1.csv')]


  self.get_booster().save_model(fname)
  self.get_booster().load_model(fname)


🏃 View run xgboost_20251022_122124 at: http://mlflow:5000/#/experiments/1/runs/a524d0163c8c4f72855e5dcb53ed99e9
🧪 View experiment at: http://mlflow:5000/#/experiments/1


{'train_accuracy': 0.9999791733485507,
 'train_f1_score': 0.9999893295773445,
 'test_accuracy': 0.9999833388870376,
 'test_precision': 0.9999829284531472,
 'test_recall': 1.0,
 'test_f1_score': 0.9999914641537135,
 'confusion_matrix': [[1443, 1], [0, 58576]]}