# モデル調査

In [1]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import pandas as pd
import json
from datetime import datetime
from pathlib import Path
import logging

# Logging setup: create the module-level logger, then configure the root
# logger so records at INFO level and above are emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [2]:
from enum import Enum, auto
import sys
# Append the directory two levels up to the import search path so that the
# `src` package imported below can be resolved. This must run before those
# imports. (Presumably '../..' is the repository root relative to the
# notebook's working directory -- TODO confirm.)
sys.path.append('../..')

# Project helpers: data loading and preprocessing, both used by
# train_and_log_experiment.
from src.utils.io import load_month_data
from src.utils.preprocess import preprocess_pipeline


In [3]:
def train_and_log_experiment(
    data_info: list[int],
    model_name: str = "logistic_regression",
    params: dict | None = None,
    test_size: float = 0.2,
    random_state: int = 42,
    experiment_name: str = "citibike_membership",
) -> dict:
    """
    学習とMLflowへのログを行う汎用関数

    Parameters
    ----------
    data_info :list[int]
        学習データ（CSV）取得に必要な情報 (ex）[2014, 1])
    model_name : str
        モデル名（MLflowのrun_nameなどに使用）
    params : dict
        モデルのハイパーパラメータ
    test_size : float
        テストデータ比率
    random_state : int
        乱数シード
    experiment_name : str
        MLflow実験名

    Returns
    -------
    dict : 実験結果のメトリクス
    """
    try:
        logger.info(f"Loading data from {data_info[0]}-{data_info[1]}")

        # データ読み込み
        df_org = load_month_data(*data_info)
        df = preprocess_pipeline(df_org)
        
        X = df.drop("is_member", axis=1)
        y = df["is_member"]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        
        logger.info(f"Data split: train={len(X_train)}, test={len(X_test)}")

        if params is None:
            params = {"max_iter": 500, "random_state": random_state}

        if model_name == "logistic_regression":
            model = LogisticRegression(**params)
        else:
            raise ValueError(f"Unsupported model: {model_name}")
    
        # MLflow設定
        mlflow.set_experiment(experiment_name)
    
        with mlflow.start_run(
            run_name=f"{model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        ) as run:
            logger.info(f"MLflow run started: {run.info.run_id}")
            
            dataset_params = {
                "data_info": data_info,
                "test_size": test_size,
                "random_state": random_state,
                "train_samples": len(X_train),
                "test_samples": len(X_test),
                "n_features": X_train.shape[1],
                "feature_names": X_train.columns.tolist(),
                "class_distribution_train": y_train.value_counts().to_dict(),
                "class_distribution_test": y_test.value_counts().to_dict(),
            }
            
            # パラメータをログ
            mlflow.log_params({
                k: v for k, v in dataset_params.items()
                if k not in ["feature_names", "class_distribution_train", "class_distribution_test"]
            })
            
            # 特徴量名をファイルとして保存
            feature_path = Path("interim/features.json")
            feature_path.parent.mkdir(exist_ok=True, parents=True)
            with open(feature_path, "w") as f:
                json.dump(dataset_params["feature_names"], f, indent=2)           
            mlflow.log_artifact(str(feature_path), "dataset_info")
            
            # クラス分布を記録
            class_dist_path = Path("data/interim/class_distribution.json")
            with open(class_dist_path, "w") as f:
                json.dump({
                    "train": dataset_params["class_distribution_train"],
                    "test": dataset_params["class_distribution_test"]
                }, f, indent=2)
            mlflow.log_artifact(str(class_dist_path), "dataset_info")        
            
            # データセットの追跡（スナップショットを記録）
            try:
                mlflow.log_input(
                    mlflow.data.from_pandas(            # type: ignore
                        df,
                        name=f"citibike_data_{datetime.now().strftime('%Y%m%d')}",
                    ),
                    context="training",
                )
                logger.info("Dataset logged to MLflow")
            except Exception as e:
                logger.warning(f"Failed to log dataset: {e}")
                
            # パラメーターをログ
            mlflow.log_params(params)

            logger.info("Training model...")
            model.fit(X_train, y_train)
          
            y_pred_train = model.predict(X_train)
            y_pred_test = model.predict(X_test)          

            # 確率予測がある場合は記録
            if hasattr(model, "predict_proba"):
                y_proba_test = model.predict_proba(X_test)         
            
            metrics = {
                # テストセットのメトリクス
                "test_accuracy": accuracy_score(y_test, y_pred_test),
                "test_precision": precision_score(y_test, y_pred_test, zero_division=0),
                "test_recall": recall_score(y_test, y_pred_test, zero_division=0),
                "test_f1_score": f1_score(y_test, y_pred_test, zero_division=0),
                # 訓練セットのメトリクス（過学習チェック用）
                "train_accuracy": accuracy_score(y_train, y_pred_train),
                "train_f1_score": f1_score(y_train, y_pred_train, zero_division=0),
            }
            
            # メトリクスを一括ログ
            mlflow.log_metrics(metrics)        
            
            logger.info(f"Test Accuracy: {metrics['test_accuracy']:.4f}")
            logger.info(f"Test F1-Score: {metrics['test_f1_score']:.4f}")
    
            # 混同行列を保存
            cm = confusion_matrix(y_test, y_pred_test)
            cm_path = Path("data/interim/confusion_matrix.json")
            cm_path.parent.mkdir(exist_ok=True, parents=True)
            with open(cm_path, "w") as f:
                json.dump({
                    "matrix": cm.tolist(),
                    "labels": ["Non-Member", "Member"]
                }, f, indent=2)
            mlflow.log_artifact(str(cm_path), "evaluation")
            
            # モデルの保存
            signature = mlflow.models.infer_signature(      # type: ignore
                X_train, 
                model.predict(X_train)
            )

            mlflow.sklearn.log_model(                       # type: ignore
                model,
                "model",
                signature=signature,
                input_example=X_train.iloc[:5],  # サンプル入力例
            )
            
            mlflow.set_tags({
                "model_type": model_name,
                "framework": "sklearn",
                "dataset": data_info,
                "best_metric": "f1_score",
            })
            
            logger.info(f"Experiment logged successfully (run_id={run.info.run_id})")
            
        return metrics

    except Exception as e:
        logger.error(f"Error during experiment: {str(e)}")
        if mlflow.active_run():
            mlflow.end_run(status="FAILED")
        raise

In [4]:
# Run one logistic-regression experiment on the [2014, 1] data with an
# explicit lbfgs configuration, then show the resulting metrics.
metrics = train_and_log_experiment(
    [2014, 1],
    "logistic_regression",
    dict(max_iter=1000, C=1.0, solver="lbfgs", random_state=42),
    test_size=0.2,
    random_state=42,
)

print(metrics)

INFO:__main__:Loading data from 2014-1


Loading: [PosixPath('/app/data/raw/2014-citibike-tripdata/1_January/201401-citibike-tripdata_1.csv')]


INFO:__main__:Data split: train=240077, test=60020
INFO:__main__:MLflow run started: 8aa5540182694226bf33ec993eee90e7
ERROR:__main__:Error during experiment: [Errno 13] Permission denied: '/mlflow'


🏃 View run logistic_regression_20251022_000522 at: http://mlflow:5000/#/experiments/1/runs/8aa5540182694226bf33ec993eee90e7
🧪 View experiment at: http://mlflow:5000/#/experiments/1


PermissionError: [Errno 13] Permission denied: '/mlflow'