### ローカルPCで学習するコード

In [None]:
import json
import random
import os
import pickle
import sys
import warnings
import yaml
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import wandb
from scipy.stats import rankdata
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from wandb.integration.lightgbm import log_summary, wandb_callback

#### コンペに関する情報取得
- path, parameterなどを格納

In [2]:
# Load the experiment settings (paths, ids, seed, fold count) from the YAML
# file next to the notebook. The encoding is given explicitly so the result
# does not depend on the platform default, matching the other config readers
# in this notebook.
with open("./config.yaml", "r", encoding="utf-8") as file:
    config_info = yaml.safe_load(file)

In [3]:
class CFG:
    """Experiment configuration: paths, identifiers and LightGBM settings.

    Values read from ``config_info`` come from ``config.yaml``; the rest are
    constants fixed for this experiment. Every non-callable attribute is
    logged to W&B as the run config.
    """

    # --- Kaggle paths ---
    kaggle_api_path = config_info["kaggle_api_path"]            # credentials JSON
    kaggle_datasets_path = config_info["kaggle_datasets_path"]  # dataset upload folder

    # --- W&B paths ---
    wandb_api_path = config_info["wandb_api_path"]          # YAML holding the API key
    wandb_logging_path = config_info["wandb_logging_path"]  # exported as WANDB_DIR

    # --- Experiment identifiers ---
    expUser = config_info["expUser"]
    expid = config_info["expID"]
    # NOTE(review): the name is spelled "enviroments"; kept unchanged because
    # it is part of the logged run config.
    enviroments = 'local'
    competition_name = config_info["competition_name"]

    # --- Input data ---
    dataPath = config_info["dataPath"]
    # File names are not read by the visible code; presumably they live under
    # dataPath -- TODO confirm.
    train = 'train.csv'
    train_demographics = 'train_demographics.csv'
    test = 'test.csv'
    test_demographics = 'test_demographics.csv'
    target = ""

    # --- Model / training parameters ---
    seed = config_info["seed"]          # seed passed to seed_everything
    n_splits = config_info["n_splits"]
    test_size = 0.2                     # hold-out fraction for train_test_split
    random_state = 42                   # random_state for train_test_split
    learning_rate = 0.1
    num_leaves = 31
    n_estimators = 10000                # used as num_boost_round
    feature_fraction = 0.9
    stopping_rounds = 50                # early-stopping patience
    log_evaluation = 100                # evaluation log period
    objective = 'regression'
    # A plain string: the former trailing comma turned this into the
    # one-element tuple ('rmse',) by accident.
    metric = 'rmse'
    features = ["MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup", "Latitude", "Longitude"]
    

### API認証

In [4]:
## kaggle-api
def set_kaggle_creds(api_path):
    """Authenticate against the Kaggle API using a credentials JSON file.

    The file's "username" and "key" entries are exported as the
    ``KAGGLE_USERNAME`` / ``KAGGLE_KEY`` environment variables before the
    client is created.

    Args:
        api_path: Path to the JSON file holding "username" and "key".

    Returns:
        An authenticated ``KaggleApi`` instance.

    Raises:
        KeyError: If "username" or "key" is missing from the file.
    """
    with open(api_path, 'r', encoding='utf-8') as f:
        kaggle_creds = json.load(f)

    os.environ["KAGGLE_USERNAME"] = kaggle_creds["username"]
    os.environ["KAGGLE_KEY"] = kaggle_creds["key"]

    # Imported only after the credentials are exported.
    # NOTE(review): presumably the kaggle package reads the credentials at
    # import time -- confirm before moving this import to module level.
    from kaggle.api.kaggle_api_extended import KaggleApi

    api = KaggleApi()
    api.authenticate()
    return api

# Authenticate once and keep the client; it is reused for the dataset upload
# later in the notebook.
kaggle_api = set_kaggle_creds(CFG.kaggle_api_path)

In [5]:
## wandb-api
def login_wandb(api_path):
    """Log in to Weights & Biases with an API key stored in a YAML file.

    The key is read from ``wandb -> WANDB_API_KEY`` in the file and exported
    as the ``WANDB_API_KEY`` environment variable before ``wandb.login()``.

    Args:
        api_path: Location of the YAML file, as ``str`` or ``pathlib.Path``.

    Raises:
        KeyError: If the ``wandb`` / ``WANDB_API_KEY`` entries are missing.
    """
    with Path(api_path).open('r', encoding='utf-8') as f:
        # safe_load only builds plain YAML types, never arbitrary objects.
        cfg = yaml.safe_load(f)

    os.environ['WANDB_API_KEY'] = cfg['wandb']['WANDB_API_KEY']
    wandb.login()

# Resolve the key file location, point W&B at the configured logging
# directory, then log in.
wandb_api_path = Path(CFG.wandb_api_path)
os.environ["WANDB_DIR"] = CFG.wandb_logging_path
login_wandb(wandb_api_path)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mryokuki[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


### kaggleデータセット用 初回だけの処理

In [6]:
# One-time creation of the Kaggle dataset. Flip FLAG to True for the very
# first upload only; later uploads go through dataset_create_version.
FLAG = False
if FLAG:
    path = "../results"
    # Use the authenticated client bound to `kaggle_api`; the bare name `api`
    # is not defined at notebook level.
    kaggle_api.dataset_create_new(
        folder=str(path),       # required: folder to upload
        public=False,           # True to publish, False to keep private
        quiet=False,            # True suppresses the progress output
        convert_to_csv=False,   # no CSV conversion needed
        dir_mode="zip",         # same as the CLI's --dir-mode zip
    )

### CFG設定

In [None]:
# Fix every random number generator used in the notebook to one seed.
def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to ``random``, ``PYTHONHASHSEED``,
            NumPy and PyTorch (CPU and CUDA).
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    # Was `torch_manual_seed(seed)`, an undefined name.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Was `torch.backends,cudnn.deterministic = True`: the comma made it a
    # tuple-unpacking assignment instead of setting the flag.
    torch.backends.cudnn.deterministic = True
    
# Apply the configured seed, then set notebook-wide plotting and warning
# behaviour.
seed_everything(CFG.seed)

plt.rcParams.update({"font.size": 13})
warnings.simplefilter('ignore')

In [8]:
# Root directory of the input data, taken from the config.
INPUT = Path(CFG.dataPath)
print('My Enviroment: local PC.')

My Enviroment: local PC.


In [None]:
# Instantiate the settings object and record the library versions in use.
config = CFG()
print(f"wandb version: {wandb.__version__}")
print(f"lightgbm version: {lgb.__version__}")

# Helper that turns an object's attributes into a plain dictionary.
def class_to_dict(obj):
    """Return the non-dunder, non-callable attributes of ``obj`` as a dict.

    Attribute names come from ``dir(obj)``; names starting with a double
    underscore and attributes whose value is callable are left out.
    """
    attributes = {}
    for name in dir(obj):
        if name.startswith('__'):
            continue
        value = getattr(obj, name)
        if callable(value):
            continue
        attributes[name] = value
    return attributes

# Start the W&B run. The project name combines the competition name and the
# user id, the run name is the experiment id, and the whole settings object
# is logged as the run config. The earlier bare `class_to_dict(config)`
# statement was dropped because its result was discarded.
wandb.init(
    project=f'{CFG.competition_name}_exp{CFG.expUser}',
    config=class_to_dict(config),
    name=config.expid,
)

# Fetch the California housing data and wrap features and target in frames.
data = fetch_california_housing()
df = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.DataFrame(data=data.target, columns=data.target_names)

# Hold out a fraction of the rows, using only the configured feature columns.
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, config.features],
    y,
    random_state=config.random_state,
    test_size=config.test_size,
)

# Shapes of the four splits (not displayed: this is not the cell's last line).
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Wrap both splits as LightGBM datasets; the hold-out set references the
# training set.
train_data = lgb.Dataset(data=X_train, label=y_train)
test_data = lgb.Dataset(data=X_test, label=y_test, reference=train_data)

wandb version: 0.19.9
lightgbm version: 4.6.0


In [10]:
%%wandb

# LightGBM hyperparameters, all taken from the experiment settings.
params = dict(
    learning_rate=config.learning_rate,
    num_leaves=config.num_leaves,
    objective=config.objective,
    metric=config.metric,
    feature_fraction=config.feature_fraction,
)

# Callbacks: early stopping, periodic evaluation logging, W&B metric logging.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=config.stopping_rounds, verbose=True),
    lgb.log_evaluation(config.log_evaluation),
    wandb_callback(),
]

# Train, evaluating on both the training and the hold-out dataset.
model = lgb.train(
    params,
    train_data,
    num_boost_round=config.n_estimators,
    valid_sets=[train_data, test_data],
    callbacks=training_callbacks,
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000213 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 8
[LightGBM] [Info] Start training from score 2.071947
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 0.39261	valid_1's rmse: 0.461229
[200]	training's rmse: 0.339785	valid_1's rmse: 0.447667
[300]	training's rmse: 0.305127	valid_1's rmse: 0.442757
[400]	training's rmse: 0.278112	valid_1's rmse: 0.439342
[500]	training's rmse: 0.255133	valid_1's rmse: 0.438417
[600]	training's rmse: 0.235924	valid_1's rmse: 0.437065
Early stopping, best iteration is:
[622]	training's rmse: 0.231847	valid_1's rmse: 0.436778


In [11]:
# Predict on the hold-out split with the best iteration found by early stopping.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# RMSE as the square root of the MSE. The `squared=False` argument of
# mean_squared_error is deprecated and removed in newer scikit-learn
# releases, so the root is taken explicitly.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"RMSE: {rmse}")

# Record the training summary (and the model checkpoint) in W&B.
log_summary(model, save_model_checkpoint=True)

# Close the W&B run.
wandb.finish()

RMSE: 0.4367780679358385


0,1
iteration,▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇██
training_rmse,█▆▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
valid_1_rmse,█▆▆▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_iteration,622
iteration,671


### 学習済みモデルの保存

In [12]:
# Create the per-experiment folder inside the Kaggle dataset directory and
# pickle the trained model into it.
exp_dir = Path(f"{CFG.kaggle_datasets_path}/exp{CFG.expid}")
exp_dir.mkdir(parents=True, exist_ok=True)

model_path = exp_dir / "model.pkl"
with model_path.open("wb") as f:
    pickle.dump(model, f)
print(f"✅ {model_path} を作成しました")

✅ ../../results/exp001/model.pkl を作成しました


### 学習モデルのアップロード

In [13]:
# Write a small results table next to the model so it is uploaded with it.
# The values are hard-coded literals, not metrics from the run above.
results_columns = {
    "epoch": [1, 2, 3],
    "accuracy": [0.76, 0.81, 0.85],
    "loss": [0.5, 0.35, 0.28],
}
df = pd.DataFrame(results_columns)

results_path = exp_dir / "results.csv"
df.to_csv(results_path, index=False)
print(f"✅ {results_path} を作成しました")

✅ ../../results/exp001/results.csv を作成しました


In [14]:
# Publish a new version of the Kaggle dataset from the results folder; the
# version note records which experiment was added.
new_url = kaggle_api.dataset_create_version(
    str(CFG.kaggle_datasets_path),
    f"Add {CFG.expid}",
    quiet=False,
    dir_mode="zip",
)

Starting upload for file exp000-ready.zip


100%|██████████| 602k/602k [00:01<00:00, 484kB/s]


Upload successful: exp000-ready.zip (602KB)
Starting upload for file exp001.zip


100%|██████████| 602k/602k [00:01<00:00, 414kB/s]


Upload successful: exp001.zip (602KB)


In [15]:
# Display the response returned by the dataset version upload.
new_url

{"ref": "ryokuki/cdb-datasets-rk", "url": "https://www.kaggle.com/ryokuki/cdb-datasets-rk", "status": "Ok", "error": "", "invalidTags": []}

done