In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Show up to 200 rows when a frame/series is displayed, so long listings
# are not truncated in cell output.
pd.options.display.max_rows = 200

In [None]:
# Read the training set; the path is relative to the notebook's working directory.
df_train = pd.read_parquet(path="../data/train.parquet")

In [None]:
# List every column name, then show the first rows transposed
# (one column per row) so wide frames stay readable.
print(df_train.columns.to_list())
df_train.head().transpose()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Baseline fare model: train a LightGBM regressor on the raw coordinate and
# passenger-count columns of a 20% subsample and report the hold-out RMSE.

# ---- Step 1: Subsample 20% of the rows (fixed seed so the draw is repeatable)
df_train_small = df_train.sample(frac=0.2, random_state=42)

# ---- Step 2: Separate the features from the target
features = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count'
]
target = 'fare_amount'

X = df_train_small[features]
y = df_train_small[target]

# ---- Step 3: Split into training and validation sets (80:20)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---- Step 4: Build the LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train)
# Tie the validation set to the training set so both share the same
# feature discretisation.
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# ---- Step 5: Train the model
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'random_state': 42
}

# Early stopping and periodic evaluation logging are supplied as callbacks:
# the `early_stopping_rounds` / `verbose_eval` keyword arguments of
# lgb.train() were removed in LightGBM 4.0 and raise TypeError there.
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[val_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100),
    ],
)

# ---- Step 6: Validation score (RMSE)
# Score with the iteration selected by early stopping.
y_pred = model.predict(X_val, num_iteration=model.best_iteration)
# Take the square root of the MSE explicitly: the `squared=False` argument of
# mean_squared_error is deprecated and no longer accepted by recent scikit-learn.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse:.4f}")
