In [None]:
# そのイベントで最初にヒットしたセンサーの座標のみを特徴量として学習させる、シンプルなモデル

In [2]:
import pandas as pd

batch = pd.read_parquet("batch_1.parquet")
meta = pd.read_parquet("train_meta.parquet")
sensor_geo = pd.read_csv("sensor_geometry.csv")

In [3]:
meta_batch1 = meta[meta["batch_id"] == 1].reset_index(drop=True)

In [4]:
print(meta_batch1.head())

   batch_id  event_id  first_pulse_index  last_pulse_index   azimuth    zenith
0         1        24                  0                60  5.029555  2.087498
1         1        41                 61               111  0.417742  1.549686
2         1        59                112               147  1.160466  2.401942
3         1        67                148               289  5.845952  0.759054
4         1        72                290               351  0.653719  0.939117


In [5]:
print(len(meta_batch1))
print(len(meta))

200000
131953924


In [None]:
first_hits = []

for _, row in meta_batch1.iterrows():
    event_id = row["event_id"]
    start = row["first_pulse_index"]
    end = row["last_pulse_index"]

    event_df = batch.iloc[start:end+1].dropna(subset=["time", "sensor_id"]).reset_index(drop=True)

    if event_df.empty:
        continue

    first_hit_row = event_df.loc[event_df["time"].idxmin()] # 最初に反応したパルスの情報が書かれている行を見つける
    sensor_id = int(first_hit_row["sensor_id"]) # その行のセンサーidを抜き出す

    first_hits.append({
        "event_id": int(event_id),
        "sensor_id": sensor_id
    })

first_hit_df = pd.DataFrame(first_hits)
print(first_hit_df.head()) # そのイベントで最初に反応したセンサーのidがわかるようになった


   event_id  sensor_id
0        24       3918
1        41        458
2        59       4685
3        67       5060
4        72       1022


In [None]:
first_hit_with_xyz = first_hit_df.merge(sensor_geo, on="sensor_id", how="left") # センサーidの座標をマージ

In [8]:
print(first_hit_with_xyz.head())

   event_id  sensor_id       x       y       z
0        24       3918  303.41  335.64  206.58
1        41        458 -211.35 -404.48 -148.16
2        59       4685   31.25  -72.93  138.23
3        67       5060   -9.68  -79.50 -226.50
4        72       1022   79.41 -248.24  464.86


In [None]:
# xyz座標をazimuth_zenithに変換する関数
import numpy as np

def cartesian_to_spherical(x, y, z):
    r = np.sqrt(x**2 + y**2 + z**2)
    azimuth = np.arctan2(y, x)         # [-π, π]
    zenith = np.arccos(z / r)          # [0, π]
    return azimuth, zenith


In [10]:
# first_hit_with_xyz には x, y, z 列がある前提
azimuths, zeniths = cartesian_to_spherical(
    first_hit_with_xyz["x"].values,
    first_hit_with_xyz["y"].values,
    first_hit_with_xyz["z"].values,
)

first_hit_with_xyz["sensor_azimuth"] = azimuths
first_hit_with_xyz["sensor_zenith"] = zeniths


In [11]:
print(first_hit_with_xyz.head())

   event_id  sensor_id       x       y       z  sensor_azimuth  sensor_zenith
0        24       3918  303.41  335.64  206.58        0.835790       1.142484
1        41        458 -211.35 -404.48 -148.16       -2.052299       1.884711
2        59       4685   31.25  -72.93  138.23       -1.165971       0.521078
3        67       5060   -9.68  -79.50 -226.50       -1.691961       2.801727
4        72       1022   79.41 -248.24  464.86       -1.261191       0.510997


In [12]:
# ラベルだけ抜き出しておく（速度・安全性のため）
labels = meta[["event_id", "azimuth", "zenith"]]

# マージ（event_idでくっつける）
data_for_training = first_hit_with_xyz.merge(labels, on="event_id", how="left")


In [13]:
print(data_for_training.head())

   event_id  sensor_id       x       y       z  sensor_azimuth  sensor_zenith  \
0        24       3918  303.41  335.64  206.58        0.835790       1.142484   
1        41        458 -211.35 -404.48 -148.16       -2.052299       1.884711   
2        59       4685   31.25  -72.93  138.23       -1.165971       0.521078   
3        67       5060   -9.68  -79.50 -226.50       -1.691961       2.801727   
4        72       1022   79.41 -248.24  464.86       -1.261191       0.510997   

    azimuth    zenith  
0  5.029555  2.087498  
1  0.417742  1.549686  
2  1.160466  2.401942  
3  5.845952  0.759054  
4  0.653719  0.939117  


In [None]:
data_for_training = data_for_training.drop(columns=["x", "y", "z"]) # xyz座標は必要ないので落とす


In [15]:
print(data_for_training.head())

   event_id  sensor_id  sensor_azimuth  sensor_zenith   azimuth    zenith
0        24       3918        0.835790       1.142484  5.029555  2.087498
1        41        458       -2.052299       1.884711  0.417742  1.549686
2        59       4685       -1.165971       0.521078  1.160466  2.401942
3        67       5060       -1.691961       2.801727  5.845952  0.759054
4        72       1022       -1.261191       0.510997  0.653719  0.939117


In [16]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [17]:
# 特徴量
X = data_for_training[["sensor_azimuth", "sensor_zenith"]]

# ターゲット
y_azimuth = data_for_training["azimuth"]
y_zenith = data_for_training["zenith"]

# 学習用・検証用に分割
X_train, X_val, y_az_train, y_az_val = train_test_split(X, y_azimuth, test_size=0.2, random_state=42)
_, _, y_ze_train, y_ze_val = train_test_split(X, y_zenith, test_size=0.2, random_state=42)


In [18]:
# azimuth モデル
model_az = lgb.LGBMRegressor()
model_az.fit(X_train, y_az_train)

# zenith モデル
model_ze = lgb.LGBMRegressor()
model_ze.fit(X_train, y_ze_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000548 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 401
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 2
[LightGBM] [Info] Start training from score 3.140878
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000308 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 401
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 2
[LightGBM] [Info] Start training from score 1.534628


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [19]:
# azimuth モデル
model_az = lgb.LGBMRegressor()
model_az.fit(X_train, y_az_train)

# zenith モデル
model_ze = lgb.LGBMRegressor()
model_ze.fit(X_train, y_ze_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000373 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 401
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 2
[LightGBM] [Info] Start training from score 3.140878
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000339 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 401
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 2
[LightGBM] [Info] Start training from score 1.534628


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [21]:
pred_az = model_az.predict(X_val)
pred_ze = model_ze.predict(X_val)

# 単純な MSE（後で角度誤差に変えてもOK）
rmse_az = mean_squared_error(y_az_val, pred_az)
rmse_ze = mean_squared_error(y_ze_val, pred_ze)

print(f"Azimuth RMSE: {rmse_az:.4f}")
print(f"Zenith RMSE: {rmse_ze:.4f}")


Azimuth RMSE: 3.2961
Zenith RMSE: 0.4764
