In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb 
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# --- Column roles and run configuration -------------------------------------
TARGETS = ["Y1", "Y2"]   # columns to predict
ID_COL = "id"            # row identifier column
TIME_COL = "time"        # column the rows are ordered by
FEATURE_COLS = None      # None -> derive from the training columns below
SEED = 42

# --- Raw inputs ---------------------------------------------------------------
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")

# Supplementary files; only their 'O' and 'P' columns are used.
train_new = pd.read_csv("./data/train_new.csv")
test_new = pd.read_csv("./data/test_new.csv")

# Attach 'O' and 'P' side by side with the base tables (rows paired by index).
train_extra = train_new[["O", "P"]].reset_index(drop=True)
train_data = pd.concat([train_data, train_extra], axis="columns")
test_extra = test_new[["O", "P"]].reset_index(drop=True)
test_data = pd.concat([test_data, test_extra], axis="columns")

# Default feature set: every training column that is not time, id, or a target.
if FEATURE_COLS is None:
    exclude = set(TARGETS) | {TIME_COL, ID_COL}
    FEATURE_COLS = [col for col in train_data.columns if col not in exclude]

# Order both tables by time and renumber their rows from zero.
train = train_data.sort_values(by=TIME_COL)
train = train.reset_index(drop=True)
test = test_data.sort_values(by=TIME_COL)
test = test.reset_index(drop=True)

def make_lag_features(df, cols, lags=(1, 2, 3, 5, 10)):
    """Append shifted (lagged) copies of the given columns to ``df``.

    For every column ``c`` in ``cols`` and every ``lag`` in ``lags`` a new
    column named ``"{c}_lag{lag}"`` holding ``df[c].shift(lag)`` is added.
    Shifting is positional, so the caller must order ``df`` beforehand.

    Args:
        df: Input frame; it is not modified.
        cols: Names of the columns to lag.
        lags: Row offsets passed to ``Series.shift``.

    Returns:
        A new DataFrame with the original columns followed by the lag columns
        (just a copy of ``df`` when ``cols`` or ``lags`` is empty).
    """
    lag_dict = {
        f"{c}_lag{lag}": df[c].shift(lag)
        for c in cols
        for lag in lags
    }
    if not lag_dict:
        return df.copy()
    # Collect every lag column first and concatenate once, after both loops
    # have finished, so that all columns and all lags are included.
    lag_df = pd.DataFrame(lag_dict)
    return pd.concat([df, lag_df], axis=1)
        
def make_rolling_features(df, cols, windows=(3, 5, 10)):
    """Append rolling-window statistics of past values to ``df``.

    Each column in ``cols`` is first shifted down by one row, so a row's
    window covers only the rows before it. For every window size ``w`` four
    columns are added: ``"{c}_roll_mean_{w}"``, ``"{c}_roll_std_{w}"``,
    ``"{c}_roll_min_{w}"`` and ``"{c}_roll_max_{w}"``.

    Args:
        df: Input frame, already in the desired row order; it is not modified.
        cols: Names of the columns to summarise.
        windows: Rolling window sizes, in rows.

    Returns:
        A new DataFrame with the original columns followed by the rolling
        statistics (just a copy of ``df`` when nothing is requested).
    """
    roll_dict = {}
    for c in cols:
        # Shift by one so the current row's own value is never in its window.
        s = df[c].shift(1)
        for w in windows:
            rolling = s.rolling(window=w, min_periods=1)
            # The window size must be part of the name; otherwise every window
            # writes to the same key and only the last one survives.
            roll_dict[f"{c}_roll_mean_{w}"] = rolling.mean()
            # std is NaN when the window holds fewer than two values; use 0.
            roll_dict[f"{c}_roll_std_{w}"] = rolling.std().fillna(0)
            roll_dict[f"{c}_roll_min_{w}"] = rolling.min()
            roll_dict[f"{c}_roll_max_{w}"] = rolling.max()
    if not roll_dict:
        return df.copy()
    roll_df = pd.DataFrame(roll_dict)
    return pd.concat([df, roll_df], axis=1)

def make_diff_features(df, cols):
    """Append one-row change features for each column in ``cols``.

    Two columns are added per input column ``c``:

    * ``"{c}_diff1"``: the value minus the previous row's value.
    * ``"{c}_pct_chnge1"``: the fractional change from the previous row,
      computed without forward-filling gaps, with +/-inf replaced by 0.

    Returns a new DataFrame with the original columns followed by the new ones.
    """
    changes = {}
    for name in cols:
        series = df[name]
        previous = series.shift(1)
        relative = series.pct_change(fill_method=None)
        changes[f"{name}_diff1"] = series - previous
        # Division by a zero previous value yields +/-inf; map those to 0.
        changes[f"{name}_pct_chnge1"] = relative.replace([np.inf, -np.inf], 0)
    return pd.concat([df, pd.DataFrame(changes)], axis=1)

# --- Feature engineering on the time-ordered union of train and test ---------
# Tag each row with its origin so the two sets can be separated again later.
train["is_train"] = 1
test["is_train"] = 0
test_id = test[ID_COL]

combined = pd.concat([train.drop(columns=TARGETS), test], axis=0, ignore_index=True)
# Use a stable sort: targets and test ids are re-attached below purely by
# position, so train rows (and test rows) must keep their relative order when
# timestamps tie. The default quicksort does not guarantee that.
combined = combined.sort_values(TIME_COL, kind="mergesort").reset_index(drop=True)

combined = make_lag_features(combined, FEATURE_COLS)
combined = make_rolling_features(combined, FEATURE_COLS)
combined = make_diff_features(combined, FEATURE_COLS)

train_fe = combined[combined["is_train"] == 1].reset_index(drop=True)
test_fe = combined[combined["is_train"] == 0].reset_index(drop=True)

# Re-attach the targets by position (train_fe rows are in the same order as train).
for target in TARGETS:
    train_fe[target] = train[target].values

train_fe = train_fe.drop(columns=["is_train"])
test_fe = test_fe.drop(columns=["is_train"])

# Missing-value indicators, each computed from the frame's OWN 'O'/'P' columns.
# They are added before the feature list is built so the model actually sees them.
for frame in (train_fe, test_fe):
    frame["O_missing"] = frame["O"].isna().astype(int)
    frame["P_missing"] = frame["P"].isna().astype(int)

drop_cols = [TIME_COL, ID_COL] if ID_COL in train_fe.columns else [TIME_COL]
features = [c for c in train_fe.columns if c not in drop_cols + TARGETS]

# Impute every feature except raw 'O' and 'P' with the TRAIN median; 'O' and 'P'
# keep their NaNs (presumably so LightGBM's own missing-value handling applies).
fill_features = [f for f in features if f not in ["O", "P"]]
medians = train_fe[fill_features].median()
train_fe[fill_features] = train_fe[fill_features].fillna(medians)
test_fe[fill_features] = test_fe[fill_features].fillna(medians)

# --- Model training: one LightGBM regressor per target -----------------------
final_models = {}
test_preds = {}

# Hyper-parameters are identical for every target, so define them once.
params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "learning_rate": 0.01,
    "num_leaves": 128,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "lambda_l1": 0.5,
    "lambda_l2": 0.5,
    "min_data_in_leaf": 20,
    "seed": SEED,
    "verbose": -1,
}

for target in TARGETS:
    print(f"\nTraining final model for {target}")
    train_set = lgb.Dataset(train_fe[features], label=train_fe[target])

    # Fit on the full training set for a fixed number of rounds (no validation
    # split, no early stopping).
    clf_full = lgb.train(
        params,
        train_set,
        num_boost_round=2000,
    )

    final_models[target] = clf_full

    # In-sample metrics only: these measure fit on the training rows, not
    # generalisation.
    train_preds = clf_full.predict(train_fe[features])
    rmse = np.sqrt(mean_squared_error(train_fe[target], train_preds))
    mae = mean_absolute_error(train_fe[target], train_preds)
    r2 = r2_score(train_fe[target], train_preds)
    print(f"[Train] {target} -> RMSE: {rmse:.6f}, MAE: {mae:.6f}, R2: {r2:.6f}")

    test_preds[target] = clf_full.predict(test_fe[features])

# --- Submission file ----------------------------------------------------------
submission = pd.DataFrame()
submission[ID_COL] = test_id.values
for target in TARGETS:
    submission[target] = test_preds[target]
submission.to_csv("submission_lightgbm.csv", index=False)
print("Submission generated")

        



Training final model for (target)
[Train] Y1 -> RMSE: 0.264832, MAE: 0.191305, R2: 0.925558

Training final model for (target)
[Train] Y2 -> RMSE: 0.197089, MAE: 0.134287, R2: 0.954473
Submission generated
