In [None]:
import pandas as pd
import numpy as np


train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

In [2]:
features = list(sorted(set(test_data.columns.tolist()) - {"time", "id"}))

In [None]:
# Basic linreg
from sklearn.linear_model import LinearRegression

preds = test_data[['id']].copy()

m1 = LinearRegression(fit_intercept=True)
m1.fit(train_data[features], train_data["Y1"])
preds["Y1"] = m1.predict(test_data[features])

m2 = LinearRegression(fit_intercept=True)
m2.fit(train_data[features], train_data["Y2"])
preds["Y2"] = m2.predict(test_data[features])

preds.to_csv('preds.csv', index=False)

In [None]:
from tqdm import tqdm
X = train_data[features]
feats = {}
# create moving averages
mas = [10, 100, 1000, 5000]
for ma in mas:
    print("MA", ma)
    for col in X.columns:
        roll = X[col].rolling(window=ma)
        feats[f"ma_{ma}_{col}_mean"] = roll.mean()
        feats[f"ma_{ma}_{col}_std"] = roll.mean()
        feats[f"ma_{ma}_{col}_10pct"] = roll.quantile(0.1)
        feats[f"ma_{ma}_{col}_90pct"] = roll.quantile(0.9)

for ma in [100, 2500]:
    for col in tqdm(X.columns):
        roll = X[col].rolling(window=ma)
        feats[f"lr_coeff_{ma}_{col}"] = roll.agg(lambda x: LinearRegression().fit(np.arange(len(x)).reshape(-1,1), x).coef_)

for col in X.columns:
    feats[f"diff_{col}"] = X[col].diff()
    feats[f"lag_1_{col}"] = X[col].shift(1)

full_feats = pd.DataFrame(feats)

MA 10
MA 100
MA 1000
MA 5000


 71%|███████▏  | 10/14 [04:05<01:37, 24.39s/it]

In [22]:
THRESHOLD = 0.3
for col in full_feats.columns:
    cy1 = full_feats[col].corr(train_data["Y1"])
    cy2 = full_feats[col].corr(train_data["Y2"])
    if abs(cy1) > THRESHOLD or abs(cy2) > THRESHOLD:
        print(col,cy1,cy2)

ma_10_A_90pct 0.008436001186658073 0.3227499243154101
ma_10_B_mean -0.007978240289874311 0.4222829576740693
ma_10_B_std -0.007978240289874311 0.4222829576740693
ma_10_B_10pct 0.0014981250024895783 0.3506606620399282
ma_10_B_90pct -0.009259659577956584 0.41545614503103473
ma_10_C_10pct 0.10470323048198026 -0.43194542270640834
ma_10_C_90pct 0.10654234248568482 0.33589463274654413
ma_10_D_mean 0.0028535063236529543 0.4186756476017388
ma_10_D_std 0.0028535063236529543 0.4186756476017388
ma_10_D_10pct 0.004322950156292821 0.3412753973075849
ma_10_D_90pct 0.002074513802945597 0.4247711746341437
ma_10_E_10pct 0.10350773885182926 -0.3931790047841644
ma_10_E_90pct 0.10739843857731171 0.304740926158251
ma_10_F_mean -0.007276087076783393 0.3312684198739079
ma_10_F_std -0.007276087076783393 0.3312684198739079
ma_10_F_90pct -0.009057679735393735 0.34688820665061015
ma_10_G_10pct 0.11125190807721631 -0.42527242473624766
ma_10_G_90pct 0.12826816257556403 0.3678078277606873
ma_10_H_10pct 0.10338831344

In [23]:
import optuna
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_squared_error

TARGET = "Y1"
X = pd.concat((train_data[features], full_feats), axis=1)
print(X.shape)
y = train_data[TARGET]

CUTOFF = int(len(X) * 0.8)

x_train, y_train = X.iloc[:CUTOFF], y.iloc[:CUTOFF]
x_test, y_test = X.iloc[CUTOFF:], y.iloc[CUTOFF:]


def lgbm_objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 300),
        "num_leaves": trial.suggest_int("num_leaves", 3, 100),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_child_samples": trial.suggest_int("min_child_sample", 5, 100),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10, log=True),
        "random_state": 42,
        "n_jobs": -1,
        "verbosity": -1
    }
    model = LGBMRegressor(**params)
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    return r2_score(y_test, pred)

study = optuna.create_study(direction="maximize")
study.optimize(lgbm_objective, n_trials=50)

print(study.best_trial.params)
print(study.best_value)

[I 2025-09-20 12:15:50,759] A new study created in memory with name: no-name-7152c793-04b4-45f5-8bf5-35b54ce4d45d


(80000, 266)


[I 2025-09-20 12:15:54,402] Trial 0 finished with value: 0.710998818952956 and parameters: {'n_estimators': 159, 'num_leaves': 78, 'max_depth': 15, 'min_child_sample': 80, 'subsample': 0.9391143207807309, 'colsample_bytree': 0.7634642476793414, 'reg_alpha': 4.35861478151366e-07, 'reg_lambda': 0.0016252210203816174}. Best is trial 0 with value: 0.710998818952956.
[I 2025-09-20 12:15:56,376] Trial 1 finished with value: 0.7137012692387674 and parameters: {'n_estimators': 188, 'num_leaves': 21, 'max_depth': 8, 'min_child_sample': 80, 'subsample': 0.7107109583528362, 'colsample_bytree': 0.860328396086419, 'reg_alpha': 6.096083619365188e-06, 'reg_lambda': 1.401971121146837e-08}. Best is trial 1 with value: 0.7137012692387674.
[I 2025-09-20 12:15:58,304] Trial 2 finished with value: 0.7129377927791534 and parameters: {'n_estimators': 212, 'num_leaves': 17, 'max_depth': 15, 'min_child_sample': 62, 'subsample': 0.8648290030776753, 'colsample_bytree': 0.9106510184692627, 'reg_alpha': 2.09311333

{'n_estimators': 108, 'num_leaves': 56, 'max_depth': 11, 'min_child_sample': 58, 'subsample': 0.685805840082756, 'colsample_bytree': 0.30109076945907953, 'reg_alpha': 8.365066827190633, 'reg_lambda': 4.893466471894359}
0.7219560538707834
