In [45]:
import os
import pandas as pd 
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.linear_model import Ridge, Lasso, LinearRegression, BayesianRidge
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, QuantileTransformer
from sklearn.pipeline import Pipeline

In [70]:
base_dir = "submissions"
OOF_SUFFIX = ".oof.csv"  # out-of-fold files are stored as "<run name>.oof.csv"
TOP_K = 5  # number of best-scoring runs kept for stacking

# Score every out-of-fold prediction file found in base_dir.
# Matching on the full suffix (rather than the substring "oof") keeps unrelated
# files out, and stripping exactly that suffix always yields the run name.
records = []
for filename in sorted(os.listdir(base_dir)):  # listdir order is arbitrary
    if not filename.endswith(OOF_SUFFIX):
        continue
    oof_df = pd.read_csv(os.path.join(base_dir, filename))
    records.append(
        {
            "name": filename[: -len(OOF_SUFFIX)],
            # R^2 of the run's out-of-fold predictions against its own targets.
            "cv": r2_score(oof_df["y"], oof_df["y_pred"]),
        }
    )

# Build the frame once, rank by score (best first) and keep the top runs.
# Passing `columns` keeps the schema stable even when no file matched.
cv_df = (
    pd.DataFrame(records, columns=["name", "cv"])
    .sort_values(by=["cv"], ignore_index=True, ascending=False)
    .head(TOP_K)
)
cv_df

Unnamed: 0,name,cv
0,2022_11_14-02:29:06,0.648379
1,2022_11_14-01:09:17,0.64675
2,2022_11_13-11:44:32,0.645505
3,2022_11_14-01:55:20,0.64472
4,2022_11_13-11:54:25,0.643482


In [71]:
# Load the out-of-fold predictions of the selected runs, in ranking order.
oof = [pd.read_csv(os.path.join(base_dir, f"{name}.oof.csv")) for name in cv_df["name"]]
if not oof:
    raise ValueError(f"no out-of-fold prediction files were selected from {base_dir!r}")

num_oof = len(oof)
num_pred = len(oof[0])
# The first run's target column serves as the shared ground truth below.
oof_target = oof[0]["y"]

# Stacking is only meaningful if every run predicted the same rows in the same
# order. Verify that instead of silently scoring against misaligned targets.
for run_name, run_df in zip(cv_df["name"], oof):
    if len(run_df) != num_pred:
        raise ValueError(
            f"{run_name}: expected {num_pred} out-of-fold rows, found {len(run_df)}"
        )
    if not np.allclose(run_df["y"].to_numpy(), oof_target.to_numpy()):
        raise ValueError(f"{run_name}: target column differs from the first run")

# Meta-feature matrix: one row per sample, one column per run.
oof_preds = np.column_stack([run_df["y_pred"].to_numpy(dtype=float) for run_df in oof])
# Per-run R^2 measured against the shared target.
oof_cvs = [r2_score(oof_target, run_df["y_pred"]) for run_df in oof]

oof_preds.shape

(1111, 5)

In [72]:
# Grid-search the Ridge penalty of the stacking model using the mean R^2 over
# 5-fold cross-validation on the out-of-fold prediction matrix.
# Start from -inf (not 0) so a winner is still selected when every candidate
# scores a non-positive R^2.
best_cv = -np.inf
best_alpha = None

for alpha in np.linspace(0.1, 10000, num=100):
    # Loop-local names: avoid overwriting the notebook-level `model` and `cv`.
    candidate = Ridge(alpha=alpha)
    fold_scores = cross_val_score(candidate, oof_preds, oof_target, scoring="r2", cv=5)
    mean_score = np.mean(fold_scores)
    print(alpha, mean_score)
    if mean_score > best_cv:
        best_cv = mean_score
        best_alpha = alpha

print("Best alpha =", best_alpha, f"(CV:{best_cv})")

0.1 0.6421598921388447
101.1090909090909 0.6555205693967701
202.1181818181818 0.6564331126483673
303.1272727272727 0.6568907834984585
404.1363636363636 0.657184598398802
505.1454545454545 0.6573940289083916
606.1545454545454 0.6575521963598469
707.1636363636363 0.6576760015878664
808.1727272727272 0.6577751993672131
909.1818181818181 0.6578559183740176
1010.190909090909 0.6579222585943313
1111.1999999999998 0.6579770994958238
1212.2090909090907 0.6580225433152764
1313.2181818181816 0.6580601740434268
1414.2272727272725 0.6580912166656402
1515.2363636363634 0.6581166393254559
1616.2454545454543 0.6581372212793564
1717.2545454545452 0.6581535995280653
1818.2636363636361 0.6581663016984763
1919.272727272727 0.6581757697934632
2020.281818181818 0.6581823777152267
2121.290909090909 0.6581864444428117
2222.2999999999997 0.658188244111902
2323.3090909090906 0.6581880138441145
2424.3181818181815 0.6581859599127753
2525.3272727272724 0.6581822626595156
2626.3363636363633 0.6581770804591962
2727

In [73]:
# Load the test-set predictions of the selected runs. Iterating cv_df["name"]
# keeps the column order identical to the out-of-fold matrix built from the
# same names.
subs = [pd.read_csv(os.path.join(base_dir, f"{name}.csv")) for name in cv_df["name"]]
if not subs:
    raise ValueError(f"no submission files were selected from {base_dir!r}")

num_test = len(subs[0])
num_sub = len(subs)

# The final submission reuses the ids of the first file, so every run must list
# the same ids in the same order.
# NOTE(review): assumes every submission file has an "id" column like the first one - confirm.
reference_ids = subs[0]["id"].to_numpy()
for run_name, run_df in zip(cv_df["name"], subs):
    if len(run_df) != num_test:
        raise ValueError(f"{run_name}: expected {num_test} test rows, found {len(run_df)}")
    if not np.array_equal(run_df["id"].to_numpy(), reference_ids):
        raise ValueError(f"{run_name}: ids differ from the first submission")

# Meta-feature matrix for the test set: one row per sample, one column per run.
sub_pred = np.column_stack([run_df["y"].to_numpy(dtype=float) for run_df in subs])

sub_pred.shape

(776, 5)

In [78]:
# Fit the final stacking model on the full out-of-fold matrix with the penalty
# chosen by the grid search. Using `best_alpha` instead of a hand-copied
# constant keeps this cell in sync when the inputs or the grid change.
model = Ridge(alpha=best_alpha)
model = model.fit(oof_preds, oof_target)
# In-sample R^2: scored on the same rows the model was fitted on, so it is not
# a cross-validated estimate.
r2_score(oof_target, model.predict(oof_preds))

0.6625737073294304

In [76]:
def round_to_nearest_int(x, tol=0.2):
    """Snap near-integer values to the nearest integer; leave the rest as-is.

    Despite the name, values are NOT all rounded: only entries whose distance
    to the nearest integer is at most ``tol`` are replaced.

    Parameters
    ----------
    x : array-like of numbers
        Values to process. The input is copied and never modified.
    tol : float, default 0.2
        Maximum absolute distance to the nearest integer for a value to be
        snapped.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``x``.
    """
    values = np.array(x, dtype=float)  # np.array copies, so the caller's data stays intact
    nearest = np.round(values)
    return np.where(np.abs(values - nearest) <= tol, nearest, values)

In [77]:
# Assemble the stacked submission: ids come from the first loaded submission,
# targets from the stacking model's test predictions after they pass through
# round_to_nearest_int.
stacked_pred = round_to_nearest_int(model.predict(sub_pred))
sub = pd.DataFrame({"id": subs[0].id, "y": stacked_pred})
sub.to_csv("submissions/stacking.csv", index=False)