In [1]:
import os
import pandas as pd 
import numpy as np

from sklearn.metrics import r2_score

In [2]:
# Directory scanned for per-run artifacts. Files named "<run>.oof.csv" are
# read here and scored on their "y" / "y_pred" columns.
base_dir = "submissions"
OOF_SUFFIX = ".oof.csv"

# Score every run on its out-of-fold predictions.
cv_records = []
# Sorted listing: os.listdir returns entries in arbitrary order, and the row
# order of cv_df is what later cells use as the model index.
for filename in sorted(os.listdir(base_dir)):
    # Match the exact suffix (not just the substring "oof") so the slice
    # below always strips precisely the suffix and nothing else.
    if not filename.endswith(OOF_SUFFIX):
        continue
    oof_df = pd.read_csv(os.path.join(base_dir, filename))
    cv_records.append(
        {
            "name": filename[: -len(OOF_SUFFIX)],
            "cv": r2_score(oof_df["y"], oof_df["y_pred"]),
        }
    )

# Best run first. A stable sort keeps runs with identical CV in filename
# order, so positional model ids are reproducible across re-runs.
cv_df = pd.DataFrame(cv_records, columns=["name", "cv"]).sort_values(
    by=["cv"], ascending=False, ignore_index=True, kind="mergesort"
)
cv_df

Unnamed: 0,name,cv
0,2022_11_14-02:29:06,0.648379
1,2022_11_14-01:09:17,0.64675
2,2022_11_13-11:44:32,0.645505
3,2022_11_14-01:55:20,0.64472
4,2022_11_13-11:54:25,0.643482
5,2022_11_14-12:18:48,0.643482
6,2022_11_14-12:24:47,0.630456
7,2022_11_14-12:46:47,0.571927


In [3]:
# Load the out-of-fold frame of every run, in cv_df order (row i <-> model i).
oof = [
    pd.read_csv(os.path.join(base_dir, f"{run_name}.oof.csv"))
    for run_name in cv_df["name"]
]
num_oof, num_pred = len(oof), len(oof[0])

# The target column of the first run is used as the reference for all runs.
# NOTE(review): this presumes every OOF file lists the same rows in the same
# order -- confirm against the code that writes them.
oof_target = oof[0]["y"]
oof_cvs = [r2_score(oof_target, frame["y_pred"]) for frame in oof]

# One row per model, one column per out-of-fold prediction.
oof_preds = np.zeros((num_oof, num_pred))
for row, frame in enumerate(oof):
    oof_preds[row, :] = frame["y_pred"]

## Naive ensemble (average of predictions)

In [4]:
# Load each run's "<run>.csv" file, in cv_df order (column i <-> model i).
subs = [
    pd.read_csv(os.path.join(base_dir, f"{run_name}.csv"))
    for run_name in cv_df["name"]
]
num_sub, num_test = len(subs), len(subs[0])

# One row per test sample, one column per model.
sub_pred = np.zeros((num_test, num_sub))
for col, frame in enumerate(subs):
    sub_pred[:, col] = frame["y"]

sub_pred.shape

(776, 8)

In [5]:
# Sanity check: averaging across models (axis 1) leaves one value per test row.
sub_pred.mean(axis=1).shape

(776,)

In [6]:
# Naive ensemble: unweighted mean of every model's test prediction, keyed by
# the ids of the first submission file.
sub = pd.DataFrame(
    {
        "id": subs[0]["id"],
        "y": sub_pred.mean(axis=1),
    }
)
# Write next to the inputs via base_dir instead of a second hard-coded path,
# so changing base_dir moves inputs and outputs together.
sub.to_csv(os.path.join(base_dir, "avg_ensemble.csv"), index=False)

## Ensemble using hill climbing

In [7]:
# Greedy forward selection ("hill climbing") on the out-of-fold predictions.
#
# Start from model 0 (first row of cv_df, i.e. the best single CV). Each round
# tries blending every model not yet in the pool into the current ensemble as
#     weight * candidate + (1 - weight) * ensemble
# over a grid of weights, and keeps the (model, weight) pair with the highest
# R^2. Stops once the gain drops below MIN_IMPROVEMENT.
#
# Results: `models` (selected model indices, in order), `weights` (one blend
# weight per model after the first) and `cur_best_cv` (final ensemble R^2).
NUM_WEIGHTS = 200         # grid resolution for the blend weight, over [0, 1)
MIN_IMPROVEMENT = 0.0001  # minimum R^2 gain required to accept another model

models = [0]
weights = []
cur_best_cv = oof_cvs[0]

# Running blend of the models selected so far; updated only when a model is
# accepted, which is equivalent to rebuilding it from models/weights each round.
y_ensemble = oof_preds[models[0], :]
candidate_weights = np.linspace(0, 1.0, num=NUM_WEIGHTS, endpoint=False)

for _ in range(num_oof):
    # Start from -inf rather than 0: R^2 can be negative, and a 0 baseline
    # would register as a spurious "improvement" whenever cur_best_cv < 0
    # (or when no candidates remain), re-adding model 0 with weight 0.
    best_r2 = -np.inf
    best_model = None
    best_weight = None

    for j in range(num_oof):
        # If model is already in the pool, skip
        if j in models:
            continue

        for weight in candidate_weights:
            ensembled = weight * oof_preds[j, :] + (1 - weight) * y_ensemble
            score = r2_score(oof_target, ensembled)
            # Strict ">" keeps the first (model, weight) pair that reaches
            # the maximum, so ties resolve to the lower index / weight.
            if score > best_r2:
                best_r2 = score
                best_model = j
                best_weight = weight

    # When every model is already in the pool best_r2 stays at -inf, so this
    # check also terminates the search in that case.
    improvement = best_r2 - cur_best_cv
    if improvement < MIN_IMPROVEMENT:
        print("Can no longer improve. Stop!")
        break

    print(f"Added model {best_model} to the ensemble, CV improved from {cur_best_cv} to {best_r2}! (delta: {improvement})")
    models.append(best_model)
    weights.append(best_weight)
    cur_best_cv = best_r2
    y_ensemble = best_weight * oof_preds[best_model, :] + (1 - best_weight) * y_ensemble
    

Added model 1 to the ensemble, CV improved from 0.6483792867801517 to 0.6635961409709288! (delta: 0.015216854190777118)
Added model 7 to the ensemble, CV improved from 0.6635961409709288 to 0.6639761975679256! (delta: 0.00038005659699680283)
Can no longer improve. Stop!


In [8]:
# Report the selected model indices and the blend weight of each added model.
for label, value in (("Models used in the ensemble", models), ("Weights", weights)):
    print(label, value)

Models used in the ensemble [0, 1, 7]
Weights [0.485, 0.06]


## Combine submissions accordingly

In [9]:
# Load the "y" column of each selected model's "<run>.csv" file, in the order
# the models were selected.
preds = [
    pd.read_csv(os.path.join(base_dir, cv_df.loc[model_id, "name"] + ".csv"))["y"]
    for model_id in models
]

# Replay the blend sequentially: weights[k] belongs to the (k+1)-th model in
# the pool and is applied against the running ensemble.
ensemble_pred = preds[0]
for k, pred in enumerate(preds[1:]):
    ensemble_pred = weights[k] * pred + (1 - weights[k]) * ensemble_pred

In [42]:
def round_to_nearest_int(x, tol=0.4):
    """Snap values that lie close to an integer onto that integer.

    Each element whose distance to its nearest integer (per ``np.round``) is
    at most ``tol`` is replaced by that integer; all other elements are left
    as they are.

    Parameters
    ----------
    x : numpy.ndarray or pandas.Series
        Values to process. The input is not modified.
    tol : float, default 0.4
        Maximum distance to the nearest integer for a value to be snapped.

    Returns
    -------
    Same type as ``x``
        A copy of ``x`` with the close-to-integer entries rounded.
    """
    nearest = np.round(x)
    close = np.abs(x - nearest) <= tol
    # Work on a copy so the caller's array/Series is not silently changed.
    result = x.copy()
    result[close] = nearest[close]
    return result

In [43]:
# Hill-climbing submission: ids are taken from the first run's "<run>.csv"
# file, predictions from the weighted blend.
sub = pd.DataFrame()
sub["id"] = pd.read_csv(os.path.join(base_dir, cv_df.loc[0, "name"] + ".csv"))["id"]
# Pass a copy so the rounding step cannot alter ensemble_pred itself, which
# keeps this cell safe to re-run and the raw blend available for inspection.
sub["y"] = round_to_nearest_int(ensemble_pred.copy())
# Build the output path from base_dir rather than repeating the directory name.
sub.to_csv(os.path.join(base_dir, "hc_ensemble.csv"), index=False)