# Imports

In [None]:
import math
import os
from pathlib import Path
import random
import gc
import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GroupKFold, KFold
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import *
from sklearn.linear_model import *

# Constants

In [None]:
# Number of cross-validation folds; used when loading the per-fold base-model
# predictions and as n_splits of the stacking KFold.
FOLDS = 5
# Seed handed to set_seed() and to the stacking KFold shuffle.
SEED = 23
# When True, the train/test frames are truncated to their first 80*10000 rows.
DEBUG = False

# Seeding

In [None]:
def set_seed(seed: int):
    """Seed the stdlib and NumPy global RNGs and export PYTHONHASHSEED.

    Args:
        seed: Value passed unchanged to ``random.seed`` and
            ``np.random.seed`` and stored (as a string) in the
            ``PYTHONHASHSEED`` environment variable.
    """
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so this only
    # influences Python processes launched after this call.
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Apply the notebook-wide seed once, before any data is loaded or split.
set_seed(SEED)

# Read data

In [None]:
# Load the competition tables from the Kaggle input mount.
train_df = pd.read_csv("/kaggle/input/ventilator-pressure-prediction/train.csv")
test_df = pd.read_csv("/kaggle/input/ventilator-pressure-prediction/test.csv")

# Sorted grid of the distinct target values. It is computed on the full
# training frame, i.e. before the optional DEBUG truncation below, so the grid
# does not depend on DEBUG.
pressure_values = np.sort(train_df["pressure"].unique())

if DEBUG:
    train_df = train_df[:80*10000]
    test_df = test_df[:80*10000]

# Explicit copy: a 'pressure' column is assigned to `sub` later on, and writing
# into a slice of `test_df` would raise pandas' SettingWithCopyWarning.
sub = test_df[["id"]].copy()

pressure_values.shape

In [None]:
# Row labels of the test frame; used to address rows of `sub` when the
# submissions are written.
test_indices = test_df.index

# Re-shape the flat train columns into rows of 80 values.
# NOTE(review): assumes the frame holds 80 consecutive rows per sequence and a
# row count divisible by 80 - confirm against the dataset.
targets = train_df['pressure'].to_numpy().reshape(-1, 80)
u_out_0 = train_df['u_out'].to_numpy().reshape(-1, 80)

In [None]:
def _load_truncated_preds(path, size, steps):
    """Load a saved prediction array and keep the first `steps` of every `size` rows."""
    flat = np.load(path)
    keep = np.mod(np.arange(len(flat)), size) < steps
    return flat[keep]


def add_preds(model_path, train, test, n_folds, with_discrete=True, size=80, preds_folder='preds'):
    """Append one base model's predictions as extra feature column(s).

    For every fold the function reads, relative to ``model_path``:

    * ``indices/val_{fold}.npy`` - first-axis indices of ``train`` that were
      held out in that fold;
    * ``{preds_folder}/val_pred_fold_{fold}.npy`` - out-of-fold predictions;
    * ``{preds_folder}/test_pred_fold_{fold}.npy`` - test predictions;
    * the same two files with a ``_discrete`` suffix when ``with_discrete``.

    Out-of-fold predictions are written into the rows given by the fold's
    indices; test predictions are combined across folds with an element-wise
    median.

    Args:
        model_path: Directory of the base model's saved artefacts.
        train: 3-D array ``(n_sequences, steps, n_features)``.
        test: 3-D array with the same second axis length as ``train``.
        n_folds: Number of folds whose files are loaded.
        with_discrete: Also append the ``_discrete`` predictions (two new
            columns instead of one).
        size: Number of rows per sequence in the saved prediction files; rows
            at positions ``>= train.shape[1]`` within each sequence are dropped.
            (Presumably the files are flat, sequence-major arrays - confirm
            against the code that wrote them.)
        preds_folder: Sub-folder of ``model_path`` holding the prediction files.

    Returns:
        ``(train_1, test_1)``: copies of the inputs with the new column(s)
        appended on the last axis, continuous prediction first, discrete
        second. Rows of ``train`` not covered by any fold's indices stay zero.
    """
    steps = train.shape[1]
    suffixes = ['', '_discrete'] if with_discrete else ['']

    train_1 = np.zeros((train.shape[0], steps, train.shape[2] + len(suffixes)))
    # One list of per-fold test predictions for each appended column.
    # (The test output is built by concatenation below, so no buffer is
    # pre-allocated for it.)
    test_fold_preds = [[] for _ in suffixes]

    for fold in range(n_folds):
        oof_idx = np.load(f'{model_path}/indices/val_{fold}.npy')

        oof_cols = [
            _load_truncated_preds(
                f'{model_path}/{preds_folder}/val_pred_fold_{fold}{suffix}.npy', size, steps
            ).reshape(-1, steps, 1)
            for suffix in suffixes
        ]
        train_1[oof_idx] = np.concatenate([train[oof_idx], *oof_cols], axis=-1)

        for bucket, suffix in zip(test_fold_preds, suffixes):
            bucket.append(
                _load_truncated_preds(
                    f'{model_path}/{preds_folder}/test_pred_fold_{fold}{suffix}.npy', size, steps
                )
            )

    # Element-wise median over folds, then back to (n_sequences, steps, 1).
    test_cols = [
        np.median(np.vstack(bucket), axis=0).reshape(-1, steps, 1)
        for bucket in test_fold_preds
    ]
    test_1 = np.concatenate([test, *test_cols], axis=-1)

    return train_1, test_1

In [None]:
model_path = '../input/new-model-same-old-mistakes/'

# One entry per base model:
# (directory under model_path, `size` passed to add_preds, predictions sub-folder)
m_list = [
    ('i_think_i_might_have_built_a_model_v10', 80, 'preds'),
    ('i_think_i_might_have_built_a_model_v14', 80, 'preds'),
    ('i_think_i_might_have_built_a_model_v18_19', 80, 'preds'),
    ('i_think_i_might_have_built_a_model_v16', 80, 'preds'),
    ('i_think_i_might_have_built_a_model_v16', 80, 'preds_kaggle'),
    ('i_think_i_might_have_built_a_model_v16', 80, 'preds_colab'),
    ('i_think_i_might_have_built_a_model_v17_21_23_24', 80, 'preds'),
    ('i_think_i_might_have_built_a_model_colab_seed1', 80, 'preds'),
    ('i_think_i_might_have_built_a_model_v20_27_seed2', 80, 'preds'),
    ('i_think_i_might_have_built_a_model_v29_31_seed5', 80, 'preds'),
]

with_discrete = True

# Start from feature-less (n_sequences, 80, 0) arrays; every add_preds call
# appends that model's prediction column(s) on the last axis.
train = np.empty(shape=(train_df.shape[0] // 80, 80, 0))
test = np.empty(shape=(test_df.shape[0] // 80, 80, 0))

for model_dir, seq_len, folder in m_list:
    train, test = add_preds(
        model_path + model_dir,
        train,
        test,
        FOLDS,
        with_discrete=with_discrete,
        size=seq_len,
        preds_folder=folder,
    )

train.shape, test.shape

In [None]:
# Report each base model's out-of-fold MAE on the rows where u_out == 0.
# One row per time step, one column per stacked prediction.
tr = train.reshape(-1, train.shape[2])

# True where u_out == 0. Vectorised comparison instead of converting every
# element to a Python bool first.
u_out_0_flat = u_out_0.ravel() == 0
targets_scored = targets.ravel()[u_out_0_flat]

# add_preds appends (continuous, discrete) per model when with_discrete is set,
# otherwise a single continuous column.
cols_per_model = 2 if with_discrete else 1

for i, m in enumerate(m_list):
    pred = tr[:, i * cols_per_model]
    mae = mean_absolute_error(targets_scored, pred[u_out_0_flat])
    if with_discrete:
        pred_discrete = tr[:, i * cols_per_model + 1]
        mae_discrete = mean_absolute_error(targets_scored, pred_discrete[u_out_0_flat])
        print(f"OOF {m[0]}, {m[1]}, {m[2]} || MAE score (discrete, u_out==0): {mae_discrete:.6f} || MAE score (non-discrete, u_out==0): {mae:.6f}")
    else:
        print(f"OOF {m[0]}, {m[1]}, {m[2]} || MAE score (non-discrete, u_out==0): {mae:.6f}")

# Discretization

In [None]:
# Gaps between neighbouring values of the sorted pressure grid; their median
# is taken as the grid step. (`diff` must stay a named variable: a later cell
# deletes it explicitly.)
diff = pressure_values[1:] - pressure_values[:-1]
step = np.median(diff)
step

In [None]:
# Number of extra grid points added below the smallest and above the largest
# observed pressure value.
EXTRAPOLATE_KNOTS = 100

# Build the extra knots as integer multiples of `step`. A float-step
# np.arange(start, stop, step) excludes `stop`, so the previous left range
# (stop = min - step) never contained `min - step` and left a 2*step hole just
# below the observed grid; its element count was also subject to rounding.
knot_offsets = step * np.arange(1, EXTRAPOLATE_KNOTS + 1)
left_pressure_extrapolate = pressure_values[0] - knot_offsets[::-1]
right_pressure_extrapolate = pressure_values[-1] + knot_offsets

# Full ascending grid: extrapolated-low, observed, extrapolated-high.
pressure_values_extra = np.concatenate([left_pressure_extrapolate, pressure_values, right_pressure_extrapolate])

# Midpoints between neighbouring grid values (one fewer than the grid itself).
pressure_values_extra_mid_points = (pressure_values_extra[1:] + pressure_values_extra[:-1]) / 2

# Drop the intermediates; only the extended grid and its midpoints are kept.
del diff
del left_pressure_extrapolate
del right_pressure_extrapolate
del pressure_values
del knot_offsets
gc.collect()

In [None]:
def discretize_np(y_discr, y_midpoints, y_cont):
    """Map continuous values onto a discrete grid via its midpoints.

    Args:
        y_discr: Array of grid values, indexed by the bin position found below.
        y_midpoints: Sorted bin edges passed to ``np.searchsorted``.
        y_cont: Continuous values to discretize.

    Returns:
        ``y_discr`` indexed by the ``side="left"`` insertion position of each
        element of ``y_cont`` in ``y_midpoints``; a value equal to a midpoint
        therefore maps to the grid entry below it.
    """
    bin_idx = np.searchsorted(a=y_midpoints, v=y_cont, side="left")
    return y_discr[bin_idx]

# Ensembling

In [None]:
# Stack the base-model predictions with a linear model, cross-validated over
# the first axis of `train` (whole sequences go to one side of each split).
k_fold = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

# Per-fold out-of-fold pieces, restricted to rows with u_out == 0.
oof_preds = []
oof_preds_discrete = []
oof_targets = []

# Per-fold predictions on the full test set.
test_preds = []
test_preds_discrete = []

n_features = train.shape[2]
# Loop-invariant: flatten the test features to (rows, n_features) once.
test = test.reshape(-1, n_features)

for fold, (train_idx, val_idx) in enumerate(k_fold.split(train, targets)):
    print(f"FOLD={fold} started")

    # Flatten sequences into one row per time step.
    X_train = train[train_idx].reshape(-1, n_features)
    X_val = train[val_idx].reshape(-1, n_features)
    y_train = targets[train_idx].ravel()
    y_val = targets[val_idx].ravel()

    # True where u_out == 0 in the validation rows; only these rows are scored.
    u_out_0_val = u_out_0[val_idx]
    u_out_0_val_flat = u_out_0_val.ravel() == 0

    y_val_flat = y_val

    ######## Linear model

    print(X_train.shape, y_train.shape)
    print(X_val.shape, y_val.shape)

    # Fitted on every training row, including those with u_out != 0.
    model = LinearRegression()
    model.fit(X_train, y_train)
    print(f'Ensemble Weights: {model.coef_}')
    print(f'Sum of weights: {np.sum(model.coef_)}')

    ########

    print(f"FOLD={fold} started train prediction")
    # NOTE(review): the train-set predictions are not read anywhere else in the
    # visible notebook; kept so the cell leaves the same variables behind.
    train_pred = model.predict(X_train).ravel()
    train_pred_discrete = discretize_np(pressure_values_extra, pressure_values_extra_mid_points, train_pred)

    print(f"FOLD={fold} started validation prediction")
    val_pred = model.predict(X_val).ravel()
    val_pred_discrete = discretize_np(pressure_values_extra, pressure_values_extra_mid_points, val_pred)

    # Keep arrays (not Python lists) so all three OOF collections are stored
    # the same way.
    oof_preds.append(val_pred[u_out_0_val_flat])
    oof_preds_discrete.append(val_pred_discrete[u_out_0_val_flat])
    oof_targets.append(y_val_flat[u_out_0_val_flat])

    val_mae = mean_absolute_error(y_val_flat[u_out_0_val_flat], val_pred[u_out_0_val_flat])
    val_mae_discrete = mean_absolute_error(y_val_flat[u_out_0_val_flat], val_pred_discrete[u_out_0_val_flat])
    print(f"FOLD={fold} | MAE score (discrete, u_out==0): {val_mae_discrete:.6f},  MAE score (non-discrete, u_out==0): {val_mae:.6f}")

    print(f"FOLD={fold} started test prediction")
    test_pred = model.predict(test).ravel()
    test_preds.append(test_pred)
    test_pred_discrete = discretize_np(pressure_values_extra, pressure_values_extra_mid_points, test_pred)
    test_preds_discrete.append(test_pred_discrete)

    print(f"FOLD={fold} finished")

In [None]:
# Join the per-fold out-of-fold pieces into single flat arrays.
oof_targets = np.hstack(oof_targets)
oof_preds = np.hstack(oof_preds)
oof_preds_discrete = np.hstack(oof_preds_discrete)

# Overall out-of-fold MAE for the discretized and the raw stacked predictions.
oof_mae_discrete = mean_absolute_error(y_true=oof_targets, y_pred=oof_preds_discrete)
oof_mae = mean_absolute_error(y_true=oof_targets, y_pred=oof_preds)
print(
    f"OOF | MAE score (discrete, u_out==0): {oof_mae_discrete:.6f},  "
    f"MAE score (non-discrete, u_out==0): {oof_mae:.6f}"
)

In [None]:
# Float placeholder: the column is overwritten with float predictions below,
# and creating it as integer 0 would make those assignments an
# incompatible-dtype setitem (deprecated in recent pandas).
sub['pressure'] = 0.0

# ENSEMBLE FOLDS WITH MEAN
sub.loc[test_indices, 'pressure'] = sum(test_preds) / len(test_preds)
sub[["id", "pressure"]].to_csv("submission_mean.csv", index=False)

# ENSEMBLE FOLDS WITH MEDIAN
sub.loc[test_indices, 'pressure'] = np.median(np.vstack(test_preds),axis=0)
sub[["id", "pressure"]].to_csv("submission_median.csv", index=False)
sub[["id", "pressure"]]

In [None]:
# Same two fold-ensembles as for the continuous predictions, but built from
# the per-fold discretized test predictions.

# ENSEMBLE FOLDS WITH MEAN
# NOTE: an average of grid values is not necessarily itself on the grid.
mean_of_discrete = sum(test_preds_discrete) / len(test_preds_discrete)
sub.loc[test_indices, 'pressure'] = mean_of_discrete
sub[["id", "pressure"]].to_csv("submission_mean_discrete.csv", index=False)

# ENSEMBLE FOLDS WITH MEDIAN
median_of_discrete = np.median(np.vstack(test_preds_discrete), axis=0)
sub.loc[test_indices, 'pressure'] = median_of_discrete
sub[["id", "pressure"]].to_csv("submission_median_discrete.csv", index=False)
sub[["id", "pressure"]]