In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import copy
import glob
import random
from random import random as rd
import gc

# References
* https://www.kaggle.com/cdeotte/ensemble-folds-with-median-0-153 by Chris Deotte
* https://www.kaggle.com/snnclsr/a-dummy-approach-to-improve-your-score-postprocess by Sinan Calisir
* Public notebooks with a score below 0.158

In [None]:
# Build the ascending grid of distinct pressure values found in the training
# data; find_nearest snaps predictions onto this grid.
df_train = pd.read_csv("../input/ventilator-pressure-prediction/train.csv")
unique_pressures = pd.unique(df_train["pressure"])
sorted_pressures = np.sort(unique_pressures)
total_pressures_len = sorted_pressures.shape[0]

def find_nearest(prediction):
    """Snap one predicted pressure onto the closest value in ``sorted_pressures``.

    Args:
        prediction: a single (scalar) predicted pressure.

    Returns:
        The element of the module-level ``sorted_pressures`` array nearest to
        ``prediction``. Predictions beyond either end of the array are clamped
        to the last / first element; an exact tie between two neighbours
        resolves to the larger one.
    """
    pos = np.searchsorted(sorted_pressures, prediction)
    # Past the right end: nothing larger exists, so clamp to the maximum.
    if pos == total_pressures_len:
        return sorted_pressures[-1]
    # Before the left end: nothing smaller exists, so clamp to the minimum.
    if pos == 0:
        return sorted_pressures[0]
    below, above = sorted_pressures[pos - 1], sorted_pressures[pos]
    # Strict comparison: on equal distance the upper neighbour wins.
    if abs(below - prediction) < abs(above - prediction):
        return below
    return above

def set_seed(seed = 2021):
    """Seed the global NumPy and ``random`` generators and export PYTHONHASHSEED.

    Args:
        seed: integer seed applied to every generator (default 2021).

    Returns:
        A fresh ``np.random.RandomState`` seeded with the same value, for
        callers that want a generator independent of the global state.
    """
    np.random.seed(seed)
    random.seed(seed)
    # Only exported to the environment here; Python reads this variable at
    # interpreter start-up, so it matters for processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    return np.random.RandomState(seed)

def _lb_score_from_path(path):
    """Parse the public-LB score that prefixes a prediction file's name.

    The fractional part is the text between the first dot of the file name and
    the next dot or space (e.g. ``"0.335 LSTM baseline.csv"`` -> ``0.335``).
    The score is returned as a real decimal number so that scores written with
    different numbers of digits (``0.15`` vs ``0.153``) stay comparable.

    Raises:
        IndexError: if the file name contains no dot.
        ValueError: if the text after the first dot is not made of digits.
    """
    name = os.path.basename(path)
    pieces = name.split(".")
    frac_digits = pieces[1].split(" ")[0]
    if not frac_digits.isdigit():
        raise ValueError(f"cannot parse a public LB score from file name {name!r}")
    # Integer part: the token right before the first dot; fall back to 0 when
    # it is not numeric.
    whole = pieces[0].split(" ")[-1]
    whole_digits = whole if whole.isdigit() else "0"
    return float(f"{whole_digits}.{frac_digits}")


def wc(input_list):
    """Blend a small group of prediction files into one weighted prediction.

    Args:
        input_list: paths of prediction CSV files (each with a ``pressure``
            column) whose names start with their public LB score. The list is
            expected to be ordered by ascending score; it is not modified.

    Returns:
        A 1-D NumPy array. For a single file it is that file's ``pressure``
        column. Otherwise it is ``first * w1 + second * (1 - w1)`` with
        ``w1 = score_of_second / sum_of_all_scores + 0.1``, i.e. the file
        listed first receives the larger share when its score is lower.

    Notes:
        Only the first two files take part in the blend. Extra files are not
        blended (a warning is emitted), although their scores still enter the
        denominator of ``w1``, exactly as before.
    """
    scores = [_lb_score_from_path(path) for path in input_list]
    # Only the first two files are ever blended, so only those are loaded.
    # to_numpy() replaces the deprecated Series.ravel().
    predictions = [pd.read_csv(path)["pressure"].to_numpy() for path in input_list[:2]]

    if len(input_list) == 1:
        return predictions[0]

    if len(input_list) > 2:
        import warnings
        warnings.warn(
            f"wc received {len(input_list)} files; only the first two are blended",
            stacklevel=2,
        )

    weight1 = scores[1] / sum(scores) + 0.1
    weight2 = 1 - weight1
    return predictions[0] * weight1 + predictions[1] * weight2

def g(dp, loop_budget=500):
    """Blend all prediction files in ``dp`` with random weights and save the result.

    File name format: the public LB score, optionally followed by a name,
    e.g. ``0.335 LSTM baseline``.

    Steps:
      1. List every entry under ``dp`` and sort the paths (the names are
         expected to start with the score, so this orders them by score).
      2. Split the sorted files into consecutive pairs; the last group takes
         whatever remains (two or three files, or one when only a single
         file exists) and blend each group with ``wc``.
      3. Repeat ``loop_budget // file_count`` times (at least once): seed the
         RNG with the loop index, draw one random weight per group, normalise
         the weights, sort them in descending order so the first group gets
         the largest one, and form the weighted sum of the group blends.
      4. Take the element-wise median over all repetitions, snap every value
         with ``find_nearest`` and write ``rwb <loops> loops.csv`` to the
         current working directory.

    Args:
        dp: dataset directory holding the prediction files.
        loop_budget: total work budget; the number of random draws is this
            value integer-divided by the number of files (default 500).

    Raises:
        ValueError: if ``dp`` contains no files.
    """
    paths = sorted(glob.iglob(f'{dp}/*'))
    file_count = len(paths)
    if file_count == 0:
        raise ValueError(f'no prediction files found in {dp!r}')
    # At least one draw, even when there are more files than the budget.
    loop_time = max(1, loop_budget // file_count)

    # Consecutive pairs; the final group absorbs the remainder. A single file
    # still forms one group instead of producing no groups at all.
    group_count = max(1, file_count // 2)
    groups = [paths[2 * k: 2 * k + 2] for k in range(group_count - 1)]
    groups.append(paths[2 * (group_count - 1):])

    # Turn every group of files into one blended prediction.
    blended = [wc(group) for group in groups]

    if len(blended) == 1:
        # With one group the normalised weight is 1 in every draw, so the
        # median over the draws is the group blend itself.
        median_pred = blended[0]
    else:
        # One row per random draw; filling a preallocated matrix avoids the
        # extra full copy that stacking a list of arrays would make.
        draws = np.empty((loop_time, len(blended[0])))
        for seed in range(loop_time):
            set_seed(seed)
            raw = [rd() for _ in blended]
            raw_total = sum(raw)
            # Largest weight goes to the first (best-scored) group.
            weights = sorted((w / raw_total for w in raw), reverse=True)
            draws[seed] = sum(pred * w for pred, w in zip(blended, weights))
        # draws is not needed afterwards, so the median may reorder it in place.
        median_pred = np.median(draws, axis=0, overwrite_input=True)

    output = pd.read_csv("../input/ventilator-pressure-prediction/sample_submission.csv")
    output["pressure"] = median_pred
    output["pressure"] = output["pressure"].apply(find_nearest)
    output.to_csv(f'rwb {loop_time} loops.csv', index=False)

In [None]:
# Blend every prediction file in the dataset directory and write the result CSV.
g(dp='../input/gb-rwbt-files')