In [1]:
import sys

from pathlib import Path
import numpy as np
import pandas as pd
import scipy.signal
from scipy import signal
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import RobustScaler
from tqdm import tqdm
import copy
import pickle


import warnings
warnings.simplefilter("ignore")

In [2]:
class CFG:
    """Run-level settings for this stacking experiment."""

    # Experiment number; embedded in the names of the files written below.
    exp_num = 1
    # Number of cross-validation folds.
    n_folds = 10
    # Folds that are actually trained (all of them here).
    folds = list(range(n_folds))
    # Random seed passed to the fold splitter.
    seed = 777
    local = True


In [3]:
# Input/output locations used throughout the notebook.
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook anywhere else.
# DATA_DIR holds train.csv / test.csv / sample_submission.csv (read below).
DATA_DIR = Path("/home/knikaido/work/Ventilator-Pressure-Prediction/data/ventilator-pressure-prediction")
# Directory the OOF array and the submission CSVs are written to.
OUTPUT_DIR = Path('./output/')
# Directories searched recursively for *.npy / *.csv prediction files.
OOF_DIR = Path("/home/knikaido/work/Ventilator-Pressure-Prediction/data/team_oofs_stacking")
SUB_DIR = Path("/home/knikaido/work/Ventilator-Pressure-Prediction/data/team_subs_stacking")
# NOTE(review): PICKLE_DIR is not used anywhere in the visible cells.
PICKLE_DIR = Path("/home/knikaido/work/Ventilator-Pressure-Prediction/data/team_stacking_pickle")

In [4]:
# Extend the import path so that `utils` can be imported from ../../src/
# (presumably a project-local helper module -- verify the relative location).
sys.path.append('../../src/')
import utils as utils
from utils import Timer

In [5]:
# Read the raw train/test tables from DATA_DIR.
train, test = (
    pd.read_csv(DATA_DIR / "train.csv"),
    pd.read_csv(DATA_DIR / "test.csv"),
)

In [6]:
# Recursively collect every .npy / .csv prediction file and sort by path.
# NOTE(review): the stacking below pairs OOF file i with submission file i purely
# by sorted position -- confirm both directories use matching file names.
oof_paths = sorted([*OOF_DIR.rglob('*.npy'), *OOF_DIR.rglob('*.csv')])
sub_paths = sorted([*SUB_DIR.rglob('*.npy'), *SUB_DIR.rglob('*.csv')])

len(oof_paths), len(sub_paths)

(29, 29)

In [7]:
def read_team_preds(paths):
    """Load one prediction vector per file and stack them row-wise.

    Parameters
    ----------
    paths : iterable of str or pathlib.Path
        Files ending in ``.csv`` are read with pandas and must contain either a
        ``pressure`` column (preferred) or a ``pred`` column. Every other file
        is loaded with ``np.load``.

    Returns
    -------
    numpy.ndarray
        Array with one row per input file (built with ``np.array``), each row
        being that file's flattened predictions.

    Raises
    ------
    ValueError
        If a CSV file has neither a ``pressure`` nor a ``pred`` column.
    """
    preds = []
    for path in paths:
        path = Path(path)
        if path.suffix.lower() == '.csv':
            # Inspect the header first instead of relying on a bare ``except``
            # to discover which column name the file uses.
            header = pd.read_csv(path, nrows=0).columns
            if 'pressure' in header:
                value_col = 'pressure'
            elif 'pred' in header:
                value_col = 'pred'
            else:
                raise ValueError(
                    f"{path}: expected a 'pressure' or 'pred' column, found {list(header)}"
                )
            pred = pd.read_csv(path, usecols=[value_col]).values.reshape(-1)
        else:
            # Always flatten so that (n, 1)-shaped arrays stack with (n,) ones.
            pred = np.load(path).reshape(-1)

        preds.append(pred)
    print('preparation done!')
    return np.array(preds)

In [8]:
# One column per OOF file: transpose the (n_files, n_rows) stack and label the
# columns pred_0, pred_1, ...
oofs = read_team_preds(oof_paths).T
oofs = pd.DataFrame(oofs, columns=[f'pred_{str(i)}' for i in range(oofs.shape[1])])
oofs

preparation done!


Unnamed: 0,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,...,pred_19,pred_20,pred_21,pred_22,pred_23,pred_24,pred_25,pred_26,pred_27,pred_28
0,5.789477,5.833371,5.831460,5.797319,5.787222,5.994471,5.872724,6.023309,5.764046,5.898385,...,5.820899,5.523668,6.056924,6.057409,5.862573,5.773038,5.810369,5.941694,5.814224,5.805068
1,5.863910,5.938485,5.860644,5.877145,5.887587,5.921943,5.835646,5.845187,5.866303,5.857195,...,5.824658,5.929617,5.840458,5.837662,5.848502,5.877314,5.871547,5.846353,5.847313,5.846550
2,8.144781,8.245914,8.177640,8.098116,8.147405,8.133663,8.057798,7.962515,8.049034,7.991342,...,8.030790,8.095352,8.155348,8.152152,7.955888,8.036733,8.126698,8.084612,8.069213,7.975787
3,12.079843,12.079426,12.082666,12.083275,12.057881,12.146135,12.018413,11.971588,12.080100,12.059141,...,11.877783,12.134422,12.096680,12.099493,12.016686,11.990269,12.084424,12.239206,12.118088,12.002528
4,12.521106,12.523297,12.601661,12.628415,12.538396,12.624945,12.497830,12.531747,12.527015,12.567769,...,12.507798,12.600920,12.587675,12.584952,12.537569,12.526273,12.546120,12.477226,12.604493,12.502975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2290963,29.489279,29.465751,29.463932,29.463486,29.468914,29.457060,29.478907,29.481145,29.462510,29.461234,...,29.476760,29.459700,29.463757,29.463902,29.461472,29.469219,29.480509,29.463127,29.460402,29.423048
2290964,29.105459,29.107071,29.133446,29.130894,29.119773,29.121709,29.121944,29.114014,29.110409,29.117303,...,29.140826,29.138840,29.125188,29.125477,29.109639,29.110482,29.120060,29.111472,29.111569,29.086384
2290965,29.884453,29.874415,29.894176,29.887880,29.878339,29.875105,29.910615,29.910456,29.880919,29.886303,...,29.887930,29.883578,29.876305,29.876951,29.880339,29.886278,29.890086,29.896856,29.892732,29.874448
2290966,29.383950,29.389308,29.401897,29.391508,29.391641,29.381354,29.392324,29.388630,29.387965,29.375070,...,29.380993,29.397045,29.390387,29.390009,29.373224,29.380328,29.373083,29.391071,29.388413,29.382864


In [11]:
# One column per submission file: transpose the (n_files, n_rows) stack and
# label the columns pred_0, pred_1, ...
subs = read_team_preds(sub_paths).T
subs = pd.DataFrame(subs, columns=[f'pred_{str(i)}' for i in range(subs.shape[1])])
subs

preparation done!


Unnamed: 0,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,...,pred_19,pred_20,pred_21,pred_22,pred_23,pred_24,pred_25,pred_26,pred_27,pred_28
0,6.271525,6.274139,6.250293,6.239222,6.234902,6.230631,6.248490,6.248912,6.258742,6.254150,...,6.255715,6.270192,6.209339,6.208601,6.234668,6.242348,6.259692,6.234512,6.244083,6.239510
1,6.002286,6.000910,5.974248,5.962183,5.972803,5.948584,5.974461,5.974298,5.986993,5.981585,...,5.977430,5.970970,5.954226,5.952595,5.970536,5.979932,5.984572,5.982571,5.995547,5.973396
2,7.146135,7.145276,7.143232,7.136650,7.127580,7.123176,7.113478,7.114281,7.171972,7.165225,...,7.164241,7.154575,7.145734,7.144103,7.120587,7.118664,7.120526,7.149453,7.138753,7.121976
3,7.743807,7.765169,7.723122,7.719956,7.751527,7.668999,7.738619,7.740086,7.750413,7.737912,...,7.752016,7.730515,7.696521,7.695406,7.684845,7.707492,7.684567,7.772260,7.740436,7.688272
4,9.226192,9.219263,9.246565,9.242394,9.211786,9.196526,9.228693,9.215903,9.233153,9.219219,...,9.225886,9.231619,9.209006,9.207569,9.221489,9.240027,9.252678,9.230925,9.249313,9.231130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4023995,13.851959,13.629684,13.657732,13.534072,0.000000,0.000000,0.000000,0.000000,13.967936,14.155994,...,13.823712,13.164266,15.257716,15.238763,14.365964,14.153107,15.354063,14.213044,14.063761,13.676671
4023996,14.090833,13.785939,13.524315,13.385818,0.000000,0.000000,0.000000,0.000000,14.245968,14.111152,...,13.816230,13.322442,15.473152,15.463561,14.781502,14.449735,15.355318,14.399313,13.898353,14.014549
4023997,14.094390,14.146443,13.568069,13.511240,0.000000,0.000000,0.000000,0.000000,14.316333,14.174723,...,13.984568,13.530339,15.764007,15.758864,14.877322,14.795870,15.515092,14.419702,14.630459,14.213983
4023998,14.462595,14.354524,14.448002,14.427994,0.000000,0.000000,0.000000,0.000000,14.543948,14.750629,...,14.286920,13.157774,15.892453,15.888368,15.085758,14.273345,16.102294,14.306656,14.117593,14.409499


In [12]:
def get_raw_features(input_df, dataType = 'train'):
    """Return the base columns ``time_step``, ``u_in``, ``R`` and ``C`` unchanged.

    ``dataType`` is accepted for interface parity with the other feature
    builders and is not used.
    """
    raw_columns = ['time_step', 'u_in', 'R', 'C']
    return input_df.loc[:, raw_columns]

In [13]:
def get_category_features(input_df, dataType = 'train'):
    """Encode each (R, C) pair as a single integer category ``R_C``.

    The key ``"<R>_<C>"`` is looked up in a fixed table of the nine known
    combinations; pairs missing from the table map to NaN. Returns a one-column
    DataFrame sharing ``input_df``'s index. ``dataType`` is unused.
    """
    rc_map = {'5_10': 0, '5_20': 1, '5_50': 2, '20_10': 3, '20_20': 4, '20_50': 5, '50_10': 6, '50_20': 7, '50_50': 8}

    rc_key = input_df['R'].astype(str) + '_' + input_df['C'].astype(str)
    return rc_key.map(rc_map).to_frame('R_C')

In [14]:
def get_simple_calc_features(input_df, dataType = 'train'):
    """Build simple per-breath running features from ``u_in``, ``u_out`` and ``time_step``.

    Returns only the newly created columns, in this order: ``time_delta``,
    ``delta``, ``area``, ``cross``, ``cross2``, ``u_in_cumsum``,
    ``u_in_cummean``, ``u_in_sqrt``, ``u_in_sqrt_cumsum``. ``dataType`` is unused.
    """
    output_df = input_df.copy()
    n_input_cols = input_df.shape[1]
    breath = output_df['breath_id']

    # Time elapsed since the previous row of the same breath (0 on the first row).
    output_df['time_delta'] = output_df['time_step'].groupby(breath).diff().fillna(0)
    output_df['delta'] = output_df['u_in'] * output_df['time_delta']
    # Running sum of u_in * dt within the breath.
    output_df['area'] = output_df['delta'].groupby(breath).cumsum()

    output_df['cross'] = output_df['u_out'] * output_df['u_in']
    output_df['cross2'] = output_df['u_out'] * output_df['time_step']

    output_df['u_in_cumsum'] = output_df['u_in'].groupby(breath).cumsum()
    # 1-based position of the row inside its breath, kept out of the frame.
    rows_so_far = pd.Series(1, index=output_df.index).groupby(breath).cumsum()
    output_df['u_in_cummean'] = output_df['u_in_cumsum'] / rows_so_far

    output_df['u_in_sqrt'] = np.sqrt(output_df['u_in'])
    output_df['u_in_sqrt_cumsum'] = output_df['u_in_sqrt'].groupby(breath).cumsum()

    return output_df.iloc[:, n_input_cols:]

In [15]:
def get_agg_features(input_df, dataType = 'train'):
    """Per-breath max/mean of ``u_in`` and each row's distance to them.

    Aggregates are computed over the rows with ``0 <= time_step <= 3.0`` and
    left-merged back on ``breath_id``, so a breath with no row inside that
    window gets NaN features.

    Returns the four new columns ``u_in_amax``, ``u_in_mean``,
    ``u_in_diffmax`` and ``u_in_diffmean`` on a fresh RangeIndex (a side effect
    of ``pd.merge``). ``dataType`` is unused.
    """
    output_df = input_df.copy()
    c_num = input_df.shape[1]

    def get_agg_window(start_time=0, end_time=3.0, add_suffix = False):
        """Aggregate ``u_in`` per breath over ``start_time <= time_step <= end_time``."""
        in_window = output_df['time_step'].between(start_time, end_time)
        # Named aggregation fixes the output column names. Passing the callables
        # [np.max, np.mean] yields 'amax' or 'max' depending on the installed
        # NumPy/pandas versions, which broke the 'u_in_amax' lookup below.
        df_feature = output_df[in_window].groupby('breath_id').agg(
            u_in_amax=('u_in', 'max'),
            u_in_mean=('u_in', 'mean'),
        )

        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(start_time) + '_' + str(end_time))

        return df_feature

    df_agg_feature = get_agg_window().reset_index()

    output_df = pd.merge(output_df, df_agg_feature, how='left', on='breath_id')

    output_df['u_in_diffmax'] = output_df['u_in_amax'] - output_df['u_in']
    output_df['u_in_diffmean'] = output_df['u_in_mean'] - output_df['u_in']

    return output_df.iloc[:, c_num:]

In [16]:
def get_half_features(input_df, dataType = 'train'):
    """Per-breath mean of ``u_in`` with the ``u_out == 1`` rows zeroed out.

    ``u_in`` is multiplied by ``1 - u_out`` before averaging over *all* rows of
    the breath; the mean is then broadcast back to every row. Returns a
    one-column DataFrame ``u_in_half_mean`` sharing ``input_df``'s index.
    ``dataType`` is unused.
    """
    # 1 - u_out inverts the flag, so only rows with u_out == 0 keep their u_in.
    masked_u_in = (1 - input_df['u_out']) * input_df['u_in']
    mean_by_breath = masked_u_in.groupby(input_df['breath_id']).mean()
    return input_df['breath_id'].map(mean_by_breath).to_frame('u_in_half_mean')

In [17]:
def lowpass_filter(series, b, a):
    """Filter ``series`` with ``scipy.signal.filtfilt`` using coefficients ``b``/``a``."""
    filtered = signal.filtfilt(b, a, series)
    return filtered

In [18]:
def get_filter_features(input_df, dataType = 'train'):
    """Low-pass filtered ``u_in`` per breath plus its per-breath running sum.

    A Butterworth low-pass filter is designed with ``signal.buttord`` /
    ``signal.butter`` and applied to each breath's ``u_in`` via
    ``lowpass_filter``. The per-breath arrays are exploded back to one row per
    sample and cast to float.

    Returns every column of the exploded frame except the first, i.e.
    ``u_in_filter`` and ``u_in_filter_cumsum``.
    NOTE(review): the first (dropped) column is presumably ``breath_id`` -- the
    exact layout after ``groupby(...).apply(...).reset_index()`` depends on the
    pandas version; verify.
    NOTE(review): only rows with ``0 <= time_step <= 3.0`` are filtered, so the
    result has fewer rows than ``input_df`` if any row falls outside.
    ``dataType`` and the local ``c_num`` are unused.
    """
    
    output_df = copy.deepcopy(input_df)
    c_num = input_df.shape[1]
    
    fp = 5 # passband edge frequency [Hz]
    fs = 10 # stopband edge frequency [Hz]
    gpass = 3 # maximum loss at the passband edge [dB]
    gstop = 40 # minimum attenuation at the stopband edge [dB]
    # NOTE(review): assumes the signal is sampled at 100 Hz -- confirm against the data.
    samplerate = 100

    fn = samplerate / 2   #Nyquist frequency
    wp = fp / fn  #passband edge normalised by the Nyquist frequency
    ws = fs / fn  #stopband edge normalised by the Nyquist frequency
    N, Wn = signal.buttord(wp, ws, gpass, gstop)  #filter order and normalised Butterworth frequency
    b, a = signal.butter(N, Wn, "low")            #numerator / denominator of the filter transfer function
    
    def get_agg_window(start_time=0, end_time=3.0, add_suffix = False):
        # Filter u_in per breath over the given time window; `add_suffix` is
        # accepted but not used here.
        
        df_tgt = output_df[(output_df['time_step'] >= start_time) & (output_df['time_step'] <= end_time)]
        df_feature = df_tgt.groupby(['breath_id'])['u_in'].apply(lowpass_filter, b=b, a=a)
        df_feature.name = 'u_in_filter'
                    
        return df_feature
    
    df_agg_feature = get_agg_window().reset_index()
    # Each breath currently holds one array; expand to one row per sample.
    df_agg_feature = df_agg_feature.explode("u_in_filter").reset_index(drop=True)
    df_agg_feature['u_in_filter'] = df_agg_feature['u_in_filter'].astype(float)
        
    df_agg_feature['u_in_filter_cumsum'] = df_agg_feature.groupby('breath_id')['u_in_filter'].cumsum()

    return df_agg_feature.iloc[:, 1:]

In [19]:
def get_vib_features(input_df, dataType = 'train'):
    """Mark where the slope of ``u_in`` changes sign (a measure of "vibration").

    For every row the sign of ``u_in``'s first difference is compared with the
    sign on the previous row; the absolute change is returned (0 = same
    direction, 1 or 2 = direction changed or went flat). The first row of each
    80-row block is forced to 0 because its value would mix two breaths.

    Returns a Series named ``sign_diff`` aligned with ``input_df``.
    ``dataType`` is unused.

    Fixes relative to the previous implementation:
    * the chained ``df[col].fillna(..., inplace=True)`` calls (a no-op under
      pandas copy-on-write) are gone;
    * the zero map for first rows was sized by the number of ``u_out`` 0->1
      transitions instead of the number of first rows, which could leave some
      first rows as NaN; first rows are now zeroed directly.

    NOTE(review): the old code also computed a per-breath total (``diff_vib``)
    but never returned it; that dead computation was dropped. If the per-breath
    total was the intended feature, change the return value instead.
    NOTE(review): row 1 of every breath after the first still depends on the
    last row of the previous breath (the diffs are not grouped by breath).
    """
    # NOTE(review): assumes every breath occupies exactly 80 consecutive rows.
    BREATH_LEN = 80

    slope_sign = np.sign(input_df['u_in'].diff())
    sign_diff = slope_sign.diff()
    sign_diff.iloc[0::BREATH_LEN] = 0

    # Only the magnitude of the change matters when counting inversions.
    return sign_diff.abs().rename('sign_diff')

In [20]:
def add_time_features(out_df, input_df, dataType = 'train'):
    """Append lag/lead and difference features that never cross a breath boundary.

    Parameters
    ----------
    out_df : DataFrame
        Feature frame containing ``u_in_filter``, ``u_in_sqrt``, ``u_in``,
        ``u_out`` and ``time_step``. It is NOT modified; a copy is extended.
    input_df : DataFrame
        Frame providing ``breath_id`` (aligned to ``out_df`` by index).
    dataType : str
        Unused.

    Returns
    -------
    DataFrame
        ``out_df`` plus, for each shift in ``USE_LAG`` (suffix given by
        ``lag_map``): ``<col>_lag_<k>`` / ``<col>_diff_<k>`` for
        ``u_in_filter``, ``u_in_sqrt`` and ``u_in``, and ``u_out_lag_<k>``;
        finally ``time_step_diff_1``. A shifted value taken from a different
        breath is replaced by 0, and remaining NaNs are filled with 0.

    Fixes relative to the previous implementation:
    * ``time_step_lag_1`` is built with ``shift(1)`` but was masked with
      ``breath_id_lag1same``, which is the same-breath mask for shift ``-2``
      (``lag_map[-2] == 1``). It now uses the mask for shift ``1``.
    * the caller's ``out_df`` is no longer mutated in place.
    """
    USE_LAG = [-2, -1, 1, 2, 3, 4]
    lag_map = {-2: 1, -1: 2, 1: 3, 2: 4, 3: 5, 4: 6}

    out_df = out_df.copy()
    out_df['breath_id'] = input_df['breath_id']

    def same_breath(lag):
        """1 where the row `lag` steps away belongs to the same breath, else 0."""
        neighbour_id = out_df['breath_id'].shift(lag).fillna(0)
        return (neighbour_id == out_df['breath_id']).astype(int)

    def lagged(col, lag, mask):
        """`col` shifted by `lag`, zeroed where the shift crosses a breath."""
        return out_df[col].shift(lag).fillna(0) * mask

    for lag in USE_LAG:
        k = lag_map[lag]
        mask = same_breath(lag)

        for col in ('u_in_filter', 'u_in_sqrt', 'u_in'):
            out_df[f'{col}_lag_{k}'] = lagged(col, lag, mask)
            out_df[f'{col}_diff_{k}'] = out_df[col] - out_df[f'{col}_lag_{k}']
        out_df[f'u_out_lag_{k}'] = lagged('u_out', lag, mask)

    # Time elapsed since the previous row of the same breath.
    out_df['time_step_diff_1'] = out_df['time_step'] - lagged('time_step', 1, same_breath(1))

    return out_df.drop(columns='breath_id').fillna(0)

In [21]:
def get_oof_features(input_df, dataType = 'train'):
    """Unfinished draft for attaching OOF predictions as ``pred_<i>`` columns.

    NOTE(review): this function cannot run as written. It reads the names
    ``oof`` and ``_oof``, neither of which is defined in the visible notebook,
    and it returns ``output_df['sign_diff']``, a column it never creates.
    It is not listed among the processors in ``to_feature``;
    ``add_oof_features`` appears to be the working replacement. Consider
    deleting this function.
    """
    
    output_df = copy.deepcopy(input_df)
    # NOTE(review): c_num is assigned but never used.
    c_num = input_df.shape[1]
    
    for i in range(len(pred_cols)):
        # Default every row to 0, then fill the u_out == 0 rows from `_oof`.
        output_df[f"pred_{i}"] = 0.
        output_df.loc[oof["u_out"] == 0, f"pred_{i}"] = _oof[f"oof{i}"].values
    

    
    return output_df['sign_diff']

In [22]:
def to_feature(input_df, dataType = 'train'):
    """Convert ``input_df`` into a feature matrix and return it as a new DataFrame.

    Each enabled processor is called as ``processor(input_df, dataType)`` and
    timed with ``Timer``; the results are concatenated column-wise and the
    columns are finally sorted by name.
    """

    # Feature builders to run; commented-out entries are currently disabled.
    processors = [
        get_raw_features,
#         get_simple_calc_features,
#         get_agg_features,
#         get_vib_features,
#         get_half_features,
        get_category_features,
#         get_filter_features,
    ]

    features = pd.DataFrame()

    for processor in tqdm(processors, total=len(processors)):
        with Timer(prefix=f'{processor.__name__} '):
            block = processor(input_df, dataType)

        # Row counts must match; a mismatch means the processor is implemented wrongly.
        assert len(block) == len(input_df), processor.__name__
        features = pd.concat([features, block], axis=1)
#     features = utils.reduce_mem_usage(features)
#     features = add_time_features(features, input_df)

    return features[sorted(features.columns)]

In [23]:
# Build the base feature matrices for both splits (train first, then test).
train_df, test_df = (
    to_feature(train, dataType = 'train'),
    to_feature(test, dataType = 'test'),
)

  0%|          | 0/2 [00:00<?, ?it/s]

get_raw_features  0.028[s]


100%|██████████| 2/2 [00:02<00:00,  1.23s/it]
  0%|          | 0/2 [00:00<?, ?it/s]

get_category_features  2.374[s]
get_raw_features  0.018[s]


100%|██████████| 2/2 [00:01<00:00,  1.28it/s]

get_category_features  1.499[s]





In [24]:
# Names of the per-model prediction columns: one per column of `oofs`.
pred_cols = [f"pred_{i}" for i in range(oofs.shape[1])]

In [28]:
def add_oof_features(input_df, dataType = 'train'):
    """Attach the team predictions and derived stacking features.

    Reads the notebook-level globals ``train``/``oofs`` (``dataType ==
    'train'``) or ``test``/``subs`` (any other value) and ``pred_cols``.

    Steps
    -----
    1. Add one column per entry of ``pred_cols``. For train, ``oofs`` holds one
       row per ``u_out == 0`` row and is written only there; for test, ``subs``
       holds one row per input row.
    2. Keep only the ``u_out == 0`` rows and reset the index.
    3. Add row-wise statistics over the prediction columns (mean, median, std,
       max, min, their pairwise gaps, kurtosis).
    4. For every prediction column and for ``u_in``: the values 1-4 rows
       earlier within the same breath (``*_past_k``, NaN where unavailable)
       and the difference to them (``*_diff_k``); plus ``u_in_cumsum``.

    Parameters
    ----------
    input_df : DataFrame
        Base feature frame indexed like ``train`` / ``test``; must contain
        ``u_in``. It is not modified.
    dataType : str
        ``'train'`` selects the train globals, anything else the test globals.

    Returns
    -------
    DataFrame
        New frame restricted to the ``u_out == 0`` rows, without ``breath_id``.

    Fixes relative to the previous implementation: the train branch filtered
    the global ``train_df`` rather than the ``input_df`` argument, and the
    caller's frame was mutated in place.
    """
    is_train = dataType == 'train'
    raw_df, team_preds = (train, oofs) if is_train else (test, subs)
    inhale_mask = raw_df["u_out"] == 0

    feats = input_df.copy()
    for col_ in pred_cols:
        if is_train:
            feats[col_] = 0.
            feats.loc[inhale_mask, col_] = team_preds[col_].values
        else:
            feats[col_] = team_preds[col_].values
    feats['breath_id'] = raw_df['breath_id']
    feats = feats.loc[inhale_mask].reset_index(drop=True)

    # v2: row-wise statistics across the individual model predictions.
    preds = feats[pred_cols]
    feats["pred_mean"] = np.mean(preds.values, axis=1)
    feats["pred_median"] = np.median(preds.values, axis=1)

    feats["pred_std"] = preds.std(axis=1)
    feats["pred_max"] = preds.values.max(axis=1)
    feats["pred_min"] = preds.values.min(axis=1)
    feats["pred_max-min"] = feats["pred_max"] - feats["pred_min"]
    feats["pred_max-median"] = feats["pred_max"] - feats["pred_median"]
    feats["pred_max-mean"] = feats["pred_max"] - feats["pred_mean"]
    feats["pred_median-min"] = feats["pred_median"] - feats["pred_min"]
    feats["pred_mean-min"] = feats["pred_mean"] - feats["pred_min"]
    feats["pred_mean-median"] = feats["pred_mean"] - feats["pred_median"]
    feats["pred_kurt"] = preds.kurt(axis=1)

    # Within-breath history: all *_past_k columns first, then all *_diff_k
    # columns, for each prediction column and finally for u_in.
    lag_steps = (1, 2, 3, 4)
    for col_ in list(pred_cols) + ["u_in"]:
        for k in lag_steps:
            feats[f"{col_}_past_{k}"] = feats.groupby("breath_id")[col_].shift(k)
        for k in lag_steps:
            feats[f"{col_}_diff_{k}"] = feats[col_] - feats[f"{col_}_past_{k}"]

    feats["u_in_cumsum"] = feats.groupby("breath_id")["u_in"].cumsum()

    return feats.drop(columns='breath_id')

In [29]:
# Attach the stacking features to both splits (train first, then test).
# NOTE(review): this cell rebinds train_df/test_df to the filtered result, so it
# is not safe to run twice without rebuilding the base features first.
train_df, test_df = (
    add_oof_features(train_df, dataType = 'train'),
    add_oof_features(test_df, dataType = 'test'),
)

In [30]:
# Show both feature frames. A single display() call avoids the stray
# `(None, None)` cell output produced by building a tuple of two display() calls.
display(train_df, test_df)

Unnamed: 0,C,R,R_C,time_step,u_in,pred_0,pred_1,pred_2,pred_3,pred_4,...,pred_28_diff_4,u_in_past_1,u_in_past_2,u_in_past_3,u_in_past_4,u_in_diff_1,u_in_diff_2,u_in_diff_3,u_in_diff_4,u_in_cumsum
0,50,20,5,0.000000,0.083334,5.789477,5.833371,5.831460,5.797319,5.787222,...,,,,,,,,,,0.083334
1,50,20,5,0.033652,18.383041,5.863910,5.938485,5.860644,5.877145,5.887587,...,,0.083334,,,,18.299707,,,,18.466375
2,50,20,5,0.067514,22.509278,8.144781,8.245914,8.177640,8.098116,8.147405,...,,18.383041,0.083334,,,4.126236,22.425944,,,40.975653
3,50,20,5,0.101542,22.808822,12.079843,12.079426,12.082666,12.083275,12.057881,...,,22.509278,18.383041,0.083334,,0.299544,4.425781,22.725488,,63.784476
4,50,20,5,0.135756,25.355850,12.521106,12.523297,12.601661,12.628415,12.538396,...,6.697906,22.808822,22.509278,18.383041,0.083334,2.547028,2.846573,6.972809,25.272516,89.140326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2290963,10,50,6,0.834147,1.869367,29.489279,29.465751,29.463932,29.463486,29.468914,...,1.026603,2.650333,2.438287,3.783043,3.209590,-0.780965,-0.568920,-1.913676,-1.340223,238.890288
2290964,10,50,6,0.867574,2.154414,29.105459,29.107071,29.133446,29.130894,29.119773,...,1.305693,1.869367,2.650333,2.438287,3.783043,0.285047,-0.495918,-0.283873,-1.628629,241.044703
2290965,10,50,6,0.900917,1.304434,29.884453,29.874415,29.894176,29.887880,29.878339,...,0.842991,2.154414,1.869367,2.650333,2.438287,-0.849980,-0.564933,-1.345899,-1.133853,242.349137
2290966,10,50,6,0.934309,1.733830,29.383950,29.389308,29.401897,29.391508,29.391641,...,0.618536,1.304434,2.154414,1.869367,2.650333,0.429396,-0.420585,-0.135538,-0.916503,244.082966


Unnamed: 0,C,R,R_C,time_step,u_in,pred_0,pred_1,pred_2,pred_3,pred_4,...,pred_28_diff_4,u_in_past_1,u_in_past_2,u_in_past_3,u_in_past_4,u_in_diff_1,u_in_diff_2,u_in_diff_3,u_in_diff_4,u_in_cumsum
0,20,5,1,0.000000,0.000000,6.271525,6.274139,6.250293,6.239222,6.234902,...,,,,,,,,,,0.000000
1,20,5,1,0.031904,7.515046,6.002286,6.000910,5.974248,5.962183,5.972803,...,,0.000000,,,,7.515046,,,,7.515046
2,20,5,1,0.063827,14.651675,7.146135,7.145276,7.143232,7.136650,7.127580,...,,7.515046,0.000000,,,7.136630,14.651675,,,22.166721
3,20,5,1,0.095751,21.230610,7.743807,7.765169,7.723122,7.719956,7.751527,...,,14.651675,7.515046,0.000000,,6.578935,13.715564,21.230610,,43.397331
4,20,5,1,0.127644,26.320956,9.226192,9.219263,9.246565,9.242394,9.211786,...,2.991620,21.230610,14.651675,7.515046,0.0,5.090346,11.669281,18.805911,26.320956,69.718287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1527560,10,20,3,0.842145,0.000000,10.097620,10.130657,10.100893,10.102055,10.112495,...,0.026887,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,68.839645
1527561,10,20,3,0.875648,0.000000,10.073868,10.072578,10.091086,10.091650,10.079336,...,-0.014565,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,68.839645
1527562,10,20,3,0.909185,0.121375,9.985326,9.985189,9.982014,9.981457,9.985294,...,-0.142230,0.000000,0.000000,0.000000,0.0,0.121375,0.121375,0.121375,0.121375,68.961019
1527563,10,20,3,0.943148,0.000000,10.056653,10.060103,10.078653,10.077803,10.058278,...,-0.070471,0.121375,0.000000,0.000000,0.0,-0.121375,0.000000,0.000000,0.000000,68.961019


(None, None)

In [31]:
# Split the feature names into the categorical column(s) and the numeric rest.
train_category_col = ['R_C']
train_value_col = [col for col in train_df.columns if col not in train_category_col]

In [32]:
# Fill missing values with 0, then robust-scale the numeric features using
# statistics fitted on the training frame only.
norm_features = train_value_col
train_df, test_df = train_df.fillna(0), test_df.fillna(0)
scaler = RobustScaler().fit(train_df[norm_features])
train_df[norm_features] = scaler.transform(train_df[norm_features].to_numpy())
test_df[norm_features] = scaler.transform(test_df[norm_features].to_numpy())

In [33]:
# Target: pressure on the u_out == 0 rows, re-indexed from 0.
y = train.loc[train["u_out"] == 0, 'pressure'].reset_index(drop=True)

In [34]:
# Append the identifier columns (and the target for train) of the u_out == 0
# rows; both sides carry a fresh 0..n-1 index, so the concat pairs rows by position.
train_keys = train[train["u_out"] == 0][['id', 'breath_id', 'pressure']].reset_index(drop=True)
test_keys = test[test["u_out"] == 0][['id', 'breath_id']].reset_index(drop=True)
train_df = pd.concat([train_df, train_keys], axis=1)
test_df = pd.concat([test_df, test_keys], axis=1)

In [35]:
# ---- Cross-validated RidgeCV stacker -------------------------------------
oof_prediction = np.zeros(len(train_df))
test_preds_lst = []
input_dim = len(train_value_col)
train_df['pred'] = 0
# One row per breath with its R_C class, used to stratify the breath-level split.
train_gby = train_df.groupby('breath_id')['R_C'].agg('first').reset_index()
skf = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
models = []

# Fold assignment per training row. It is built from train_df (the u_out == 0
# rows) so that ids, fold labels and feature rows share one index. It was
# previously built from the unfiltered `train`, which paired fold labels with
# the wrong ids and left the surplus rows at -1.
fold_df = pd.DataFrame({"id": train_df["id"], "fold": -1})

for fold, (_, valid_idx) in enumerate(skf.split(train_gby, train_gby['R_C'])):
    valid_b_ids = train_gby.iloc[valid_idx]['breath_id'].values
    fold_df.loc[train_df['breath_id'].isin(valid_b_ids), 'fold'] = fold
assert (fold_df["fold"] >= 0).all(), "every training row must be assigned to a fold"

for fold in range(CFG.n_folds):
    if fold not in CFG.folds:
        continue
    print(f'Fold-{fold}')

    is_valid = fold_df["fold"] == fold
    train_idx = fold_df.index[~is_valid]
    valid_idx = fold_df.index[is_valid]

    trn_df = train_df.loc[~is_valid, train_value_col].reset_index(drop=True)
    val_df = train_df.loc[is_valid, train_value_col].reset_index(drop=True)
    trn_y = train_df.loc[~is_valid, 'pressure'].reset_index(drop=True)
    val_y = train_df.loc[is_valid, 'pressure'].reset_index(drop=True)

    model = RidgeCV()
    model.fit(trn_df, trn_y)

    models.append(model)

    # train_df carries a 0..n-1 index, so the index labels double as positions
    # into the oof_prediction array.
    oof_prediction[valid_idx] = model.predict(val_df)
    test_pred = model.predict(test_df[train_value_col])
    test_preds_lst.append(test_pred)
    # Mean absolute error on the held-out fold.
    score = np.abs(val_y.values - oof_prediction[valid_idx]).mean()
    print(f'fold = {fold}, score = {score}')

Fold-0
fold = 0, score = 0.11920521722333165
Fold-1
fold = 1, score = 0.1225320108155299
Fold-2
fold = 2, score = 0.11753776290238384
Fold-3
fold = 3, score = 0.12153904564502799
Fold-4
fold = 4, score = 0.12040505608709785
Fold-5
fold = 5, score = 0.12037001073511287
Fold-6
fold = 6, score = 0.11956284327451377
Fold-7
fold = 7, score = 0.12170841316983051
Fold-8
fold = 8, score = 0.11947136570926889
Fold-9
fold = 9, score = 0.12011013656620144


In [36]:
# Overall out-of-fold mean absolute error.
CV = np.mean(np.abs(y.to_numpy() - oof_prediction))
CV

0.12024415589578497

In [61]:
# Persist the out-of-fold predictions (np.save appends the ".npy" extension).
# Create the output directory first so a fresh checkout does not fail here.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
np.save(OUTPUT_DIR / f"stacking2_oof_{CFG.exp_num}", oof_prediction)

In [58]:
# Distinct target values seen in training, plus a sorted copy and its length
# for the nearest-value lookup in find_nearest.
unique_pressures = train["pressure"].unique()
sorted_pressures = np.sort(unique_pressures)
total_pressures_len = int(sorted_pressures.size)

def find_nearest(prediction):
    """Snap ``prediction`` to the closest entry of the global ``sorted_pressures``.

    Values beyond either end of the grid are clamped to the corresponding
    extreme. When the two neighbours are equally distant, the upper one wins.
    """
    pos = np.searchsorted(sorted_pressures, prediction)
    if pos == total_pressures_len:
        # Above every known pressure: clamp to the maximum.
        return sorted_pressures[-1]
    if pos == 0:
        # At or below the smallest known pressure: clamp to the minimum.
        return sorted_pressures[0]

    below, above = sorted_pressures[pos - 1], sorted_pressures[pos]
    if abs(below - prediction) < abs(above - prediction):
        return below
    return above

In [59]:
# Post-processed CV score: snap every OOF prediction to the nearest pressure
# value seen in training. This uses find_nearest -- the same routine applied to
# the submissions below -- so the CV measures the post-processing that is
# actually shipped, and it replaces a per-row scan over all unique pressures
# with a binary search.
oof = pd.DataFrame({'pred': oof_prediction})
oof_pp = oof['pred'].map(find_nearest)
score = np.abs(y.values - oof_pp).mean()
print(score)

0.11979720423723357


In [60]:
# ---- Write the submissions ------------------------------------------------
# For each way of blending the per-fold test predictions (mean / median) two
# files are written: the raw blend and a post-processed ("pp") version in which
# every predicted row is snapped to the nearest pressure seen in training.
# Post Processing: https://www.kaggle.com/snnclsr/a-dummy-approach-to-improve-your-score-postprocess
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

sample_sub = pd.read_csv(DATA_DIR / "sample_submission.csv")
# Make the target column float up front so that writing float predictions into
# it does not rely on implicit dtype upcasting.
sample_sub['pressure'] = sample_sub['pressure'].astype(float)

# Predictions exist only for the u_out == 0 rows; all other rows keep the
# sample-submission value.
inhale_mask = test['u_out'] == 0
fold_preds = np.stack(test_preds_lst)
blends = {
    'mean': fold_preds.mean(0),
    'median': np.median(fold_preds, axis=0),
}

for blend_name, blended in blends.items():
    sub_df = sample_sub.copy()
    sub_df.loc[inhale_mask, 'pressure'] = blended
    sub_df.to_csv(OUTPUT_DIR / f"stacking2_submission_{blend_name}_{CFG.exp_num}.csv", index=None)

    # Snap only the predicted rows; the remaining rows are not model output.
    sub_df.loc[inhale_mask, 'pressure'] = sub_df.loc[inhale_mask, 'pressure'].map(find_nearest)
    sub_df.to_csv(OUTPUT_DIR / f"stacking2_submission_{blend_name}_pp_{CFG.exp_num}.csv", index=None)