In [None]:
!pip install lofo-importance -q

In [None]:
import numpy as np
import pandas as pd
#from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.special import gamma
from tqdm import tqdm

# Directory that holds the competition CSV files.
PATH = "../input/ventilator-pressure-prediction"

# Read the training table, report its (rows, columns) and preview the first rows.
df = pd.read_csv(PATH + "/train.csv")
print(df.shape)
df.head()

In [None]:
# Label every row with the number of the fixed-size block of consecutive index
# values it falls into (index / size, truncated to an integer).  The column
# name is the block size written as a string.
for bucket_size in (400, 8000):
    df[str(bucket_size)] = (df.index.to_series() / bucket_size).astype(int)

In [None]:
def realized_quarticity(series):
    """Return ``n / 3`` times the sum of fourth powers of ``series``.

    ``n`` is ``series.shape[0]``, i.e. the number of entries in the input.
    """
    n_obs = series.shape[0]
    fourth_power_total = np.sum(series ** 4)
    return fourth_power_total * n_obs / 3

def realized_quadpower_quarticity(series):
    """Return ``n * pi**2 / 4`` times the sum of |products of 4 consecutive values|.

    ``series`` must be a pandas Series (``.rolling`` is used).  The first three
    rolling windows are incomplete and yield NaN, which the sum skips.
    ``n`` is the length of the input.
    """
    window_products = series.rolling(window=4).apply(np.prod, raw=True)
    abs_products = abs(window_products)
    n_obs = abs_products.shape[0]
    return (np.sum(abs_products) * n_obs * (np.pi ** 2)) / 4

def realized_tripower_quarticity(series):
    """Return the tri-power quarticity of ``series``.

    Computes ``n * 0.25 * gamma(1/2)**3 / gamma(7/6)**3`` times the sum, over
    every window of 3 consecutive values, of the product of ``|x| ** (4/3)``.

    Parameters
    ----------
    series : pandas.Series
        Input values; may contain negative numbers.

    Returns
    -------
    float
        The scaled sum.  The first two (incomplete) windows are NaN and are
        skipped by the sum.
    """
    # Take the absolute value BEFORE the fractional power: a negative base
    # raised to 4/3 is NaN in floating point, which would silently wipe out
    # every window containing a negative value.
    powered = abs(series) ** (4 / 3)
    triple_products = powered.rolling(window=3).apply(np.prod, raw=True)
    scale = 0.25 * ((gamma(1 / 2) ** 3) / (gamma(7 / 6) ** 3))
    return series.shape[0] * scale * np.sum(triple_products)

def realized_1(series):
    """Return ``sqrt(sum(x**4) / (6 * sum(x**2)))`` for the values in ``series``."""
    fourth_power_total = np.sum(series ** 4)
    squared_total = np.sum(series ** 2)
    return np.sqrt(fourth_power_total / (6 * squared_total))

def realized_2(series):
    """Return ``sqrt(pi**2 * sum(|4-term rolling products|) / (8 * sum(x**2)))``.

    ``series`` must be a pandas Series; the NaN results of the first three
    incomplete rolling windows are skipped by the sum.
    """
    quad_products = series.rolling(window=4).apply(np.prod, raw=True)
    numerator = (np.pi ** 2) * np.sum(abs(quad_products))
    denominator = 8 * np.sum(series ** 2)
    return np.sqrt(numerator / denominator)

def realized_3(series):
    """Return ``sqrt(gamma(1/2)**3 * S / (8 * gamma(7/6)**3 * sum(x**2)))``.

    ``S`` is the sum, over every window of 3 consecutive values, of the
    product of ``|x| ** (4/3)``.

    Parameters
    ----------
    series : pandas.Series
        Input values; the absolute value is taken before the fractional power,
        so negative entries are handled.

    Returns
    -------
    float
    """
    powered = abs(series) ** (4 / 3)
    # raw=True hands each window to np.prod as a plain ndarray, matching the
    # other rolling helpers in this notebook and avoiding the cost of building
    # a Series per window; the numeric result is the same.
    triple_products = powered.rolling(window=3).apply(np.prod, raw=True)
    numerator = (gamma(1 / 2) ** 3) * np.sum(triple_products)
    denominator = 8 * (gamma(7 / 6) ** 3) * np.sum(series ** 2)
    return np.sqrt(numerator / denominator)

def bvar(series):
    """Return the realized bipower variation of ``series``.

    Computes ``(pi / 2) * sum(|x_i| * |x_{i-1}|)`` over consecutive pairs.

    Parameters
    ----------
    series : pandas.Series
        Input values.

    Returns
    -------
    float
        The scaled sum of adjacent absolute products.  The first rolling
        window is incomplete (NaN) and is skipped by the sum.
    """
    adjacent_products = abs(series).rolling(window=2).apply(np.prod, raw=True)
    # Bipower variation is scaled by mu_1**-2 with mu_1 = E|Z| = sqrt(2/pi),
    # i.e. by pi/2.  The previous factor (2/pi)**-2 = pi**2/4 is mu_1**-4,
    # which is the quad-power constant, not the bipower one.
    return np.sum(adjacent_products) * (np.pi / 2)

In [None]:
def engineer_features(df):
    """Add engineered feature columns to ``df`` and return it.

    The columns are assigned directly on ``df`` (the frame is modified in
    place); the same object is returned so the call can be written as
    ``df = engineer_features(df)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``breath_id``, ``time_step``, ``u_in``, ``u_out``, ``R``
        and ``C``.  The row-bucket columns ``'400'`` and ``'8000'`` are used
        as-is when present; when missing they are derived from the index
        (index / size, truncated to int) so the function also works on a frame
        that has not been through the bucket cell.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself with the new columns.

    Notes
    -----
    * ``u_in_lag*`` and ``time_passed`` are NaN on the first row(s) of each
      ``breath_id`` group.
    * ``area_w5_prod`` rolls over consecutive rows of the whole frame, not per
      breath.  NOTE(review): confirm that crossing breath boundaries is
      intended.
    * Commented-out lines are disabled experiments kept for reference.
    """
    # Make the function self-contained: create the row-bucket columns if the
    # caller has not already done so.
    for bucket_size in (400, 8000):
        bucket_col = str(bucket_size)
        if bucket_col not in df.columns:
            df[bucket_col] = (df.index.to_series() / bucket_size).astype(int)

    # --- "area": running sum of time_step * u_in within each breath ---------
    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()
    df["area_max"] = df.groupby("breath_id")["area"].transform("max")
    #df['area_diff'] = df.area.diff()
    df['area_w5_prod'] = abs(df.area).rolling(window=5).apply(np.prod, raw=True)
    #df['area_bvar'] = df.groupby("breath_id")["area"].transform(bvar)
    #df['area_w40_prod'] = abs(df.area).rolling(window=40).apply(np.prod, raw=True)
    #df['area_w5_std'] = abs(df.area).rolling(window=5).apply(np.std, raw=True)
    
    # --- u_in: per-breath aggregates, lags and cumulative sums --------------
    #df["u_in_std"] = df.groupby("breath_id")["u_in"].transform("std")
    #df["u_in_var"] = df.groupby("breath_id")["u_in"].transform("var")
    #df["u_in_min"] = df.groupby("breath_id")["u_in"].transform("min")
    df["u_in_max"] = df.groupby("breath_id")["u_in"].transform("max")
    df["u_in_sum"] = df.groupby("breath_id")["u_in"].transform("sum")
    df['u_in_bvar'] = df.groupby("breath_id")["u_in"].transform(bvar)
    df["u_in_lag1"] = df.groupby("breath_id")["u_in"].shift(1)
    df["u_in_lag2"] = df.groupby("breath_id")["u_in"].shift(2)
    df["u_in_lag3"] = df.groupby("breath_id")["u_in"].shift(3)
    #df["u_in_400_std"] = df.groupby("400")["u_in"].transform("std")
    #df['u_in_diff2'] = df['u_in'] - df['u_in_lag2']
    #df['u_in_diff3'] = df['u_in'] - df['u_in_lag3']
    #df["u_in_lag40"] = df.groupby("breath_id")["u_in"].shift(40)
    #df["u_in_lag48"] = df.groupby("breath_id")["u_in"].shift(48)
    #df["u_in_lag80"] = df.groupby("breath_id")["u_in"].shift(80)
    #df["u_in_diff"] = df["u_in"].diff()
    df["u_in_cumsum"] = df.groupby("breath_id")["u_in"].cumsum()
    # Amount of u_in still to come in the breath after the current row.
    df["u_in_cumsum_reverse"] = df["u_in_sum"] - df["u_in_cumsum"]
    #df["u_in_quarticity"] = df.groupby("breath_id")["u_in"].transform(realized_quarticity)
    #df["u_in_w10_realized1"] = (df.u_in).rolling(window=10).apply(realized_1)
    
    # --- u_out: per-breath total and cumulative sums ------------------------
    df["u_out_sum"] = df.groupby("breath_id")["u_out"].transform("sum")
    #df['u_out_bvar'] = df.groupby("breath_id")["u_out"].transform(bvar)
    #df["u_out_max"] = df.groupby("breath_id")["u_out"].transform("max")
    df["u_out_cumsum"] = df.groupby("breath_id")["u_out"].cumsum()
    df["u_out_cumsum_reverse"] = df["u_out_sum"] - df["u_out_cumsum"]
    #df['u_out_w5_prod'] = abs(df.u_out).rolling(window=5).apply(np.prod, raw=True)
    #df['u_out_w40_std'] = df.u_out.rolling(window=40).apply(np.std, raw=True)
    #df['u_out_lag'] = df.groupby('breath_id')['u_out'].shift(1)
    
    # Difference between consecutive time_step values inside a breath.
    df["time_passed"] = df.groupby("breath_id")["time_step"].diff()
    
    #df['cross']= df['u_in']*df['u_out']
    #df['cross2']= df['time_step']*df['u_out']
    
    # --- R / C combinations -------------------------------------------------
    #df['R_lag40']  = df.groupby("breath_id")["R"].shift(40)
    #df['R_lag48']  = df.groupby("breath_id")["R"].shift(48)
    #df['R_lag80']  = df.groupby("breath_id")["R"].shift(80)
    #df['R_lag96']  = df.groupby("breath_id")["R"].shift(96)
    df['RC'] = df["R"] + df["C"]
    df['R-C']= df.R - df.C
    #df["RC_400_std"] = df.groupby("400")["RC"].transform("std")
    # Means over the fixed-size row buckets rather than over breaths.
    df["RC_400_mean"] = df.groupby("400")["RC"].transform("mean")
    #df['R-C_400_bvar']= df.groupby("400")['R-C'].transform(bvar)
    df['R-C_8000_mean']= df.groupby("8000")['R-C'].transform("mean")
    # String label of the (R, C) pair, e.g. "<R>__<C>".
    df['R__C'] = df["R"].astype(str) + '__' + df["C"].astype(str)
    
    #df['R__C_40'] = df['R__C'].shift(40)
    #df['R_C_w5_sum'] = df.R.rolling(window=5).apply(np.sum, raw=True)+df.C.rolling(window=5).apply(np.sum, raw=True)
    #df['R_C_w40_sum'] = df.R.rolling(window=40).apply(np.sum, raw=True)+df.C.rolling(window=40).apply(np.sum, raw=True)
    #df['R_w20_r3'] = df.R.rolling(window=20).apply(realized_3)
    #df['RC_w20_sum'] = (df.R+df.C).rolling(window=20).apply(np.sum)
    #df['RC_w120_max'] = (df.R+df.C).rolling(window=20).apply(np.max)
    #df['RC_w200_std'] = (df.R+df.C).rolling(window=200).apply(np.std)
    #df['RC_w400_rq'] = (df.R+df.C).rolling(window=400).apply(realized_quarticity)
    return df
    
# Add the engineered columns to the training frame; the function assigns them on `df` itself and returns it.
df = engineer_features(df)

In [None]:
# Position of each row inside a repeating cycle of 40 rows (0..39), stored as a
# string.  The modulo form yields the same values as repeating range(40), but
# does not build a Python list of len(df) items and does not fail when the row
# count is not an exact multiple of 40.
# NOTE(review): confirm that 40 is the intended cycle length for this data.
df['ind'] = np.arange(len(df)) % 40
df['ind'] = df['ind'].astype(str)
#train_p = df.pivot_table(index='breath_id',columns='ind', values=['pressure'], aggfunc='mean').corr()
#corr = train_p.corr()

#ids = corr.index

#kmeans = KMeans(n_clusters=5, random_state=0).fit(corr.values)
#d = pd.DataFrame({'ind':[x for x in range(40)], 'kmeans':kmeans.labels_.astype(str)})
#df = df.merge(d, how='left')

In [None]:
# Columns removed before the importance analysis; every column left in X is
# treated as a candidate feature.
# NOTE(review): '400' is dropped but '8000' is not - confirm that is intended.
excluded_columns = [
    'pressure', 'id', 'breath_id', 'u_out', 'R', 'C', 'area', "u_in",
    'time_step', '400', 'RC', 'u_in_sum', "R-C", 'u_out_sum',
]
X = df.drop(columns=excluded_columns)

In [None]:
# Display the names of the columns that remain as candidate features.
X.columns

In [None]:
from lofo import Dataset, LOFOImportance, plot_importance
from sklearn.model_selection import TimeSeriesSplit

# Every column kept in X is evaluated as a feature.
features = X.columns

# Materialise the 4 TimeSeriesSplit folds as a list of (train, test) index
# pairs so the same folds can be handed to LOFO.
splitter = TimeSeriesSplit(n_splits=4)
cv = list(splitter.split(X, df["pressure"]))

# LOFO dataset built on the full frame, with "pressure" as the target.
ds = Dataset(
    df,
    target="pressure",
    features=features,
    feature_groups=None,
    auto_group_threshold=1,
)

In [None]:
# Score each feature by leaving it out and re-evaluating on the prepared
# folds, using negative mean absolute error as the metric.
lofo_imp = LOFOImportance(
    ds,
    cv=cv,
    scoring="neg_mean_absolute_error",
)

# Compute the importance table and display it as the cell output.
importance_df = lofo_imp.get_importance()
importance_df

In [None]:
# Plot the LOFO importance table computed in the previous cell.
plot_importance(importance_df, figsize=(8, 8))