In [None]:
from IPython.core.display import display, HTML

import pandas as pd
import numpy as np
import pandas as pd
import os
import gc
import joblib
from joblib import Parallel, delayed
from sklearn.model_selection import KFold, StratifiedKFold, KFold
from sklearn.preprocessing import MinMaxScaler,QuantileTransformer,RobustScaler, normalize
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss,mean_squared_error,r2_score,mean_absolute_error
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy.matlib
from catboost import Pool, CatBoostRegressor
import catboost as cb
import xgboost as xgb
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the train/test tables and the sample submission from the relative
# `../input/ventilator-pressure-prediction/` directory.
df_train = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')
df_test = pd.read_csv('../input/ventilator-pressure-prediction/test.csv')
# NOTE(review): `submission` is not referenced again in the cells visible here — confirm it is still needed.
submission = pd.read_csv('../input/ventilator-pressure-prediction/sample_submission.csv')

# 特征描述
* id
* breath_id
* R
* C
* time_step
* u_in
* u_out
* pressure

In [None]:
# Preview the first rows of the training table.
df_train.head()

In [None]:
# Preview the first rows of the test table.
df_test.head()

In [None]:
def _annotated_countplot(data, column, position, title = None):
    """Draw a count plot of `column` in cell `position` of a 2x2 subplot grid.

    Every bar is annotated twice: its raw height (white text, offset below the
    bar top) and its share of the summed bar heights in percent (black text,
    offset above the bar top).

    Args:
        data: DataFrame passed to `sns.countplot`.
        column: name of the column to count.
        position: 1-based index of the subplot inside the 2x2 grid.
        title: optional panel title; no title is set when None.

    Returns:
        The matplotlib Axes returned by `sns.countplot`.
    """
    plt.subplot(2, 2, position)
    if title is not None:
        plt.title(title, y = 1.2, size = 25, fontname = 'monospace', color = 'black')
    ax = sns.countplot(x = column, data = data, palette = ['#488a99', '#dbae58', '#4b585c'])
    plt.ylabel('')
    plt.xlabel('')
    plt.xticks(fontname = 'monospace', size = 12)
    plt.yticks([])
    for side in ['right', 'top']:
        ax.spines[side].set_visible(False)
    for side in ['bottom', 'left']:
        ax.spines[side].set_linewidth(1.2)

    # Denominator for the percentage labels: total height of all bars in this panel.
    total = sum(p.get_height() for p in ax.patches)

    for p in ax.patches:
        height = p.get_height()
        bar_top = (p.get_x() + p.get_width() / 2, height)
        ax.annotate(f'{height}', bar_top,
                    ha = 'center', va = 'center',
                    size = 13,
                    xytext = (1, -15),
                    textcoords = 'offset points',
                    fontname = 'monospace', color = 'white')
        ax.annotate(f'{round((height/total) * 100, 1)}%', bar_top,
                    ha = 'center', va = 'center',
                    size = 15,
                    xytext = (1, 13),
                    textcoords = 'offset points',
                    fontname = 'monospace', color = 'black')
    return ax


fig = plt.figure(figsize = (13, 8))
rc = ['R', 'C']

# Top row: training data (panels 1-2, titled with the column name).
for position, column in enumerate(rc, start = 1):
    _annotated_countplot(df_train, column, position, title = column)

# Bottom row: test data (panels 3-4, untitled).
for position, column in enumerate(rc, start = 3):
    _annotated_countplot(df_test, column, position)

plt.figtext(0.15, 1.1, 'Distribution of lung attributes (R/C)', fontname = 'monospace', size = 30, color = 'black')
plt.figtext(1.03, 0.15, 'TEST', fontname = 'monospace', size = 25, color = 'black', rotation = 90)
plt.figtext(1.03, 0.7, 'TRAIN', fontname = 'monospace', size = 25, color = 'black', rotation = 90)

fig.tight_layout(h_pad = 10)
plt.show()

In [None]:
fig = plt.figure(figsize = (15, 15))
# `r` and `c` list the R and C values to iterate over; `plot` is the running
# 1-based subplot index inside the 3x3 grid.
r, c, plot = [5, 20, 50], [10, 20, 50], 1
for rr in r:
    for cc in c:
        # First breath found with this (R, C) combination; the column is
        # selected by name rather than by position.
        br_id = df_train.query('R == @rr & C == @cc')['breath_id'].iloc[0]
        # Select the rows of that breath once and reuse them for all three lines.
        breath = df_train.query("breath_id == @br_id")

        plt.subplot(3, 3, plot)
        plt.title(f'breath id = {br_id} | R = {rr} | C = {cc}', fontname = 'monospace', size = 14)
        a = sns.lineplot(data = breath, x = "time_step", y = "u_in", color = '#4b585c', linewidth = 2)
        sns.lineplot(data = breath, x = "time_step", y = "u_out", color = '#dbae58', linewidth = 2)
        sns.lineplot(data = breath, x = "time_step", y = "pressure", color = '#488a99', linewidth = 2)
        plt.ylabel('')
        plt.xlabel('time stemp', size = 14, fontname = 'monospace', labelpad = 10)
        plt.xticks(size = 12, fontname = 'monospace')
        plt.yticks(size = 12, fontname = 'monospace')

        for j in ['right', 'top']:
            a.spines[j].set_visible(False)
        for j in ['bottom', 'left']:
            a.spines[j].set_linewidth(1.2)

        plot += 1

# Figure-level heading plus a colour-coded legend for the three plotted series.
plt.figtext(0.01, 1.08, 'Observations on breaths with all possible lung attributes', fontname = 'monospace', size = 30, color = 'black')
plt.figtext(0.35, 1.03, 'u_in', fontname = 'monospace', size = 27, color = '#4b585c')
plt.figtext(0.45, 1.03, 'u_out', fontname = 'monospace', size = 27, color = '#dbae58')
plt.figtext(0.55, 1.03, 'pressure', fontname = 'monospace', size = 27, color = '#488a99')
fig.tight_layout(h_pad = 3)
plt.show()

In [None]:
fig = plt.figure(figsize = (15, 12))
plot = 1

# Pressure values of the rows where both `time_step` and `u_in` are below 1e-6,
# keyed by (R, C). The selection is run once per combination and reused for the
# density plot and for the summary statistics below.
selected_pressure = {}

for rr in r:
    for cc in c:
        pressure = df_train.query('time_step < 0.000001 & u_in < 0.000001 & R == @rr & C == @cc')['pressure']
        selected_pressure[(rr, cc)] = pressure

        plt.subplot(3, 3, plot)
        plt.title(f'R = {rr} | C = {cc}', fontname = 'monospace', size = 15, color = 'black')
        # NOTE(review): newer seaborn releases prefer `fill` over `shade` — confirm
        # the installed version before switching.
        a = sns.kdeplot(pressure, color = '#488a99', shade = True, alpha = 1, linewidth = 1.5, edgecolor = 'black')
        plt.ylabel('')
        plt.xlabel('')
        plt.xticks(size = 12, fontname = 'monospace')
        plt.yticks([])

        for j in ['right', 'top']:
            a.spines[j].set_visible(False)
        for j in ['bottom', 'left']:
            a.spines[j].set_linewidth(1.2)

        plot += 1

# Write min/max/mean/median next to each panel; (x, y) are figure coordinates
# stepped by one third of the figure per grid column / row.
y = 1.27
for rr in r:
    y -= 0.333
    x = -0.315
    for cc in c:
        x += 0.333
        pressure = selected_pressure[(rr, cc)]
        plt.figtext(x, y, f'Min: {round(pressure.min(),2)}', fontname = 'monospace', color = 'black')
        plt.figtext(x, y-0.02, f'Max: {round(pressure.max(),2)}', fontname = 'monospace')
        plt.figtext(x, y-0.04, f'Mean: {round(pressure.mean(),2)}', fontname = 'monospace', color = 'black')
        plt.figtext(x, y-0.06, f'Median: {round(pressure.median(),2)}', fontname = 'monospace', color = 'black')

plt.figtext(0.01, 1.08, 'Distribution of pressure depending on lung attributes', fontname = 'monospace', size = 30, color = 'black')

fig.tight_layout(h_pad = 3)
plt.show()

In [None]:
#df=pd.concat([df_train,df_test],axis=0)

In [None]:
def add_features(df):
    """Return a copy of `df` extended with per-breath features derived from `u_in`.

    Added columns:
      * area            - cumulative sum per breath of `time_step * u_in`
      * u_in_cumsum     - cumulative sum per breath of `u_in`
      * u_in_lag2/4     - `u_in` shifted by 2 / 4 rows within the same breath
                          (0 where the breath has no such earlier row)
      * R_*, C_*        - one-hot columns replacing `R` and `C`
      * ewm_u_in_*      - exponentially weighted mean/std/corr (halflife=10) per breath
      * rolling_10_*    - rolling mean/max/std over up to 10 rows per breath
      * expand_*        - expanding mean/max/std per breath (needs >= 2 rows, else NaN)

    Args:
        df: DataFrame with at least `breath_id`, `time_step`, `u_in`, `R`, `C`.

    Returns:
        A new DataFrame; the input frame is left unmodified. NaNs produced by the
        window statistics are not filled here.
    """
    # Work on a copy so the caller's frame is not altered as a side effect.
    df = df.copy()

    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()

    df['u_in_cumsum'] = (df['u_in']).groupby(df['breath_id']).cumsum()

    # Lag inside each breath: a plain `shift` on the whole column would pull
    # values from the previous breath into the first rows of the next one.
    u_in_by_breath = df.groupby('breath_id')['u_in']
    df['u_in_lag2'] = u_in_by_breath.shift(2).fillna(0)
    df['u_in_lag4'] = u_in_by_breath.shift(4).fillna(0)

    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    # Pin the dummy dtype so the result is numeric regardless of the pandas
    # version's default for `get_dummies`.
    df = pd.get_dummies(df, dtype = np.uint8)

    # `get_dummies` returned a new frame, so regroup before the window features.
    u_in_by_breath = df.groupby('breath_id')['u_in']

    # Grouped window results carry `breath_id` as an extra outer index level;
    # dropping it restores the row index so assignment aligns with `df`.
    ewm = u_in_by_breath.ewm(halflife=10)
    df['ewm_u_in_mean'] = ewm.mean().reset_index(level=0,drop=True)
    df['ewm_u_in_std'] = ewm.std().reset_index(level=0,drop=True)
    # NOTE(review): `.corr()` is called without a second series, which presumably
    # correlates `u_in` with itself — confirm this feature is intended.
    df['ewm_u_in_corr'] = ewm.corr().reset_index(level=0,drop=True)

    rolling = u_in_by_breath.rolling(window=10, min_periods=1)
    df['rolling_10_mean'] = rolling.mean().reset_index(level=0,drop=True)
    df['rolling_10_max'] = rolling.max().reset_index(level=0,drop=True)
    df['rolling_10_std'] = rolling.std().reset_index(level=0,drop=True)

    expanding = u_in_by_breath.expanding(2)
    df['expand_mean'] = expanding.mean().reset_index(level=0,drop=True)
    df['expand_max'] = expanding.max().reset_index(level=0,drop=True)
    df['expand_std'] = expanding.std().reset_index(level=0,drop=True)

    return df

In [None]:
# Replace both tables with their feature-engineered versions (see `add_features`).
# NOTE(review): the names are rebound, so re-running this cell feeds the already
# transformed frames back into `add_features` — run it once per kernel session.
df_train=add_features(df_train)
df_test=add_features(df_test)

In [None]:
# Replace every remaining NaN with 0 in both tables (presumably the undefined
# window statistics left by `add_features` — verify no other column has NaNs).
df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

In [None]:
%%time
def display_importances(feature_importance_df_):
    """Plot the 50 most important features as a horizontal bar chart and save it.

    Args:
        feature_importance_df_: DataFrame with a "feature" and an "importance"
            column.

    The features are ranked by their mean importance; the rows belonging to the
    top 50 are drawn with seaborn and the figure is written to
    'lgbm_importances.png'. Nothing is returned.
    """
    # Rank features by mean importance and keep the names of the first 50.
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance[:50].index

    # Rows of the input that belong to those features, highest importance first.
    is_top = feature_importance_df_.feature.isin(top_features)
    ranked_rows = feature_importance_df_.loc[is_top].sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8,10))
    sns.barplot(x="importance", y="feature", data=ranked_rows)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

In [None]:
%%time
def lgb_model(train, test,n_fold=5):
    """Train a LightGBM regressor with K-fold cross-validation and predict `pressure`.

    Args:
        train: DataFrame containing `id`, `breath_id`, `pressure` and the feature columns.
        test: DataFrame containing `id`, `breath_id` and the same feature columns.
        n_fold: number of cross-validation folds.

    Returns:
        A tuple `(submission_test, feature_importance, metrics)`:
          * submission_test - DataFrame with `id` and the fold-averaged `pressure` prediction
          * feature_importance - DataFrame with `feature` and fold-averaged `importance`
          * metrics - DataFrame with the train/valid MAE of every fold plus an
            "overall" row holding the mean over folds
    """
    # IDs of the test rows, kept for the submission frame
    test_ids=test["id"]
    # Target as a NumPy array so the fold indices select rows by position,
    # independent of the DataFrame's index labels
    y = train['pressure'].to_numpy()
    # Drop identifier / target columns that are not features
    x = train.drop(['id', 'breath_id', 'pressure'], axis = 1)
    x_test = test.drop(['id', 'breath_id'], axis = 1)
    print("x_trian{}".format(x.shape))
    print("x_test{}".format(x_test.shape))
    print("y_train{}".format(y.shape))
    # Feature names, kept before converting to arrays
    feature_names=list(x.columns)
    # Convert to NumPy arrays
    x=np.array(x)
    x_test=np.array(x_test)
    # Fold-averaged feature importances
    feature_importance_values=np.zeros(len(feature_names))
    # Out-of-fold predictions for the training rows
    out_of_fold = np.zeros(x.shape[0])
    # Fold-averaged predictions for the test rows
    test_predictions = np.zeros(x_test.shape[0])
    # NOTE(review): folds are drawn over shuffled rows, so rows of one breath_id
    # can land in both the training and validation part of a fold — consider a
    # grouped split if the validation score should reflect unseen breaths.
    kfold = KFold(n_splits = n_fold, random_state = 10086, shuffle = True)
    # Per-fold validation and training scores
    valid_scores=[]
    train_scores=[]
    # Iterate over the cross-validation folds
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(x)):
        x_train, x_val = x[trn_ind], x[val_ind]
        y_train, y_val = y[trn_ind], y[val_ind]
        model = lgb.LGBMRegressor(n_estimators=10000,
                                  learning_rate=0.1,
                                  max_depth=-1,
                                  max_bin=250,
                                  subsample=0.9,
                                  subsample_freq=3,
                                  min_data_in_leaf=500,
                                  boosting_type='gbdt',
                                  feature_fraction=0.65,
                                  lambda_l1=0.3,
                                  lambda_l2=0.3,
                                  n_jobs=-1,
                                  random_state=10086
        )
        # NOTE(review): `early_stopping_rounds` / `verbose` as fit() arguments
        # depend on the installed LightGBM version (newer releases expect
        # callbacks instead) — confirm before upgrading the library.
        model.fit(x_train,
                  y_train,
                  eval_metric="mae",
                  eval_set=[(x_train,y_train),(x_val,y_val)],
                  eval_names=['train','valid'],
                  categorical_feature="auto",
                  early_stopping_rounds=30,
                  verbose=100
        )

        # Best iteration found by early stopping
        best_iteration=model.best_iteration_

        # Accumulate this fold's share of the feature importances
        feature_importance_values+=model.feature_importances_/kfold.n_splits

        # Accumulate this fold's share of the test prediction
        test_predictions += model.predict(x_test,num_iteration=best_iteration) / kfold.n_splits

        out_of_fold[val_ind]=model.predict(x_val,num_iteration=best_iteration)

        # Record the fold's MAE on its training and validation parts
        train_score=mean_absolute_error(y_train, model.predict(x_train,num_iteration=best_iteration))

        valid_score=mean_absolute_error(y_val, model.predict(x_val,num_iteration=best_iteration))

        print('Fold %2d ,train-score : %.6f,valid-score : %.6f' % (fold + 1, train_score,valid_score))
        train_scores.append(train_score)
        valid_scores.append(valid_score)

    # Submission frame
    submission_test=pd.DataFrame({"id":test_ids,"pressure":test_predictions})
    # Feature importance frame
    feature_importance=pd.DataFrame({"feature":feature_names,"importance":feature_importance_values})
    # "overall" row: mean over all folds (the means are computed before being appended)
    train_scores.append(np.mean(train_scores))
    valid_scores.append(np.mean(valid_scores))
    fold_names=list(range(n_fold))
    fold_names.append("overall")

    # Score summary
    metrics=pd.DataFrame({"fold":fold_names,"train":train_scores,"valid":valid_scores})
    return submission_test,feature_importance,metrics

In [None]:
# Run cross-validated training with the default number of folds and collect the
# test predictions, feature importances and per-fold scores.
submission_test,feature_importance,metrics=lgb_model(df_train,df_test)

In [None]:
# Plot the feature importances (the figure is also saved as 'lgbm_importances.png').
display_importances(feature_importance)

In [None]:
# Display the per-fold train/valid scores returned by `lgb_model`.
metrics

In [None]:
# Write the predictions to CSV without the index column.
# NOTE(review): the file name is spelled 'sumbission.csv' — confirm whether 'submission.csv' was intended.
submission_test.to_csv('sumbission.csv', index=False)