In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn import svm
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mlt
# 前処理
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# XGBoost
import xgboost as xgb

# LightGBM
import lightgbm as lgb

# CatBoost
import catboost as  cb
from catboost import CatBoost, Pool


# 評価指標
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
# Load the competition's training and test tables from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/ventilator-pressure-prediction/train.csv")
test = pd.read_csv("/kaggle/input/ventilator-pressure-prediction/test.csv")

In [None]:
# Print a summary of the training frame (pandas DataFrame.info).
train.info()

In [None]:
# Display the training DataFrame as this cell's output.
train

In [None]:
# Columns fed to the model as inputs; the "pressure" column is the regression target.
# NOTE(review): "id" and "breath_id" look like identifiers rather than physical signals —
# confirm they are intended as features.
features = ["id","breath_id","R","C","time_step","u_in","u_out"]
x_train=train[features]
y_train=train["pressure"]

In [None]:
from sklearn.model_selection import KFold
# 5-fold split with shuffling and a fixed seed so the folds are reproducible.
# NOTE(review): rows are shuffled individually; if rows sharing a breath_id are correlated,
# a grouped split may be more appropriate — confirm.
fold = KFold(n_splits=5, shuffle=True, random_state=71)
# split() returns a generator, so materialize it as a list to allow reuse.
cv = list(fold.split(x_train,y_train))

In [None]:
# Display the list of (train_indices, valid_indices) pairs built in the previous cell.
cv

### パラメータの設定
- 評価指標は平均絶対誤差

In [None]:
# LightGBM settings: regression objective, MAE as the evaluation metric, verbosity set to -1.
params = dict(
    objective="regression",
    metric="mae",
    verbosity=-1,
)

In [None]:
from sklearn.metrics import mean_absolute_error
import optuna.integration.lightgbm as lgb_o

def fit_lgbm(X,
             y,
             cv,
             params: dict=None,
             num_boost_round: int=100,
             early_stopping_rounds: int=10,
             ):
    """Train one LightGBM model per cross-validation fold.

    Args:
        X: Feature matrix that can be indexed with integer index arrays
            (e.g. a NumPy array; a DataFrame would need ``.values`` first).
        y: Target array aligned row-by-row with ``X``.
        cv: Iterable of ``(train_indices, valid_indices)`` pairs.
        params: Parameters forwarded to ``lgb.train``. ``None`` is replaced
            by an empty dict.
        num_boost_round: Maximum number of boosting rounds per fold.
        early_stopping_rounds: Number of rounds passed to the LightGBM
            early-stopping callback.

    Returns:
        Tuple ``(oof_pred, models, evaluation_results)``:
        the out-of-fold predictions (same shape as ``y``, float dtype),
        the list of trained boosters (one per fold), and the list of
        per-fold metric histories recorded during training.
    """
    # Fall back to an empty dict when no parameters are supplied.
    if params is None:
        params = {}

    models = []
    evaluation_results = []
    # Zero array shaped like the target; each fold fills in its validation rows.
    # The builtin ``float`` is used because the ``np.float`` alias was removed from NumPy.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv):
        # Split the arrays into this fold's train / validation parts by position.
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_eval = lgb.Dataset(x_valid, y_valid)

        # Filled in by the record_evaluation callback with the metric history.
        evaluation_results_i = {}

        # Early stopping and metric recording are passed as callbacks: the
        # ``early_stopping_rounds=`` / ``evals_result=`` keyword arguments are
        # no longer accepted by ``lgb.train`` in LightGBM 4.x.
        gbm = lgb.train(
            params,
            lgb_train,
            num_boost_round=num_boost_round,
            valid_sets=[lgb_train, lgb_eval],
            valid_names=['train', 'valid'],
            callbacks=[
                lgb.early_stopping(stopping_rounds=early_stopping_rounds),
                lgb.record_evaluation(evaluation_results_i),
            ],
        )

        # Predict the validation rows and store them at their original positions.
        pred_i = gbm.predict(x_valid, num_iteration=gbm.best_iteration)
        oof_pred[idx_valid] = pred_i

        models.append(gbm)
        evaluation_results.append(evaluation_results_i)
        # Report the plain MAE (no square root: this is not an RMSE).
        print(f'Fold {i} MAE: {mean_absolute_error(y_valid, pred_i):.4f}')

    score = mean_absolute_error(y, oof_pred)

    print('-' * 50)
    print('FINISHED | Whole MAE: {:.4f}'.format(score))
    return oof_pred, models ,evaluation_results

In [None]:
# Train one model per CV fold. ``.values`` hands plain arrays to fit_lgbm,
# which indexes X and y positionally with the fold index arrays.
oof, models ,evaluation_results= fit_lgbm(x_train.values,y_train.values,cv, params=params)

In [None]:
def visualize_importance(models, feat_train_df):
    """Plot the feature importance of a list of LightGBM models.

    The fold-to-fold variation of each feature's importance is drawn as a
    boxen plot, with features ordered by their total importance.

    args:
        models:
            List of lightGBM models
        feat_train_df:
            DataFrame used for training; its columns supply the feature names.

    returns:
        The matplotlib ``(fig, ax)`` pair holding the plot.
    """
    column_names = list(feat_train_df.columns)

    # Build one small frame per fold and concatenate once, instead of growing
    # a frame inside the loop starting from an empty DataFrame.
    fold_frames = [
        pd.DataFrame({
            'feature_importance': model.feature_importance(),
            'column': column_names,
            'fold': i + 1,
        })
        for i, model in enumerate(models)
    ]
    feature_importance_df = pd.concat(fold_frames, axis=0, ignore_index=True)

    # Features sorted by summed importance across folds; keep at most the top 50.
    order = feature_importance_df.groupby('column')['feature_importance']\
        .sum()\
        .sort_values(ascending=False).index[:50]

    # Figure height grows with the number of plotted features.
    fig, ax = plt.subplots(figsize=(8, max(6, len(order) * .25)))
    sns.boxenplot(data=feature_importance_df, 
                  x='feature_importance', 
                  y='column', 
                  order=order, 
                  ax=ax,
                  palette='viridis', 
                  orient='h')
    ax.tick_params(axis='x', rotation=90)
    ax.set_title('Importance')
    ax.grid()
    fig.tight_layout()
    return fig, ax

# Plot the per-fold feature importances; x_train supplies the feature names via its columns.
fig, ax = visualize_importance(models, x_train)

In [None]:
# Average the test predictions of every fold model.
# Each model in ``models`` contributes once (previously only models[0] was used),
# and the inputs are restricted to the training feature columns, in the same
# order and array form that the models were trained on.
test_features = test[features].values
pred = np.zeros(len(test_features))
for model in models:
    pred += model.predict(test_features) / len(models)

In [None]:
# Assemble the submission table: the test rows' "id" alongside the predicted "pressure".
submission = pd.DataFrame({"id":test["id"],"pressure":pred})

In [None]:
# Write the submission to the working directory without the DataFrame index column.
submission.to_csv("submission.csv",index=False)