In [None]:
import os
import warnings
import pathlib
import gc
from pprint import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook as tqdm

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import lightgbm as lgb

# Notebook-wide display limits: show up to 100 columns and 500 rows of a frame.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 500)
# Seaborn theme. The IPAexGothic font has to be available to matplotlib
# (presumably chosen so that Japanese labels render -- TODO confirm).
sns.set(font="IPAexGothic", style="darkgrid")
# NOTE(review): this silences every Python warning for the rest of the session,
# including deprecation warnings raised by the libraries used below.
warnings.simplefilter('ignore')

In [None]:
def split(data: pd.DataFrame, method: str = "random", **params):
    """
    Split a frame into a train part and a test part with train_test_split.

    Parameters
    -----
    data : pd.DataFrame
        frame to split
    method : str (default random)
        how the rows are assigned to the two parts
    params :
        keyword arguments forwarded unchanged to train_test_split

    Returns
    -----
    train_set, test_set:pd.DataFrame
        the two parts of ``data``

    Raises
    -----
    ValueError
        if ``method`` is neither "random" nor "breath"

    Note
    -----
    * About method
        * random : rows are split directly
        * breath : the unique values of the "breath_id" column are split, and
          every row follows its breath_id, so one breath never ends up in both parts
    """
    # Row-level split: hand the frame straight to train_test_split.
    if method == "random":
        return train_test_split(data, **params)

    if method != "breath":
        raise ValueError(f"Invalid value {method}. method must be random or breath.")

    # Breath-level split: split the ids first, then select the rows of each id set.
    train_id, test_id = train_test_split(data["breath_id"].unique(), **params)
    in_train = data["breath_id"].isin(train_id)
    in_test = data["breath_id"].isin(test_id)
    return data[in_train], data[in_test]
        
def get_feats(data: pd.DataFrame):
    """
    List the columns of ``data`` that may be used as model features.

    Reads the module-level names ``target`` and ``excluded_feats`` (they must be
    defined before this is called), prints both, and returns every column of
    ``data`` that is neither the target nor one of the excluded features.
    """
    print("target : ", target)
    print("Excluded features : ", excluded_feats)
    # Build the exclusion list once instead of once per column.
    not_features = [target] + excluded_feats
    return [name for name in data.columns if name not in not_features]

In [None]:
def gKFold(df, **params):
    """
    Assign a GroupKFold fold number to every row, grouped by "breath_id".

    Parameters
    -----
    df : pd.DataFrame
        input frame; must contain a "breath_id" column. It is not modified.
    params :
        keyword arguments forwarded to GroupKFold (e.g. n_splits)

    Returns
    -----
    pd.DataFrame
        a new frame with a fresh 0..n-1 index and a "folds" column holding, for
        each row, the number of the fold in which that row is in the test part
        (an existing "folds" column is overwritten)
    """
    # reset_index already returns a new frame, so the caller's frame stays untouched
    # without an additional full copy.
    df = df.reset_index(drop=True)
    g_kfold = GroupKFold(**params)

    df["folds"] = -1
    for fold, (_, test_idx) in enumerate(g_kfold.split(df, groups=df["breath_id"])):
        # test_idx holds positions; after reset_index they coincide with the index
        # labels, so .loc writes to the intended rows. A single .loc call replaces the
        # chained df["folds"][test_idx] assignment, which may write to a temporary.
        df.loc[test_idx, "folds"] = fold

    return df

def reduce_mem_usage(df, verbose=True):
    """
    Downcast the numeric columns of ``df`` to the smallest dtype that holds their range.

    Columns whose dtype is int16/int32/int64 are cast to the first of
    int8, int16, int32, int64 whose limits contain the column's min and max.
    Columns whose dtype is float16/float32/float64 are cast to float16 or float32
    when the range fits, otherwise to float64. Other columns are left alone.
    The dtype limits are inclusive, so a value exactly equal to a limit
    (e.g. 127 for int8) still selects the smaller dtype.

    NOTE: only the value range is checked. float16 keeps roughly three significant
    decimal digits, so float columns can be rounded by the downcast.

    Parameters
    -----
    df : pd.DataFrame
        frame to shrink; it is modified in place
    verbose : bool (default True)
        print the resulting size and the percentage saved

    Returns
    -----
    pd.DataFrame
        the same ``df`` object, with downcast columns
    """
    numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            # If min/max are NaN (column without values) no candidate matches and the
            # column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit float32 (or min/max are NaN): use float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print("Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(end_mem, reduction))
    return df


In [None]:
# Seed passed as random_state to the train/test split of the breath_ids.
SEED = 43

# If True, only a random 10% of the breath_ids in the train set are used.
DEBUG = False


In [None]:
# Date stamp embedded in the file names of the precomputed feature files.
date=20211022
# Read the train features in chunks of 1,000,000 rows, downcast every chunk with
# reduce_mem_usage and concatenate the chunks into one frame.
# NOTE(review): reduce_mem_usage chooses dtypes from each chunk's own min/max, so the
# chunks may disagree on a column's dtype before the concat -- verify the final dtypes.
train_iter = pd.read_csv(f"../input/google-brain-create-features/train_with_feats_{date}.csv", chunksize=1000000)
train = pd.concat([reduce_mem_usage(tr) for tr in tqdm(train_iter)], axis=0)

# Same chunked load for the frame that is later used to build the submission file.
submission_iter = pd.read_csv(f"../input/google-brain-create-features/test_with_feats_{date}.csv", chunksize=1000000)
submission = pd.concat([reduce_mem_usage(sub) for sub in tqdm(submission_iter)], axis=0)

print(train.shape)
print(submission.shape)

In [None]:
# Ratio of each row's pressure to the previous row's pressure within the same breath_id.
# The first row of every breath has no predecessor (NaN after the shift) and gets ratio 1.
# NOTE(review): a previous pressure of exactly 0 would produce +/-inf, which fillna(1) does
# not replace -- confirm that pressure is never exactly 0 in this data.
train["pressure_ratio"] = (train["pressure"] / train.groupby("breath_id")["pressure"].shift(1)).fillna(1)

In [None]:
display(train.head())

# In DEBUG mode keep only a random 10% of the breath_ids.
# The sample is drawn from a generator seeded with SEED so that repeated debug runs
# work on the same breaths (the global np.random state is left untouched).
if DEBUG:
    breath_id_list = np.unique(train["breath_id"])
    debug_rng = np.random.RandomState(SEED)
    breath_id_list = debug_rng.choice(breath_id_list, int(len(breath_id_list)*0.1), replace=False)
    train = train[train["breath_id"].isin(breath_id_list)]

# 20% of the breath_ids are held out as the test set; the rest is used for
# training/validation.
train_valid, test = split(train, method="breath", test_size=0.20, random_state=SEED)

# The full frame is no longer needed once it has been split.
del train
gc.collect()

print(train_valid.shape)
print(test.shape)

In [None]:
# Fold assignment for the first model (target: pressure_ratio).
n_splits = 3
train_valid = gKFold(train_valid, n_splits=n_splits)

target = "pressure_ratio"

# Columns that must not be used as features: identifiers, fold/prediction bookkeeping,
# both pressure columns and the time_step columns ...
excluded_feats = [
    "id",
    "breath_id",
    "folds",
    "predicts",
    "pressure",
    "pressure_ratio",
    "time_step",
    "time_step_lag1",
    "time_step_lag2",
]
# ... plus every column whose name contains "all".
excluded_feats += [name for name in train_valid.columns if "all" in name]

feats = [name for name in train_valid.columns if name not in excluded_feats]
pprint(feats)

In [None]:
def train_fn(train_valid, test, submission, n_folds, num_boost_round, learning_rate,
             early_stopping_rounds=50, log_period=100):
    """
    Train one LightGBM regressor per fold and predict all three frames.

    The feature columns and the label column are taken from the module-level
    names ``feats`` and ``target``, which must be defined before the call.
    Only rows with ``u_out == 0`` are used for fitting and for the validation set
    that drives early stopping; predictions are produced for every row.

    Parameters
    -----
    train_valid : pd.DataFrame
        training frame with a "folds" column (fold numbers 0..n_folds-1)
    test, submission : pd.DataFrame
        frames to predict with every fold model
    n_folds : int
        number of folds / models
    num_boost_round : int
        upper bound on the boosting rounds per model
    learning_rate : float
        LightGBM learning rate
    early_stopping_rounds : int (default 50)
        rounds without improvement on the validation set before training stops
    log_period : int (default 100)
        evaluation results are logged every ``log_period`` rounds

    Returns
    -----
    model_list, train_valid, test, submission
        the fitted boosters (one per fold) and the three input frames, each with a
        "predicts" column: out-of-fold predictions for ``train_valid``, the mean of
        the fold models' predictions for ``test`` and ``submission``. The column is
        written on the passed-in frames and overwrites any existing "predicts".
    """
    lgb_params = {
        "boosting_type": "gbdt",
        "objective": "huber",
        "metric": "mae",
        "max_depth": 12,
        "colsample_bytree": 0.8,
        "min_data_in_leaf": 4000,
        "lambda_l1": 0,   # reg_alpha (L1)
        "lambda_l2": 10,  # reg_lambda (L2)
        "learning_rate": learning_rate,
        "verbosity": -1,
    }

    model_list = []

    # Predictions are accumulated locally and written to the frames once at the end,
    # so the result does not depend on the caller having zeroed "predicts" beforehand.
    train_valid_preds = np.zeros([len(train_valid)])
    test_preds = np.zeros([len(test)])
    submission_preds = np.zeros([len(submission)])

    for fold in range(n_folds):
        tr = train_valid.query(f"folds != {fold}")
        va = train_valid.query(f"folds == {fold}")

        # Fit and validate on the u_out == 0 rows only (filtered once per frame).
        tr_fit = tr.query("u_out == 0")
        va_fit = va.query("u_out == 0")
        lgb_tr = lgb.Dataset(tr_fit[feats], label=tr_fit[target])
        lgb_va = lgb.Dataset(va_fit[feats], label=va_fit[target])

        metrics = {}

        lgb_reg = lgb.train(
            dict(lgb_params),
            lgb_tr,
            num_boost_round=num_boost_round,
            valid_sets=[lgb_tr, lgb_va],
            valid_names=["train", "valid"],
            **_lgb_fit_controls(metrics, early_stopping_rounds, log_period)
        )

        # Prediction
        print(f"Training of Fold {fold} is finished.")
        print("Predict Valid data")
        pred_va = lgb_reg.predict(va[feats])
        train_valid_preds[train_valid["folds"] == fold] = pred_va

        print("Predict test data")
        test_preds += lgb_reg.predict(test[feats]) / n_folds

        # For the submission file
        print("Predict submission data")
        submission_preds += lgb_reg.predict(submission[feats]) / n_folds

        model_list.append(lgb_reg)

    train_valid["predicts"] = train_valid_preds
    test["predicts"] = test_preds
    submission["predicts"] = submission_preds
    return model_list, train_valid, test, submission


def _lgb_fit_controls(metrics, early_stopping_rounds, log_period):
    """Return the lgb.train keyword arguments for early stopping, logging and metric recording.

    LightGBM 4.0 removed the ``early_stopping_rounds`` / ``verbose_eval`` /
    ``evals_result`` arguments of ``lgb.train`` in favour of callbacks. The callback
    form is used whenever ``lgb.log_evaluation`` exists (3.3.0 and later); older
    releases get the original keyword arguments.
    """
    if hasattr(lgb, "log_evaluation"):
        return {
            "callbacks": [
                lgb.early_stopping(early_stopping_rounds),
                lgb.log_evaluation(log_period),
                lgb.record_evaluation(metrics),
            ]
        }
    return {
        "early_stopping_rounds": early_stopping_rounds,
        "verbose_eval": log_period,
        "evals_result": metrics,
    }

In [None]:
def eval_mae(df):
    """
    Mean absolute error between "pressure" and "predicts" on the rows where u_out == 0.

    ``df`` must contain the columns "u_out", "pressure" and "predicts".
    """
    scored = df.loc[df["u_out"] == 0, ["pressure", "predicts"]]
    return mean_absolute_error(scored["pressure"], scored["predicts"])

In [None]:

# Settings of the first training run.
results = {"train_valid": [], "test": []}
learning_rate = 0.5
num_boost_round = 5000

# ===== Training
# Start every frame with an all-zero prediction column.
train_valid["predicts"] = 0
test["predicts"] = 0
submission["predicts"] = 0

model_list, train_valid, test, submission = train_fn(
    train_valid=train_valid,
    test=test,
    submission=submission,
    n_folds=n_splits,
    num_boost_round=num_boost_round,
    learning_rate=learning_rate,
)

# No MAE is computed for this run: eval_mae compares "predicts" with "pressure",
# which only makes sense when the target is "pressure" (the preceding configuration
# cell sets target = "pressure_ratio"). `results` therefore stays empty here.



In [None]:
# One feature-importance chart per fold model.
for fold_model in model_list:
    fig, ax = plt.subplots(figsize=(12, 16))
    lgb.plot_importance(fold_model, ax=ax)
    fig.tight_layout()
    plt.show()
    # Close the figure so that figures do not pile up across folds.
    plt.close()


In [None]:
# Keep the predictions of the first run under their own name so that the next run can
# write a fresh "predicts" column.
ratio_column = {"predicts": "predicts_ratio"}
train_valid = train_valid.rename(columns=ratio_column)
test = test.rename(columns=ratio_column)
submission = submission.rename(columns=ratio_column)

### 学習 (Training: second model, target = pressure)

In [None]:
# Fold assignment for the second model (target: pressure).
n_splits = 3
train_valid = gKFold(train_valid, n_splits=n_splits)

target = "pressure"

# Columns that must not be used as features: identifiers, fold/prediction bookkeeping,
# both pressure columns and the time_step columns ...
excluded_feats = [
    "id",
    "breath_id",
    "folds",
    "predicts",
    "pressure",
    "pressure_ratio",
    "time_step",
    "time_step_lag1",
    "time_step_lag2",
]
# ... plus every column whose name contains "all".
excluded_feats += [name for name in train_valid.columns if "all" in name]

# "predicts_ratio" is not in the exclusion list, so when that column is present it is
# part of the feature set.
feats = [name for name in train_valid.columns if name not in excluded_feats]
pprint(feats)

In [None]:
# Settings of the second training run.
results = {"train_valid": [], "test": []}
learning_rate = 0.1
num_boost_round = 5000

# ===== Training
# Start every frame with an all-zero prediction column.
train_valid["predicts"] = 0
test["predicts"] = 0
submission["predicts"] = 0

model_list, train_valid, test, submission = train_fn(
    train_valid=train_valid,
    test=test,
    submission=submission,
    n_folds=n_splits,
    num_boost_round=num_boost_round,
    learning_rate=learning_rate,
)

# MAE on the u_out == 0 rows: out-of-fold for train_valid, fold-averaged for test.
mae_train_valid = eval_mae(train_valid)
mae_test = eval_mae(test)
results["train_valid"].append(mae_train_valid)
results["test"].append(mae_test)
print("train_valid", mae_train_valid)
print("test", mae_test)



In [None]:
# Store the predictions of both runs for all rows that came from the train file
# (train_valid rows first, then the held-out test rows).
prediction_columns = ["id", "breath_id", "predicts_ratio", "predicts"]
pd.concat(
    [frame[prediction_columns] for frame in (train_valid, test)]
).to_csv("./lgbm_predicts_train.csv", index=False)

# The submission frame is written with all of its columns.
submission.to_csv("./lgbm_predicts_test.csv", index=False)

In [None]:
# Release the training frames before the submission file is built.
del train_valid, test
gc.collect()

In [None]:
# Take the ids from the sample submission and attach the predicted pressure to them.
# merge() uses its default inner join, so only ids present in both frames are kept.
sample_submission = (
    pd.read_csv("../input/ventilator-pressure-prediction/sample_submission.csv")
    .drop("pressure", axis=1)
    .merge(submission[["id", "predicts"]], on="id")
)

# The prediction column has to be called "pressure" in the submission file.
sample_submission.columns = ["id", "pressure"]

sample_submission.to_csv("submission.csv", index=False)