This kernel demonstrates a way of using LightGBM with GPU support in Kaggle kernels.
It has been updated from the original IEEE Fraud detection kernel to work with newer lightgbm versions.



In [None]:
# Install git, cmake, the compiler toolchain (build-essential) and the Boost
# development packages; --no-install-recommends keeps the install minimal and
# -y answers the confirmation prompt automatically.
! apt-get install --no-install-recommends git cmake build-essential libboost-dev libboost-system-dev libboost-filesystem-dev -y

In [None]:
# Remove the LightGBM package that is already installed so it cannot shadow the
# GPU build installed later in this notebook.
# `pip uninstall` locates the package itself, so this step no longer depends on
# a hard-coded python3.6 site-packages path; -y skips the confirmation prompt.
!pip uninstall -y lightgbm

In [None]:
# Fetch the LightGBM sources; --recursive also checks out the repository's git submodules.
!git clone --recursive https://github.com/Microsoft/LightGBM

### Build and re-install LightGBM with GPU support

In [None]:
%%bash
# Configure and compile LightGBM with -DUSE_GPU=1, pointing CMake at the OpenCL
# library and headers under /usr/local/cuda.
set -e          # abort on the first failing step so later steps never run in the wrong directory
cd LightGBM
rm -rf build    # -f: do not fail when there is no previous build directory
mkdir build
cd build
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
make -j$(nproc)

In [None]:
# Install the Python package on top of the library compiled in LightGBM/build
# (--precompile reuses that build instead of compiling again).
# Checkouts that still ship python-package/setup.py are installed through it;
# otherwise the repository's build-python.sh script is used with the same flag.
!cd LightGBM && if [ -f python-package/setup.py ]; then cd python-package && python3 setup.py install --precompile; else sh ./build-python.sh install --precompile; fi

In [None]:
# Write an OpenCL ICD entry (/etc/OpenCL/vendors/nvidia.icd) that names the
# NVIDIA OpenCL library, creating the vendors directory first if needed.
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
# Delete the LightGBM source checkout.
!rm -r LightGBM

In [None]:
# Latest Pandas version
#!pip install -q 'pandas==0.25' --force-reinstall#

## LightGBM GPU Installation

In [None]:
import lightgbm as lgb

In [None]:
#model = lgb.LGBMRegressor(device_type='gpu')

In [None]:
import pandas as pd

In [None]:
#train = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')

In [None]:
#X_train, y_train = train.drop('pressure',axis=1) , train['pressure']

In [None]:
#model.fit(X_train,y_train)

## Feature engineering (FE)

In [None]:
# When DEBUG is True only the first 80,000 training rows are kept, which makes
# the rest of the notebook much quicker to iterate on.
DEBUG = False

# The three competition files are independent of each other, so the read order
# does not matter.
submission = pd.read_csv('../input/ventilator-pressure-prediction/sample_submission.csv')
test = pd.read_csv('../input/ventilator-pressure-prediction/test.csv')
train = pd.read_csv('../input/ventilator-pressure-prediction/train.csv')

if DEBUG:
    train = train.iloc[:80_000]

In [None]:
#train = train.head(80000)

In [None]:
import numpy as np

def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64
    are considered; every other column is left untouched.  Integer columns
    become the smallest of int8/int16/int32/int64 whose range contains the
    column's minimum and maximum (bounds inclusive).  Float columns become
    float16 or float32 when their range fits, otherwise float64.  Columns
    whose min/max are NaN (for example all-NaN float columns) therefore end
    up as float64.

    Note that float16/float32 are selected by *range* only, so values may
    lose precision when they are downcast.

    Args:
        df: DataFrame to shrink.  It is modified in place.
        verbose: When true, print the memory footprint after the conversion
            and the relative saving.  When false, nothing is printed.

    Returns:
        The same ``df`` object, with the converted columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Candidates are ordered narrowest first; the first one that fits wins.
            for candidate in int_types:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN so no comparison held.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        if start_mem > 0:
            # Guarded so a frame reporting zero bytes cannot divide by zero.
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
def add_features(df):
    """Build the engineered feature frame for one data split.

    Reads the columns ``id``, ``breath_id``, ``time_step``, ``u_in``,
    ``u_out``, ``R`` and ``C`` and adds, per ``breath_id``:

    * cumulative sums (``area``, ``u_in_cumsum``) and a row counter
      (``step_num``);
    * lags/leads 1..6 of ``u_in`` and ``u_out`` and differences against
      lags 1..4;
    * per-breath max / mean / first statistics and the row's distance to them;
    * time deltas, 2- and 3-step rolling time deltas and the matching
      ``u_in`` derivatives;
    * the first ``time_step``/``step_num`` with ``u_out == 1`` and the last
      with ``u_out == 0`` (the ``crossover`` columns), broadcast to every row
      of the breath, plus each row's distance to them;
    * a running sum of ``time delta * u_in`` (``breath_id__u_in__integral``);
    * ``cross``/``cross2`` interaction terms and one-hot encoded ``R``, ``C``
      and ``R__C``.

    Missing values are replaced by 0 and the frame is downcast with
    ``reduce_mem_usage`` at several points, so later features are computed
    from the downcast columns.

    Args:
        df: Input frame.  The columns created before the first ``fillna`` are
            written into this object; use the returned frame afterwards.

    Returns:
        A new DataFrame holding the original numeric columns, the engineered
        features and the one-hot columns produced by ``pd.get_dummies``.
    """
    # --- cumulative features and lags (computed on the raw dtypes) ---------
    df['area'] = df['time_step'] * df['u_in']
    by_breath = df.groupby('breath_id')
    df['area'] = by_breath['area'].cumsum()
    df['u_in_cumsum'] = by_breath['u_in'].cumsum()
    df['step_num'] = by_breath['id'].cumcount()

    # Positive shifts look back, negative shifts look ahead within the breath.
    for lag in range(1, 7):
        df[f'u_in_lag{lag}'] = by_breath['u_in'].shift(lag)
        df[f'u_out_lag{lag}'] = by_breath['u_out'].shift(lag)
        df[f'u_in_lag_back{lag}'] = by_breath['u_in'].shift(-lag)
        df[f'u_out_lag_back{lag}'] = by_breath['u_out'].shift(-lag)

    df = df.fillna(0)
    df = reduce_mem_usage(df)

    # --- per-breath statistics ---------------------------------------------
    by_breath = df.groupby('breath_id')
    df['breath_id__u_in__max'] = by_breath['u_in'].transform('max')
    df['breath_id__u_out__max'] = by_breath['u_out'].transform('max')
    df['breath_id__u_out__first'] = by_breath['u_out'].transform('first')
    df['breath_id__u_out__difffirst'] = df['breath_id__u_out__first'] - df['u_out']

    df = df.fillna(0)

    for lag in (1, 2, 3):
        df[f'u_in_diff{lag}'] = df['u_in'] - df[f'u_in_lag{lag}']
        df[f'u_out_diff{lag}'] = df['u_out'] - df[f'u_out_lag{lag}']

    by_breath = df.groupby('breath_id')
    df['breath_id__u_in__diffmax'] = by_breath['u_in'].transform('max') - df['u_in']
    df['breath_id__u_in__diffmean'] = by_breath['u_in'].transform('mean') - df['u_in']
    df['breath_id__u_in__first'] = by_breath['u_in'].transform('first')

    df = df.fillna(0)
    df = reduce_mem_usage(df)

    # --- time deltas and derivatives ---------------------------------------
    by_breath = df.groupby('breath_id')
    df['breath_id__u_in__diff_first'] = by_breath['u_in'].transform('first') - df['u_in']

    df['breath_id__time__diff'] = by_breath['time_step'].diff()
    time_diff_by_breath = df.groupby('breath_id')['breath_id__time__diff']
    df['breath_id__time__diff_2step'] = time_diff_by_breath.transform(lambda s: s.rolling(2).sum())
    df['breath_id__time__diff_3step'] = time_diff_by_breath.transform(lambda s: s.rolling(3).sum())

    derivative_specs = (
        ('breath_id__u_in__derivative', 'u_in_diff1', 'breath_id__time__diff'),
        ('breath_id__u_in__derivative_2step', 'u_in_diff2', 'breath_id__time__diff_2step'),
        ('breath_id__u_in__derivative_3step', 'u_in_diff3', 'breath_id__time__diff_3step'),
    )
    for name, numerator, denominator in derivative_specs:
        ratio = df[numerator] / df[denominator]
        # A zero time delta yields +/-inf, which fillna(0) would not catch;
        # treat it as missing so it is zeroed like every other undefined value.
        df[name] = ratio.where(np.isfinite(ratio))

    # --- u_out crossover ----------------------------------------------------
    # The per-breath extremum is broadcast back to every row of that breath
    # with transform().  Assigning a Series indexed by breath_id directly would
    # align on the row index instead and put the values on the wrong rows.
    breath_ids = df['breath_id']
    u_out_on = df['u_out'] == 1
    u_out_off = df['u_out'] == 0

    df['breath_id__time__u_out__crossover_max'] = (
        df['time_step'].where(u_out_on).groupby(breath_ids).transform('min'))
    df['breath_id__time__u_out__crossover_min'] = (
        df['time_step'].where(u_out_off).groupby(breath_ids).transform('max'))

    df['breath_id__time__u_out__crossover_max__diff_time'] = (
        df['time_step'] - df['breath_id__time__u_out__crossover_max'])
    df['breath_id__time__u_out__crossover_min__diff_time'] = (
        df['time_step'] - df['breath_id__time__u_out__crossover_min'])

    df['breath_id__step_num__u_out__crossover_max'] = (
        df['step_num'].where(u_out_on).groupby(breath_ids).transform('min'))
    df['breath_id__step_num__u_out__crossover_min'] = (
        df['step_num'].where(u_out_off).groupby(breath_ids).transform('max'))

    df['breath_id__step_num__u_out__crossover_max__diff_time'] = (
        df['step_num'] - df['breath_id__step_num__u_out__crossover_max'])
    df['breath_id__step_num__u_out__crossover_min__diff_time'] = (
        df['step_num'] - df['breath_id__step_num__u_out__crossover_min'])

    # Running sum of (time delta * u_in) inside each breath.  The first row of
    # a breath has no time delta, so it is NaN here and zeroed by fillna below.
    df['breath_id__u_in__integral'] = (
        (df['breath_id__time__diff'] * df['u_in']).groupby(breath_ids).cumsum())

    df = df.fillna(0)
    df = reduce_mem_usage(df)

    # --- remaining differences, interactions and categorical encoding ------
    # (the lag-3 differences already exist from the loop above)
    df['u_in_diff4'] = df['u_in'] - df['u_in_lag4']
    df['u_out_diff4'] = df['u_out'] - df['u_out_lag4']
    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # R and C become strings so get_dummies one-hot encodes them, together
    # with their combination R__C.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']
    df = pd.get_dummies(df)
    return df
# Add as features: the time of the crossing (when u_out becomes 1), the time since the crossing,
# the duration of the crossing, and the distance in time to the max and to the min.
# NOTE(review): this reads like a to-do list; confirm which of these add_features already provides.
# Each split is rebound to the frame returned by add_features before the next one is processed.
train = add_features(train)
test = add_features(test)

In [None]:
pd.__version__

In [None]:
# Keep the target as a 2-D (n_rows, 1) array, then strip the target and the
# identifier columns from the feature frames.
targets = train.loc[:, ['pressure']].to_numpy()
train.drop(columns=['pressure', 'id', 'breath_id'], inplace=True)
test = test.drop(columns=['id', 'breath_id'])

In [None]:
#test = test.drop(['id', 'breath_id'], axis=1)

In [None]:
# Downcast both feature frames once more now that the final column set is fixed
# (train first, then test, so the printed reports keep that order).
train, test = reduce_mem_usage(train), reduce_mem_usage(test)

In [None]:
train.head()

In [None]:
train.shape

In [None]:
from joblib import dump, load
import gc
from sklearn.model_selection import KFold

In [None]:
# Number of cross-validation splits used by the KFold training loop.
NUM_FOLDS = 5

In [None]:
# Shuffled K-fold training of a GPU LightGBM booster.  Every fold's model is
# written to ``model_lgb_<fold>.joblib`` and its predictions for the test frame
# are collected in ``test_preds``.
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=2021)
test_preds = []
for fold, (train_idx, test_idx) in enumerate(kf.split(train, targets)):
    banner = '-' * 15
    print(banner, '>', f'Fold {fold+1}', '<', banner)

    # Build the datasets straight from the fold slices so no per-fold copies
    # of the features or targets stay bound to a name afterwards.
    train_dataset = lgb.Dataset(train.iloc[train_idx], targets[train_idx])
    val_dataset = lgb.Dataset(train.iloc[test_idx], targets[test_idx])

    params = {
        'first_metric_only': True,
        'learning_rate': 0.5,
        'n_estimators': 700,
        'num_leaves': 256,
        'min_gain_to_split': 0.1,
        'device_type': 'gpu',
        'objective': 'huber',
        'boosting': 'goss',
        'metric': 'huber,l1',
    }
    model = lgb.train(
        params=params,
        train_set=train_dataset,
        valid_sets=[val_dataset, train_dataset],
        valid_names=['val', 'train'],
    )

    # Earlier scikit-learn-API variant, kept for reference:
    #model = lgb.LGBMRegressor(learning_rate=0.05,n_estimators=700,num_leaves=128,min_gain_to_split=0.1,device_type='gpu',objective='regression_l1')
    #model.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_valid,y_valid)],eval_names=['train','val'],early_stopping_rounds=15)

    dump(model, f'model_lgb_{fold}.joblib')
    test_preds.append(model.predict(test))

    # Drop the booster and collect garbage before the next fold starts.
    del model
    gc.collect()