Thanks to the author of the following notebook:

https://www.kaggle.com/code/djustin/lgb-cpu/notebook?scriptVersionId=97279087

In [None]:
%%capture
# https://www.kaggle.com/code/abhishek/running-lightgbm-on-gpu/notebook
!pip uninstall -y lightgbm
!apt-get install -y libboost-all-dev
!git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
%%bash
# Compile LightGBM with OpenCL (GPU) support inside the cloned repository.
# Abort on the first failing command so that a failed `cd` can never let the
# following `rm`/`mkdir` run in the wrong directory.
set -e
cd LightGBM
# -f: a fresh clone may have no build directory yet; that is not an error.
rm -rf build
mkdir build
cd build
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
make -j$(nproc)

In [None]:
%%capture
!cd LightGBM/python-package/;python setup.py install --precompile
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
!rm -r LightGBM

### setup

In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from collections import defaultdict
import gc

In [None]:
class CFG:
    """Settings shared by the cells of this notebook."""

    # Passed to LightGBM as the "n_estimators" training parameter.
    n_estimators = 10_000
    # Number of full passes the imputation loop makes over the NA columns.
    loop_end = 5
    # When True, only the first two NA columns are processed.
    debug = False

### read data

In [None]:
# Table that contains the missing values to be imputed.
data = pd.read_csv(
    '../input/tabular-playground-series-jun-2022/data.csv',
)
# Submission template, indexed by its "row-col" identifier column.
sub = pd.read_csv(
    '../input/tabular-playground-series-jun-2022/sample_submission.csv',
    index_col='row-col',
)

In [None]:
# TODO: compare the score with and without these stacking features
# stack_df = pd.read_pickle("../input/tps2206-stack/result_df.pickle")
# stack_df.reset_index(inplace=True)
# data[['stack1', 'stack2', 'stack3', 'stack4', 'stack5']] = stack_df[['stack1', 'stack2', 'stack3', 'stack4', 'stack5']]
# del stack_df
# gc.collect()

In [None]:
# Columns that contain at least one missing value, ordered from the most to
# the fewest missing values (ties keep their original column order), so the
# columns with the most NAs are processed first.
# Filtering before sorting avoids popping from an empty list (IndexError)
# when no column has any missing value.
na_counts = data.isna().sum().to_dict()
na_columns_list = sorted(
    (column for column in data.columns if na_counts[column] > 0),
    key=lambda column: na_counts[column],
    reverse=True,
)

In [None]:
# For every column to impute, record the positional row indices whose value
# is missing and those whose value is present, for use in the loop below.
# After filling the NAs the score is expected to improve;
# however, this has to be balanced against execution time.
missing_list_of = {}
no_missing_list_of = {}
for col in na_columns_list:
    is_missing = data[col].isnull().to_numpy()
    missing_list_of[col] = list(np.flatnonzero(is_missing))
    no_missing_list_of[col] = list(np.flatnonzero(~is_missing))

In [None]:
# TODO: pre-fill the NAs first using other people's well-scoring results

In [None]:
# Debug runs only process the first two NA columns to keep the loop short.
na_columns_list = na_columns_list[:2] if CFG.debug else na_columns_list

### lgbm train and predict

In [None]:
from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import time
import pickle

# Impute every NA column with a LightGBM regressor, repeated CFG.loop_end times.
#
# For each column, the rows where the column was originally present are used
# for training (80/20 train/validation split) and the rows where it was
# originally missing are predicted. The predictions are written back into
# `data`, so later columns and later passes are trained on the imputed values.

# The training parameters do not depend on the column or the pass.
param = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "extra_trees": True,
    "n_estimators": CFG.n_estimators,
    "device": "gpu",
    "gpu_platform_id": 0,
    "gpu_device_id": 0,
}

st_time = time.time()
for loop in range(CFG.loop_end):
    print("="*10, "loop:{}/{}".format(loop+1, CFG.loop_end), "="*10)
    for i, col in enumerate(na_columns_list):
        print("="*10, col, "="*10)
        print("{}/{}".format(i+1, len(na_columns_list)))
        # Positional row indices, fixed from the original NA pattern.
        missing_list = missing_list_of[col]
        no_missing_list = no_missing_list_of[col]

        train = data.iloc[no_missing_list,]
        test = data.iloc[missing_list,]
        # The target column and the row identifier are not used as features.
        X = train.drop([col,'row_id'],axis=1)
        y = train[col]
        X_test = test.drop([col,'row_id'],axis=1)

        X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, random_state=42)

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
        # Fresh callbacks for every fit: log every 250 rounds and stop after
        # 20 rounds without improvement on the validation set.
        callbacks = [
            lgb.log_evaluation(250),
            lgb.early_stopping(20),
        ]

        model = lgb.train(param, lgb_train, valid_sets=[lgb_valid], callbacks=callbacks)

        y_predict = model.predict(X_test, num_iteration=model.best_iteration)
        # Write the predictions straight into `data`. Assigning through an
        # intermediate `data[col]` Series only works while that Series is a
        # view of the frame; with pandas Copy-on-Write it is a copy and the
        # imputed values would be silently lost.
        data.iloc[missing_list, data.columns.get_loc(col)] = y_predict

        # Only plot feature importances for the models of the final pass.
        if loop == CFG.loop_end-1:
            lgb.plot_importance(model, max_num_features=30, title="{} Features Importance".format(col))
        en_time = time.time()
        print("elapse", en_time-st_time)
        gc.collect()

In [None]:
# Fill the submission with the imputed values.
# Every submission id has the form "<row>-<column>"; parse each id only once.
id_parts = [submission_id.split('-') for submission_id in sub.index]
rows = np.array([int(parts[0]) for parts in id_parts])
cols = np.array([parts[1] for parts in id_parts], dtype=object)

# Copy the values one column at a time instead of one cell at a time: a
# scalar `.loc` lookup plus a scalar `.loc` assignment per submission row is
# very slow, while the per-column lookup is vectorized.
for col in tqdm(pd.unique(cols)):
    in_col = cols == col
    sub.loc[in_col, 'value'] = data.loc[rows[in_col], col].to_numpy()

In [None]:
# Write the submission file; the "row-col" index is kept as the id column.
sub.to_csv(path_or_buf="submission.csv", index=True)
# Show the submission frame as the cell output.
sub