## TPS-June-2022: Imputation with 55 LGBM Regressors
In this notebook, I train 55 LightGBM regressors to predict missing values, one for each of the 55 columns that contain missing values. For each such column, the column itself is the target and all other columns (except `row_id`) are the features. Rows where the target is missing form the test data, and the remaining rows form the training data. During training, I also hold out part of the training data for validation using a train/test split.

In [None]:
import numpy as np
import glob
import gc
import pandas as pd
import lightgbm as lgb
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib

In [None]:
# Locate the pickled dataset. Without `recursive=True`, "**" behaves like "*",
# so the pattern matches exactly one directory level below ../input.
data_paths = glob.glob("../input/**/data.pkl")
if not data_paths:
    # Fail with a clear message instead of an opaque IndexError on `[0]`.
    raise FileNotFoundError("No file matched '../input/**/data.pkl'")
# NOTE(review): unpickling can execute arbitrary code; only load files from a
# trusted source.
train = pd.read_pickle(data_paths[0])
train.head()

## Find all columns with missing values

In [None]:
# Collect every column that contains at least one missing value.
# `.isnull().any()` does not depend on the number of rows in the frame and also
# works for a column whose values are all missing.
all_columns = list(train.columns)  # full column list, reused when building feature sets
null_columns = [column for column in all_columns if train[column].isnull().any()]
print("Columns with missing value:", null_columns)
print("Number of columns:", len(null_columns))

In [None]:
%%time
# Train one LightGBM regressor per column with missing values and use it to
# fill that column's gaps in `train`, in place.
begin = time.time()
results = dict()  # not populated in this cell; kept in case other cells reference it
rmse_scores = []

# Hyper-parameters are the same for every target column, so build them once.
params = {
    "objective": "rmse",
    "learning_rate": 0.03,
    "num_iterations": 1000,
    "early_stopping_round": 400,
}

for i, target_column in enumerate(null_columns):
    # Every column except the target itself and the row identifier is a feature.
    feature_columns = [column for column in all_columns if column not in (target_column, "row_id")]

    # Rows with a missing target are the ones to impute; the rest are labelled data.
    null_rows = train[target_column].isna()
    if not null_rows.any():
        print(f"no missing value for {target_column}")
        continue
    train_data = train[~null_rows]
    test_data = train[null_rows]

    # Hold out 10% of the labelled rows for early stopping and for the RMSE report.
    train_df, valid_df = train_test_split(train_data, test_size=0.1, shuffle=True, random_state=42)
    x_val = valid_df[feature_columns]
    y_val = valid_df[target_column]

    lgbm = lgb.LGBMRegressor(**params)
    # The `verbose` keyword of `fit` was removed in LightGBM 4.0; silencing the
    # per-iteration evaluation log through a callback works on 3.3+ and 4.x.
    lgbm.fit(
        train_df[feature_columns],
        train_df[target_column],
        eval_set=[(x_val, y_val)],
        callbacks=[lgb.log_evaluation(period=0)],
    )

    y_val_pred = lgbm.predict(x_val)
    rmse_score = np.sqrt(mean_squared_error(y_val, y_val_pred))
    rmse_scores.append(rmse_score)
    print(f"RMSE Score:{rmse_score:.3f}")

    # Write predictions back into `train`; because the frame is mutated in
    # place, models for later columns see these imputed values as features.
    y_pred = lgbm.predict(test_data[feature_columns]).reshape(-1)
    train.loc[null_rows, target_column] = y_pred

    # Persist the fitted model, one file per target column.
    name = f"lgbm_{target_column}.pkl"
    joblib.dump(lgbm, name)

    # Progress report: elapsed time / linear extrapolation of the total time.
    elapsed = time.time() - begin
    estimated = elapsed / (i + 1) * len(null_columns)
    print(f"{elapsed:.2f}s/{estimated:.2f}s")
    gc.collect()

# Mean hold-out RMSE over all fitted columns; NaN when no model was fitted
# (e.g. the cell is re-run after every gap has already been filled).
oof = np.mean(rmse_scores) if rmse_scores else float("nan")
print(f"OOF:{oof}")

## Submission

In [None]:
%%time
# Locate the sample submission. Without `recursive=True`, "**" behaves like "*",
# so the pattern matches exactly one directory level below ../input.
submission_paths = glob.glob("../input/**/sample_submission.csv")
if not submission_paths:
    # Fail with a clear message instead of an opaque IndexError on `[0]`.
    raise FileNotFoundError("No file matched '../input/**/sample_submission.csv'")
submission = pd.read_csv(submission_paths[0])
def post_processing(item, frame=None):
    """Return the value addressed by a "<row>-<column>" key.

    Parameters
    ----------
    item : str
        Key of the form "<row>-<column>", where <row> is an integer row
        position and <column> is a column name.
    frame : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level `train` frame.

    Returns
    -------
    scalar
        The value at the given row position of the named column, in that
        column's own dtype.

    Raises
    ------
    ValueError
        If `item` contains no "-" or the row part is not an integer.
    """
    if frame is None:
        frame = train
    # Split only on the first "-" so column names that contain a hyphen are
    # kept intact.
    row_text, column = item.split("-", 1)
    # Scalar positional lookup on a single column; this avoids building a
    # full row Series for every key the way `frame.iloc[row][column]` does.
    return frame[column].iat[int(row_text)]
# Resolve every "row-col" key to its imputed value, attach the result as the
# `value` column, write the submission file and preview the first rows.
imputed_values = submission["row-col"].apply(post_processing)
submission["value"] = imputed_values
submission.to_csv("submission.csv", index=False)
submission.head()