In [None]:
import pandas as pd
import numpy as np
import random
import itertools

from lightgbm import LGBMRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.metrics import mean_squared_error
import optuna

import warnings
warnings.filterwarnings("ignore")

# Data

In [None]:
# Full competition table; `row_id` is used as the DataFrame index.
df = pd.read_csv('../input/tabular-playground-series-jun-2022/data.csv', index_col='row_id')
# Submission template (the commented-out submission cell fills its 'value'
# column from the imputed table, one entry per 'row-col' key).
sub = pd.read_csv('../input/tabular-playground-series-jun-2022/sample_submission.csv')

# Optuna tuning
To cut the tuning time from a few days to several hours, the hyperparameter search runs on a small sample of the full dataset. To evaluate each trial, some known (non-missing) values in that sample are replaced with NaN, and the imputed values are then compared with the originals.

In [None]:
SAMPLES_TEST = 100        # number of known cells masked as NaN to score each tuning trial
ROWS_TRAIN = (120, 4000)  # tuning sample: (rows drawn per column that has NaN values, fully observed rows drawn)

In [None]:
# Lookup tables derived from the raw frame and reused by the sampling helpers.

# All column names, in frame order (used to translate a name into a position).
columns = list(df.columns)

# Names of the columns that contain at least one missing value.
miss_cols = [name for name in columns if df[name].isna().any()]

# Index labels of the rows that have no missing value at all.
not_nan_rows = df.index[df.notna().all(axis=1)].tolist()

# For every incomplete column: index labels of the rows where it is missing.
miss_col_indx = {name: df.index[df[name].isna()].tolist() for name in miss_cols}

In [None]:
def objective(trial, DF, y_test, idx):
    """Optuna objective: score one LightGBM + IterativeImputer configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyperparameters.
    DF : pandas.DataFrame
        Tuning frame containing NaN values, including the cells that were
        masked on purpose for evaluation.
    y_test : array-like
        True values of the masked evaluation cells.
    idx : sequence of two sequences
        ``idx[0]`` holds the row positions and ``idx[1]`` the column positions
        of the masked cells inside the imputed array.

    Returns
    -------
    float
        Root mean squared error between ``y_test`` and the imputed values
        (lower is better).
    """
    # Hyperparameters forwarded to the LightGBM estimator. They are kept in
    # their own dict so nothing depends on the position of a key.
    model_params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        # NOTE(review): LightGBM documents that row subsampling is only active
        # when `subsample_freq` > 0 (default 0), so this value is probably a
        # no-op as configured -- confirm before relying on it.
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1),
        'max_depth': trial.suggest_int('max_depth', 4, 14),
        # LightGBM requires num_leaves > 1, so the search starts at 2.
        'num_leaves': trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth': trial.suggest_int('cat_smooth', 1, 100),
    }

    # The tuned seed belongs to the imputer; the estimator keeps a fixed seed.
    imputer_seed = trial.suggest_int('random_state', 300, 1500, step=200)

    model = LGBMRegressor(metric='rmse', n_estimators=3000, random_state=1500,
                          **model_params)

    imp = IterativeImputer(estimator=model, verbose=2, max_iter=3,
                           imputation_order='ascending', initial_strategy='mean',
                           random_state=imputer_seed)

    new_array = imp.fit_transform(DF)
    predicted = new_array[idx[0], idx[1]]

    # mean_squared_error returns the MSE; take the root to report a real RMSE.
    rmse = float(np.sqrt(mean_squared_error(y_test, predicted)))

    return rmse


def optimize(DF, y_test, idx, n_trials=50):
    """Run an Optuna study that minimises `objective` on the tuning sample.

    Parameters
    ----------
    DF : pandas.DataFrame
        Tuning frame with the evaluation cells already masked as NaN.
    y_test : array-like
        True values of the masked cells.
    idx : sequence of two sequences
        Row and column positions of the masked cells.
    n_trials : int, default 50
        Number of trials to run.

    Returns
    -------
    optuna.study.Study
        The finished study.
    """
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: objective(trial, DF, y_test, idx),
                   n_trials=n_trials, show_progress_bar=True)
    return study

**Creating a sample**

In [None]:
def sample(seed=None):
    """Build the small tuning frame and hide some known cells for scoring.

    The frame is made of up to ``ROWS_TRAIN[0]`` rows per incomplete column
    (rows where that column is missing) plus up to ``ROWS_TRAIN[1]`` fully
    observed rows. Up to ``SAMPLES_TEST`` cells of the fully observed rows are
    then replaced with NaN so that imputed values can be compared with the
    truth.

    Rows are drawn WITHOUT replacement: a duplicated row would keep an
    unmasked copy of a masked evaluation cell and leak the answer to the model.

    Parameters
    ----------
    seed : int, optional
        Seed for a private random generator. When omitted, the global
        `random` module state is used.

    Returns
    -------
    DF : pandas.DataFrame
        Tuning frame (fresh 0..n-1 index) with the evaluation cells masked.
    values_test : numpy.ndarray
        Original values of the masked cells.
    [rows_idx, cols_idx] : list of two lists
        Row and column positions of the masked cells in ``DF``.
    """
    rng = random.Random(seed) if seed is not None else random

    def draw(population, k):
        # Sample without replacement, never asking for more than is available.
        return rng.sample(population, min(k, len(population)))

    row_labels = []
    for indicies in miss_col_indx.values():
        row_labels += draw(indicies, ROWS_TRAIN[0])
    row_labels += draw(not_nan_rows, ROWS_TRAIN[1])

    # A row that is missing several columns can be drawn once per column;
    # keep only its first occurrence (order preserved).
    row_labels = list(dict.fromkeys(row_labels))

    # miss_col_indx / not_nan_rows store index labels, so select with .loc
    # (.iloc would only be right when row_id happens to equal the position).
    DF = df.loc[row_labels].reset_index(drop=True)

    not_nan_rows_sub = DF.index[DF.notna().all(axis=1)].tolist()

    # One masked cell per selected row, in a randomly chosen incomplete column.
    rows_idx = draw(not_nan_rows_sub, SAMPLES_TEST)
    cols_name = rng.choices(miss_cols, k=len(rows_idx))
    cols_idx = [columns.index(name) for name in cols_name]

    # Read the true values before they are overwritten.
    values_test = DF.to_numpy()[rows_idx, cols_idx]
    for r_id, c_name in zip(rows_idx, cols_name):
        DF.at[r_id, c_name] = np.nan

    return DF, values_test, [rows_idx, cols_idx]

In [None]:
# Draw the tuning sample: masked frame, true values of the masked cells,
# and the [row positions, column positions] of those cells.
DF, values_test, idx = sample()

In [None]:
# opt = optimize(DF, values_test, idx)

In [None]:
# I found the following params:
# The first nine entries are LightGBM hyperparameters. `random_state` is kept
# last and separate: it seeds the IterativeImputer, not LightGBM (the estimator
# is built with its own fixed random_state).
params = {
              'reg_alpha': 0.02,
              'reg_lambda': 0.06,
              'colsample_bytree': 0.4,
              'subsample': 1.0,
              'learning_rate': 0.06,
              'max_depth': 13,
              'num_leaves' : 47,
              'min_child_samples': 248,
              'cat_smooth' : 44,
                
              'random_state': 900,
        }


# Train final model

In [None]:
SAMPLES_TEST_FINAL = 6000 # number of known cells masked as NaN to evaluate the final model

In [None]:
def sample_final(seed=None):
    """Mask known cells of the full frame to evaluate the final model.

    Up to ``SAMPLES_TEST_FINAL`` distinct, currently fully observed rows are
    chosen; in each of them one cell, in a randomly chosen incomplete column,
    is replaced with NaN.

    WARNING: this mutates the global ``df`` in place (the final imputer is
    fitted on the masked frame). Re-running it masks additional cells; reload
    ``df`` to start from the raw data again.

    Parameters
    ----------
    seed : int, optional
        Seed for a private random generator. When omitted, the global
        `random` module state is used.

    Returns
    -------
    values_test : numpy.ndarray
        Original values of the masked cells.
    [rows_idx, cols_idx] : list of two lists
        Row and column POSITIONS of the masked cells, suitable for indexing
        the array returned by the imputer.
    """
    rng = random.Random(seed) if seed is not None else random

    # Work with positions computed from the CURRENT frame: the cached
    # `not_nan_rows` holds index labels and goes stale once cells are masked,
    # which on a re-run could put NaN into the returned true values.
    clean_positions = np.flatnonzero(df.notna().all(axis=1).to_numpy()).tolist()

    # Distinct rows => distinct cells, so no cell is counted twice in the score.
    rows_idx = rng.sample(clean_positions, min(SAMPLES_TEST_FINAL, len(clean_positions)))
    cols_name = rng.choices(miss_cols, k=len(rows_idx))
    cols_idx = [columns.index(name) for name in cols_name]

    # Read the true values before they are overwritten.
    values_test = df.to_numpy()[rows_idx, cols_idx]
    for r_pos, c_pos in zip(rows_idx, cols_idx):
        df.iat[r_pos, c_pos] = np.nan

    return values_test, [rows_idx, cols_idx]

In [None]:
# Mask the evaluation cells of the full frame. Note: sample_final() writes NaN
# into `df` in place, and `idx` from the tuning sample is overwritten here.
y_test, idx = sample_final()

In [None]:
# Route the tuned values by key rather than by position in the dict:
# `random_state` seeds the imputer, everything else configures LightGBM.
imputer_seed = params['random_state']
model_params = {name: value for name, value in params.items() if name != 'random_state'}

# Per-column regressor used inside the iterative imputer.
model = LGBMRegressor(metric='rmse', n_estimators=3000, verbose=0, random_state=1500,
                      **model_params)

# Columns are imputed from the one with the fewest missing values to the one
# with the most, starting from a mean fill.
imp = IterativeImputer(estimator=model, verbose=2, max_iter=5,
                       imputation_order='ascending', initial_strategy='mean',
                       random_state=imputer_seed)

# new_array = imp.fit_transform(df)
# It took me about 9000 sec / it

In [None]:
# score
# predicted = new_array[idx[0], idx[1]]
# mean_squared_error(y_test, predicted)

In [None]:
# df2 = pd.DataFrame(new_array, columns=columns)

# for i, row_col in enumerate(sub['row-col'].tolist()):
#     row, col = row_col.split('-')
#     sub.at[i, 'value'] = df2.loc[int(row), col]

# sub.set_index('row-col', inplace=True)

In [None]:
# sub.to_csv('sub.csv')