# Idea

**important**  
IterativeImputer(LGB, XGB, etc.)  
Increase num_boost_round (LGB)  
: The larger the value, the better the accuracy. I have not found the upper limit.  
  
**not important**  
IterativeImputer(BayesianRidge(default))  
Increase max_iter (IterativeImputer)

In [None]:
# base
from pathlib import Path
import pandas as pd
import numpy as np
import random
import os
import tensorflow as tf

# CV
from sklearn.model_selection import KFold, StratifiedKFold

# Imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# linear
from sklearn.linear_model import LinearRegression

# LGB
import lightgbm as lgb
from lightgbm import LGBMRegressor

# metrics
from sklearn.metrics import mean_squared_error

# plot
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as mno
from tqdm import tqdm

import warnings
# Silence every Python warning for the rest of the notebook.
warnings.filterwarnings('ignore')
# Ask TensorFlow's native logger to hide INFO-level messages.
# NOTE(review): TensorFlow is already imported above; this variable is usually
# read when the library is loaded, so setting it here may have no effect — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
# Restrict TensorFlow's Python-side logger to ERROR messages.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [None]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Exports ``PYTHONHASHSEED`` into the process environment and seeds
    Python's ``random`` module, NumPy's global generator and TensorFlow
    with the same value.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)
    
# Single seed reused by the imputer, the LightGBM models and the CV splits below.
seed=2022
set_seed(seed)

# Data

In [None]:
# `df` is the feature table to impute; `sub` is the submission template.
df = pd.read_csv('../input/tabular-playground-series-jun-2022/data.csv')
sub = pd.read_csv('../input/tabular-playground-series-jun-2022/sample_submission.csv')
# Same template keyed by its 'row-col' column. It is derived from `sub` rather
# than parsing the CSV a second time; set_index returns a new frame, so the
# two objects remain independent.
submission = sub.set_index('row-col')

In [None]:
# Draw missingno's matrix plot of the raw data to see where values are missing.
mno.matrix(df, figsize = (20, 5))

In [None]:
# Every column except the row identifier and those whose name contains 'F_2'.
# NOTE(review): the variable name suggests these are the columns that hold
# missing values — confirm against the data.
missing_columns = [
    name
    for name in df.columns
    if not (name == 'row_id' or 'F_2' in name)
]

# Iterative Imputation (LightGBM)
Reference from  
https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html  
https://scikit-learn.org/stable/modules/impute.html#iterative-imputer 

In [None]:
# Regressor the imputer uses to predict each feature from all the others.
# `n_estimators` is the scikit-learn-API name for the number of boosting
# rounds; `num_boost_round` is only accepted as an alias.
base_model = LGBMRegressor(n_estimators=1000, random_state=seed)

lgb_iterative_imp = IterativeImputer(
    estimator=base_model,
    max_iter=20,
    initial_strategy='mean',  # {'mean', 'median', 'most_frequent', 'constant'}
    imputation_order='ascending',  # {'ascending', 'descending', 'roman', 'arabic', 'random'}
    # Columns with no missing values have nothing to impute; skipping them
    # avoids fitting one (unused) estimator per complete column per iteration.
    # They are still used as predictors for the other columns.
    skip_complete=True,
    verbose=1,
    random_state=seed,
)

# fit_transform returns a bare array, so the original column labels are restored.
# NOTE(review): `row_id` is part of `df` here and therefore acts as a predictor
# during imputation — confirm that this is intended.
lgb_iterative_imp_df = pd.DataFrame(lgb_iterative_imp.fit_transform(df), columns=df.columns)

In [None]:
# Same matrix plot on the imputed frame, to check visually that no gaps remain.
mno.matrix(lgb_iterative_imp_df, figsize = (20,5))

# View the prediction error for each feature
View the cross-validated RMSE obtained when each feature in turn is used as the target (objective) variable

In [None]:
# lightgbm
class ModelLgb:
    """Thin wrapper around a LightGBM booster trained with early stopping."""

    def __init__(self):
        # Trained booster; stays None until fit() has been called.
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train a regression booster on (tr_x, tr_y), validating on (va_x, va_y).

        Training runs for at most 1000 boosting rounds and stops early once
        the validation RMSE has not improved for 10 consecutive rounds. The
        trained booster is stored in ``self.model``.
        """
        params = {
            'objective': 'regression',
            'metric': 'rmse',
            'seed': seed,  # module-level seed defined earlier in the notebook
            'verbosity': -1,
            'learning_rate': 0.1,
        }

        num_round = 1000
        early_stopping_rounds = 10

        lgb_train = lgb.Dataset(tr_x, tr_y)
        lgb_eval = lgb.Dataset(va_x, va_y)

        # Early stopping is requested through a callback: the
        # `early_stopping_rounds` and `verbose_eval` keyword arguments of
        # lgb.train were deprecated in LightGBM 3.3 and removed in 4.0.
        self.model = lgb.train(
            params,
            lgb_train,
            num_boost_round=num_round,
            valid_sets=[lgb_eval],
            callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds)],
        )

        # Optional diagnostic: lgb.plot_importance(self.model, figsize=(20, 30))

    def predict(self, x):
        """Return predictions for ``x`` using the best iteration found by fit()."""
        return self.model.predict(x, num_iteration=self.model.best_iteration)

In [None]:
# run model & make prediction feature
def mk_predict(model, train_x, train_y):
    """Collect out-of-fold predictions for ``train_y`` with shuffled 5-fold CV.

    Args:
        model: object exposing ``fit(tr_x, tr_y, va_x, va_y)`` and
            ``predict(x)``; it is re-fitted on every fold.
        train_x: feature table, sliced positionally with ``.iloc``.
        train_y: target values aligned with ``train_x``, sliced with ``.iloc``.

    Returns:
        Tuple ``(train_preds, mean_rmse)``: the validation predictions of all
        folds put back into the original row order, and the mean of the
        per-fold RMSEs.
    """
    set_seed(seed)

    va_preds = []
    va_idxes = []
    rmses = []

    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    # tqdm wraps the split generator itself and is told the number of folds;
    # wrapped around enumerate() it has no length and cannot show progress.
    folds = tqdm(kf.split(train_x), total=kf.get_n_splits())
    for i, (tr_idx, va_idx) in enumerate(folds):

        print('='*15 + f'fold{i+1}' + '='*15)

        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

        model.fit(tr_x, tr_y, va_x, va_y)

        # valid predict & index
        va_pred = model.predict(va_x)
        va_preds.append(va_pred)
        va_idxes.append(va_idx)

        # valid loss (scikit-learn's signature is (y_true, y_pred))
        va_rmse = np.sqrt(mean_squared_error(va_y, va_pred))
        print(f'RMSE : {va_rmse}')
        rmses.append(va_rmse)

    # The folds are shuffled, so reorder the stacked predictions by their
    # original row position.
    va_idxes = np.concatenate(va_idxes)
    va_preds = np.concatenate(va_preds, axis=0)
    order = np.argsort(va_idxes)
    train_preds = va_preds[order]

    # mean RMSE
    mean_rmse = np.mean(rmses)
    print(f'Mean RMSE : {mean_rmse}')

    return train_preds, mean_rmse

In [None]:
%%time
pred_df = lgb_iterative_imp_df.copy()
rmse_dict = {}
for target_col in tqdm(lgb_iterative_imp_df.columns[1:]):
    print('#'*15 + target_col + '#'*15)
    train_x = lgb_iterative_imp_df.drop(['row_id', target_col], axis=1)
    train_y = lgb_iterative_imp_df[target_col]
    model_lgb = ModelLgb()
    pred_df[target_col], rmse = mk_predict(model_lgb, train_x, train_y)
    rmse_dict[target_col] = rmse

In [None]:
# View RMSE when each feature is used as the objective variable
# Bar chart of the mean CV RMSE recorded for each feature in `rmse_dict`.
data = rmse_dict
names = list(data)
values = [data[name] for name in names]
fig = plt.figure(figsize=(60, 20))

bar_positions = range(len(names))
plt.bar(bar_positions, values, tick_label=names)
plt.xlabel('Features')
plt.ylabel('RMSE')
plt.title('RMSE when each feature is used as the objective variable')
plt.show()

# Submit

In [None]:
# Iterative Imputation (LightGBM)
data = lgb_iterative_imp_df.set_index('row_id')

# Each submission key has the form '<row_id>-<column name>'. All keys are
# resolved in one vectorized lookup instead of one .loc read/write per row.
keys = submission.index.to_series().str.split('-', n=1, expand=True)
# Cast the row ids to the dtype of the row_id index so the labels compare equal.
row_ids = keys[0].astype(int).to_numpy().astype(data.index.dtype)
row_pos = data.index.get_indexer(row_ids)
col_pos = data.columns.get_indexer(keys[1])
if (row_pos < 0).any() or (col_pos < 0).any():
    # get_indexer marks unknown labels with -1; fail loudly as .loc would.
    raise KeyError('submission references a row id or column missing from the imputed data')
submission['value'] = data.to_numpy()[row_pos, col_pos]

sub['value'] = submission['value'].to_numpy()
sub.to_csv('Submission_lgb_iterative_imp.csv', index=False)

In [None]:
# Iterative Imputation (LightGBM) & Predict again
data = pred_df.set_index('row_id')

# Each submission key has the form '<row_id>-<column name>'. All keys are
# resolved in one vectorized lookup instead of one .loc read/write per row.
keys = submission.index.to_series().str.split('-', n=1, expand=True)
# Cast the row ids to the dtype of the row_id index so the labels compare equal.
row_ids = keys[0].astype(int).to_numpy().astype(data.index.dtype)
row_pos = data.index.get_indexer(row_ids)
col_pos = data.columns.get_indexer(keys[1])
if (row_pos < 0).any() or (col_pos < 0).any():
    # get_indexer marks unknown labels with -1; fail loudly as .loc would.
    raise KeyError('submission references a row id or column missing from the predicted data')
submission['value'] = data.to_numpy()[row_pos, col_pos]

sub['value'] = submission['value'].to_numpy()
sub.to_csv('Submission_lgb_pred.csv', index=False)