In [None]:
# Read data

import pandas as pd
from pathlib import Path

# Directory holding the competition files.
input_path = Path('../input/tabular-playground-series-jun-2022/')

# Raw table with missing values, indexed by row_id.
data = pd.read_csv(input_path.joinpath('data.csv'), index_col='row_id')

# Submission template, indexed by its "row-col" key.
sample_submission = pd.read_csv(
    input_path.joinpath('sample_submission.csv'), index_col='row-col'
)

print(data.shape)
data.head()


In [None]:
# Create a baseline imputed copy of the data.
import numpy as np

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import IterativeImputer


def create_imputed_data(
    imputer, base_data: pd.DataFrame, name: str
) -> pd.DataFrame:
    """Fit ``imputer`` on ``base_data`` and return the filled-in table.

    Args:
        imputer: Object exposing ``fit_transform``; it receives a copy of
            ``base_data``, so the caller's frame is not handed to it directly.
        base_data: Table to impute.
        name: Label used only in the progress message.

    Returns:
        A new DataFrame built from the imputer's output, carrying the same
        column labels and index as ``base_data``.
    """
    filled_values = imputer.fit_transform(base_data.copy())
    result = pd.DataFrame(
        filled_values,
        columns=base_data.columns,
        index=base_data.index
    )
    print(f"finish create impute by {name}.")
    return result


# Candidate imputers, keyed by the label that is printed when one is used.
imputers = dict(
    BayesianRidge=IterativeImputer(),
    RandomForest=IterativeImputer(estimator=RandomForestRegressor()),
    AdaBoostRegressor=IterativeImputer(estimator=AdaBoostRegressor()),
    SimpleMean=SimpleImputer(missing_values=np.nan, strategy='mean'),
)

# Select one of: 'BayesianRidge', 'RandomForest', 'AdaBoostRegressor', 'SimpleMean'.
key = 'SimpleMean'

imputed_data = create_imputed_data(imputers[key], data, name=key)


In [None]:

# Per-column flag: does the raw data contain at least one missing value?
has_missing = data.isnull().any()
columns_with_missing = [
    name for name in data.columns.values if has_missing[name]
]

# Append one boolean "<column>_was_missing" indicator per affected column,
# leaving imputed_data itself untouched.
imputed_data_with_flag = imputed_data.assign(**{
    col + '_was_missing': data[col].isnull()
    for col in columns_with_missing
})

imputed_data_with_flag.head()


In [None]:
# Predict the missing values with LightGBM

from sklearn.model_selection import train_test_split
import lightgbm as lgb


def create_lgb_model(
    train: pd.DataFrame, predict_column: str, random_state: int = 42
) -> lgb.Booster:
    """Train a LightGBM L2 regressor for ``predict_column``.

    All other columns of ``train`` are used as features. A quarter of the
    rows is held out as a validation set that drives early stopping.

    Args:
        train: Rows with a known value in ``predict_column``.
        predict_column: Name of the target column.
        random_state: Seed for the train/validation split. A fixed default
            keeps the split (and therefore the early-stopping point and the
            resulting predictions) reproducible across notebook re-runs.
            Pass ``None`` to get a different split on every call.

    Returns:
        The trained booster.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        train.drop(columns=[predict_column]),
        train.loc[:, predict_column],
        test_size=0.25,
        random_state=random_state
    )

    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

    params = {
        'objective': 'regression_l2',
        'force_col_wise': True,
    }

    # Up to 1000 boosting rounds; metrics are logged every 10 rounds and
    # training is cut short by the early-stopping callback (patience of 10).
    return lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        num_boost_round=1000,
        callbacks=[lgb.log_evaluation(10), lgb.early_stopping(10)]
    )


def predict_missing_value(
    column: str,
    imputed_data_with_flag_for_predict: pd.DataFrame
) -> pd.DataFrame:
    """Predict the missing entries of ``column`` with a LightGBM model.

    The feature table is copied and its ``column`` is replaced by the raw
    values from the notebook-level ``data`` frame, so the rows to predict are
    exactly those where the raw value is missing. The model is trained on the
    remaining rows.

    Args:
        column: Name of the column whose missing values are predicted.
        imputed_data_with_flag_for_predict: Feature table; not modified.

    Returns:
        A single-column DataFrame named ``column`` holding the predictions,
        indexed by the rows where the raw value is missing.
    """
    frame = imputed_data_with_flag_for_predict.copy()
    frame[column] = data.loc[:, column]

    is_missing = frame[column].isnull()
    known_rows = frame[~is_missing]
    unknown_rows = frame[is_missing]

    booster = create_lgb_model(known_rows, column)
    predicted_values = booster.predict(
        unknown_rows.drop(columns=[column]),
        num_iteration=booster.best_iteration
    )

    print(f"finish predict : {column}")

    return pd.DataFrame(
        predicted_values, index=unknown_rows.index, columns=[column]
    )


# First round of predictions: one single-column frame per column that has
# missing values, built on top of the baseline imputation.
predict_dictionary = dict(
    (column, predict_missing_value(column, imputed_data_with_flag))
    for column in columns_with_missing
)


In [None]:
# Re-predict the missing values, using the previous predictions as features
from tqdm import tqdm
from typing import Dict


def predict_by_predict(predict_dic: Dict[str, pd.DataFrame]) -> Dict[str, pd.DataFrame]:
    """Run one refinement round: re-predict using the previous predictions.

    Starting from a copy of the notebook-level ``imputed_data_with_flag``,
    every value that is missing in the raw ``data`` is overwritten with the
    corresponding prediction from ``predict_dic``. Each column listed in the
    notebook-level ``columns_with_missing`` is then predicted again on top of
    that updated feature table.

    Args:
        predict_dic: Maps a column name to a single-column DataFrame of
            predictions indexed by the rows where that column is missing.

    Returns:
        A new dictionary of the same form holding the refreshed predictions.

    Raises:
        KeyError: If a prediction frame lacks a row that is missing in
            ``data`` for that column.
    """
    data_for_predict = imputed_data_with_flag.copy()

    for column, predicted in predict_dic.items():
        # Write every prediction of this column in one vectorised assignment
        # instead of one scalar .loc write per missing row.
        missing_index = data.index[data[column].isnull().to_numpy()]
        data_for_predict.loc[missing_index, column] = (
            predicted.loc[missing_index, column].to_numpy()
        )

    return {
        column: predict_missing_value(column, data_for_predict)
        for column in columns_with_missing
    }


# Number of times the predictions are fed back in and refined.
refinement_rounds = 3
for _ in range(refinement_rounds):
    predict_dictionary = predict_by_predict(predict_dictionary)


In [None]:
# Create submission file

from tqdm import tqdm

# Each submission key has the form "<row_id>-<column>". Parse all keys once
# and fill the values one column at a time, rather than doing one scalar
# .loc write per submission row.
submission_keys = sample_submission.index.to_series().str.split('-')
submission_rows = submission_keys.str[0].astype(int)
submission_cols = submission_keys.str[1]

for col, rows_for_col in submission_rows.groupby(submission_cols):
    # rows_for_col: index = submission keys, values = row ids in `data`.
    sample_submission.loc[rows_for_col.index, 'value'] = (
        predict_dictionary[col].loc[rows_for_col.to_numpy(), col].to_numpy()
    )

sample_submission.to_csv('impute_and_predict.csv')
