In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Read data

In [None]:
# Load the competition table from the read-only Kaggle input directory.
DATA_CSV_PATH = '/kaggle/input/tabular-playground-series-jun-2022/data.csv'
data = pd.read_csv(DATA_CSV_PATH)

In [None]:
def _columns_containing(tag):
    """Return the columns of `data` whose name contains `tag`."""
    return data.columns[data.columns.str.contains(tag)]


# Feature groups, selected by the substring present in the column name.
F_1_cols = _columns_containing('F_1_')
F_2_cols = _columns_containing('F_2_')
F_3_cols = _columns_containing('F_3_')
F_4_cols = _columns_containing('F_4_')

# Split columns by dtype; `row_id` is an identifier, so it is removed from the
# integer-typed group.
is_float_col = data.dtypes == 'float64'
is_int_col = data.dtypes == 'int64'
numeric_col = data.columns[is_float_col]
categorical_col = data.columns[is_int_col].drop('row_id')

In [None]:
# One correlation heatmap per feature group, stacked vertically in F_1..F_4 order.
fig, axs = plt.subplots(nrows=4, ncols=1, figsize=(18, 30))

feature_groups = {'F_1': F_1_cols, 'F_2': F_2_cols, 'F_3': F_3_cols, 'F_4': F_4_cols}
for i, (group_name, group_cols) in enumerate(feature_groups.items()):
    corr = data[group_cols].corr()
    # `enumerate` is 0-based, so axis i (not i-1) keeps the plots in group order;
    # i-1 would have drawn the first group on the last axis.
    sns.heatmap(corr, ax=axs[i], annot=True)
    axs[i].set_title('Correlation within {} columns'.format(group_name))

# Simple imputation with regressor for F_4 columns

In [None]:
# Flat array of the F_1, F_3 and F_4 column names, in that order.
missing_data_cols = np.concatenate((F_1_cols, F_3_cols, F_4_cols))

### Check the percentage of missing values in each column

In [None]:
# Fraction of missing values per column, drawn as a horizontal bar chart.
missing_fraction = data[missing_data_cols].isna().mean()
pd.DataFrame([missing_fraction]).T.plot(kind='barh', figsize=(15, 10))

Approximately 1.8% of the values are missing in each column.

### Create a validation set from non-missing data to evaluate imputation approaches. We sample 1.8% of the non-missing values in each column as validation data
- Every approach or experiment can then be evaluated on this validation set
- F_4 columns are imputed with a regressor: we first experiment with different configurations (input features, model parameters) and compare them on the validation data

In [None]:
# Held-out index labels per F_4 column (the regressor is only used to impute F_4).
# For each column, 1.8% of the rows where that column is present are sampled.
validation_index_f4 = {}
for col in F_4_cols:
    rows_with_value = data[data[col].notna()]
    held_out = rows_with_value.sample(frac=0.018, random_state=123)
    validation_index_f4[col] = held_out.index.values

### Lightgbm regressor

In [None]:
def baseline_regressor_scoring(data, numeric_col, validation_index, fillna=None):
    """Score a LightGBM regressor per target column on held-out rows.

    For every ``(col_name, index)`` pair in ``validation_index`` a model is
    trained to predict ``col_name`` from the remaining columns of
    ``numeric_col``. Training rows are those where ``col_name`` is present and
    whose index label is not in ``index``; the rows labelled by ``index`` are
    used for scoring.

    Parameters
    ----------
    data : pd.DataFrame
        Table holding all columns named in ``numeric_col``.
    numeric_col : pd.Index
        Columns available to the model; must contain every target column.
    validation_index : dict
        Maps target column name -> array of index labels held out for scoring.
    fillna : scalar, optional
        When given, NaNs in the feature matrices are replaced by this value.
        When ``None`` (default) NaNs are passed to LightGBM unchanged.

    Returns
    -------
    dict
        Maps target column name -> one-element list with the value returned by
        ``LGBMRegressor.score`` on the held-out rows.
    """
    learning_rate = 0.1  # try different lr for experiment
    scoring_dict = dict()
    for col_name, index in validation_index.items():
        print('Start f: ', col_name)
        feature_cols = numeric_col.drop(col_name)

        # Train on rows where the *current* target is present (the original code
        # filtered on an unrelated global `col`) and that are not held out.
        target_present = data[~data[col_name].isnull()]
        train_data = target_present[~target_present.index.isin(index)]
        X_train = train_data[feature_cols].copy()
        y_train = train_data[col_name]

        # `index` holds index labels (taken from `.index.values`), so select by
        # label; `.iloc` is only equivalent for a default RangeIndex.
        val_data = data.loc[index]
        X_val = val_data[feature_cols].copy()
        y_val = val_data[col_name]

        if fillna is not None:
            # Use the caller-supplied fill value rather than a hard-coded one.
            X_train = X_train.fillna(fillna)
            X_val = X_val.fillna(fillna)

        # fit model
        reg = lgb.LGBMRegressor(n_estimators=2000, device='gpu', learning_rate=learning_rate,
                                objective='mean_squared_error', n_jobs=4)
        reg.fit(X_train, y_train)

        scoring_dict[col_name] = [reg.score(X_val, y_val)]
    return scoring_dict

In [None]:
# use only F_4 features to predict F_4 columns
lightgbm_regressor = baseline_regressor_scoring(data, F_4_cols, validation_index_f4)
lightgbm_regressor = pd.DataFrame(lightgbm_regressor)

### NN regressor

In [None]:
def create_nn_model(input_shape):
    """Build and compile the fully connected regression network.

    Parameters
    ----------
    input_shape : int or sequence of int
        Number of input features, or the per-sample input shape.

    Returns
    -------
    keras.Sequential
        Dense(64) -> Dense(256) -> Dense(256) -> Dense(64) stack with ReLU
        activations, each followed by batch normalization, and a single linear
        output unit. Compiled with mean squared error loss and Adam (lr 0.001).
    """
    # Callers pass the feature count as a bare int; newer Keras versions only
    # accept a tuple for `shape`, so normalise here.
    if isinstance(input_shape, int):
        shape = (input_shape,)
    else:
        shape = tuple(input_shape)

    model = keras.Sequential([keras.layers.Input(shape=shape),
                              keras.layers.Dense(64, activation='relu'),
                              keras.layers.BatchNormalization(),
                              keras.layers.Dense(256, activation='relu'),
                              keras.layers.BatchNormalization(),
                              keras.layers.Dense(256, activation='relu'),
                              keras.layers.BatchNormalization(),
                              keras.layers.Dense(64, activation='relu'),
                              keras.layers.BatchNormalization(),
                              keras.layers.Dense(1, activation='linear')])

    model.compile(loss=keras.losses.MeanSquaredError(),
                  optimizer=tf.keras.optimizers.Adam(0.001))
    return model

In [None]:
def train_nn_reg(data, numeric_col, validation_index, scaler, dropna=True, fill_value=0):
    """Score the neural-network regressor per target column on held-out rows.

    Parameters
    ----------
    data : pd.DataFrame
        Table holding all columns named in ``numeric_col``.
    numeric_col : pd.Index
        Columns used by the model (features plus the target columns).
    validation_index : dict
        Maps target column name -> array of index labels held out for scoring.
    scaler : object or None
        Object with ``fit``/``transform`` (e.g. a scikit-learn scaler). It is
        re-fitted on the training features of every target column. ``None``
        disables scaling.
    dropna : bool, default True
        If True, rows with a NaN in any column of ``numeric_col`` are dropped
        from both the training and validation sets. If False, NaNs in the
        features are replaced by ``fill_value``.
    fill_value : scalar, default 0
        Replacement for feature NaNs when ``dropna`` is False.

    Returns
    -------
    dict
        Maps target column name -> one-element list with the R^2 score
        (``r2_score``) on the held-out rows.
    """
    scoring_dict = dict()
    used_cols = list(numeric_col)
    for col_name, index in validation_index.items():
        print('Start f: ', col_name)
        feature_cols = numeric_col.drop(col_name)

        # Training rows: target present and not held out for validation.
        target_present = data[~data[col_name].isnull()]
        train_data = target_present[~target_present.index.isin(index)].copy()

        # `index` holds index labels, so select by label; `.iloc` is only
        # equivalent when `data` has a default RangeIndex.
        val_data = data.loc[index].copy()
        if dropna:
            # Only NaNs in columns the model actually uses should discard a row.
            train_data = train_data.dropna(subset=used_cols)
            val_data = val_data.dropna(subset=used_cols)

        X_train = train_data[feature_cols].copy()
        y_train = train_data[col_name]

        X_val = val_data[feature_cols].copy()
        y_val = val_data[col_name]

        if not dropna:
            X_train = X_train.fillna(fill_value)
            X_val = X_val.fillna(fill_value)

        X_train = X_train.values
        X_val = X_val.values
        if scaler is not None:
            # Fit on training features only, then apply to both splits.
            scaler = scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_val = scaler.transform(X_val)

        model = create_nn_model(X_train.shape[-1])
        early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min',
                                                      patience=5, restore_best_weights=True)

        # Early stopping monitors the internal 10% split taken by Keras, not
        # the held-out rows scored below.
        model.fit(X_train, y_train, batch_size=2048, epochs=20, shuffle=True, verbose=0,
                  validation_split=0.1, callbacks=[early_stop])
        scoring_dict[col_name] = [r2_score(y_val, model.predict(X_val).flatten())]
        print(scoring_dict[col_name])
    print('-'*40)
    return scoring_dict

In [None]:
# NN on all feature groups, rows with NaNs dropped, no feature scaling.
nn_baseline_features = F_1_cols.append([F_2_cols, F_3_cols, F_4_cols])
nn_baseline_scores = train_nn_reg(
    data,
    nn_baseline_features,
    validation_index_f4,
    scaler=None,
    dropna=True,
)
nn_baseline = pd.DataFrame(nn_baseline_scores)

In [None]:
# NN on all feature groups, rows with NaNs dropped, features standardized.
nn_baseline_norm_features = F_1_cols.append([F_2_cols, F_3_cols, F_4_cols])
nn_baseline_norm_scores = train_nn_reg(
    data,
    nn_baseline_norm_features,
    validation_index_f4,
    scaler=StandardScaler(),
    dropna=True,
)
nn_baseline_norm = pd.DataFrame(nn_baseline_norm_scores)

In [None]:
# NN on all feature groups, rows kept and feature NaNs filled with 0, no scaling.
nn_keepna_features = F_1_cols.append([F_2_cols, F_3_cols, F_4_cols])
nn_keepna_scores = train_nn_reg(
    data,
    nn_keepna_features,
    validation_index_f4,
    scaler=None,
    dropna=False,
    fill_value=0,
)
nn_baseline_no_scale_keepna = pd.DataFrame(nn_keepna_scores)

The results of the different methods can be visualized here

In [None]:
# Stack the one-row score tables (one row per method, one column per F_4 target).
# `DataFrame.append` was removed in pandas 2.0, so use `pd.concat`.
scoring = pd.concat(
    [lightgbm_regressor[F_4_cols], nn_baseline, nn_baseline_norm, nn_baseline_no_scale_keepna],
    ignore_index=True,
)
# The last label must be exactly 'nn_baseline_no_scale_keepna': later cells
# filter `scoring.label` on that string, and the previous 'nn baseline keepna'
# never matched it.
scoring['label'] = ['lightgbm', 'nn baseline', 'nn baseline norm', 'nn_baseline_no_scale_keepna']
# Long format: one row per (method label, target column) with its score.
scoring = scoring.melt(['label'], var_name='col', value_name='score')

In [None]:
# Among the two candidate methods, keep the highest-scoring row for each column.
candidate_labels = ['nn_baseline_no_scale_keepna', 'lightgbm']
candidate_scores = scoring[scoring.label.isin(candidate_labels)]
best_row_per_col = candidate_scores.groupby('col')['score'].idxmax()
F4_impute_configs = scoring.loc[best_row_per_col]

In [None]:
# Grouped bar chart: validation score per F_4 column, one bar per method.
sns.catplot(
    data=scoring,
    kind="bar",
    x="col",
    y="score",
    hue="label",
    palette='dark',
    alpha=.6,
    height=5,
    aspect=2,
)

In [None]:
# impute F4 columns with configs found above
def train_reg(data, numeric_col, impute_cols):
    """Impute missing values of ``impute_cols`` with per-column LightGBM models.

    For each column a regressor is trained on the rows where that column is
    present, using the other columns of ``numeric_col`` as features, and its
    predictions fill the rows where the column is missing.

    Parameters
    ----------
    data : pd.DataFrame
        Source table; it is not modified.
    numeric_col : pd.Index
        Columns available to the model; must contain every column to impute.
    impute_cols : sequence of str
        Columns to impute.

    Returns
    -------
    pd.DataFrame
        Copy of ``data[impute_cols]`` with the missing entries filled.
    """
    imputed_data = data.copy()

    for col in impute_cols:
        print('Start training: ', col)
        feature_cols = numeric_col.drop(col)

        # Rows with the target present are used for training, the rest are predicted.
        is_missing = data[col].isnull()
        temp_data = data[~is_missing]
        pred_data = data[is_missing]

        if pred_data.empty:
            # Nothing to impute: skip the fit, and avoid calling predict on
            # zero rows.
            print('No missing values in {}, skipping'.format(col))
            continue

        X = temp_data[feature_cols]
        y = temp_data[col]

        reg = lgb.LGBMRegressor(n_estimators=50000, device='gpu', metric='rmse', n_jobs=-1)
        reg.fit(X, y)

        imputed_data.loc[pred_data.index, col] = reg.predict(pred_data[feature_cols])

        print('Training score {}: '.format(col), reg.score(X, y))
        print('-'*40)
    return imputed_data[impute_cols]

In [None]:
# Columns for which LightGBM was selected, imputed from the F_4 features only.
lightgbm_selected = F4_impute_configs.label == 'lightgbm'
impute_cols_lightgbm = F4_impute_configs.loc[lightgbm_selected, 'col'].values
imputed_f4_data_lightgbm = train_reg(
    data,
    numeric_col=F_4_cols,
    impute_cols=impute_cols_lightgbm,
)

In [None]:
def train_nn_reg_pred(data, numeric_col, impute_cols):
    """Impute missing values of ``impute_cols`` with per-column neural networks.

    For each column one network is created and then fitted successively on the
    training part of each of 5 shuffled folds, with the remaining part of the
    fold used as validation data for early stopping. Feature NaNs are replaced
    by 0 for both training and prediction.

    Requires ``KFold`` from ``sklearn.model_selection`` to be imported at the
    top of the notebook.

    Parameters
    ----------
    data : pd.DataFrame
        Source table; it is not modified.
    numeric_col : pd.Index
        Columns available to the model; must contain every column to impute.
    impute_cols : sequence of str
        Columns to impute.

    Returns
    -------
    pd.DataFrame
        Copy of ``data[impute_cols]`` with the missing entries filled.
    """
    imputed_data = data.copy()

    for col in impute_cols:
        print('Start training: ', col)
        feature_cols = numeric_col.drop(col)

        # Rows with the target present are used for training, the rest are predicted.
        is_missing = data[col].isnull()
        temp_data = data[~is_missing]
        pred_data = data[is_missing]

        if pred_data.empty:
            # Nothing to impute: skip the fit, and avoid calling predict on
            # zero rows.
            print('No missing values in {}, skipping'.format(col))
            continue

        X = temp_data[feature_cols].fillna(0)
        y = temp_data[col]

        model = create_nn_model(X.shape[-1])
        early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min',
                                                      patience=5, restore_best_weights=True)

        # The same model instance keeps training across folds, so by the end
        # it has been fitted on every row at least once.
        folds = KFold(n_splits=5, shuffle=True, random_state=123).split(X, y)
        for j, (train_idx, val_idx) in enumerate(folds):
            print('\nFold ', j)

            x_fold_train = X.values[train_idx]
            x_fold_valid = X.values[val_idx]
            y_fold_train = y.values[train_idx]
            y_fold_valid = y.values[val_idx]
            model.fit(x_fold_train, y_fold_train, batch_size=2048, epochs=30, shuffle=True, verbose=0,
                      validation_data=(x_fold_valid, y_fold_valid), callbacks=[early_stop])

        print('Training score {}: '.format(col), r2_score(y, model.predict(X).flatten()))

        X_missing = pred_data[feature_cols].fillna(0)
        imputed_data.loc[pred_data.index, col] = model.predict(X_missing).flatten()

    return imputed_data[impute_cols]

In [None]:
# Columns for which the NN (keep-NaN, unscaled) configuration was selected.
nn_selected = F4_impute_configs.label == 'nn_baseline_no_scale_keepna'
impute_cols_nn = F4_impute_configs.loc[nn_selected, 'col'].values

# The NN imputation itself is run for every F_4 column, using all feature groups.
nn_feature_cols = pd.Index(np.concatenate([F_4_cols, F_1_cols, F_3_cols, F_2_cols]))
imputed_f4_data_nn = train_nn_reg_pred(
    data,
    numeric_col=nn_feature_cols,
    impute_cols=F_4_cols,
)

In [None]:
# Start from a copy of the raw table so `data` keeps its original NaNs.
imputed_data = data.copy()
# Write the LightGBM imputations into the columns selected for LightGBM.
imputed_data.loc[:, impute_cols_lightgbm] = imputed_f4_data_lightgbm
# NOTE(review): this assigns every F_4 column from the NN result, including any
# column written on the previous line — confirm that overriding the LightGBM
# values is intended.
imputed_data.loc[:, F_4_cols] = imputed_f4_data_nn

 - Impute F1 and F3 columns with basic methods

In [None]:
# Held-out index labels per F_1 / F_3 column: 1.8% of the rows where the
# column is present.
validation_index_F1_F3 = {}
for col in np.concatenate([F_1_cols, F_3_cols]):
    rows_with_value = data[data[col].notna()]
    held_out = rows_with_value.sample(frac=0.018, random_state=123)
    validation_index_F1_F3[col] = held_out.index.values

In [None]:
# find the best basic method for imputing the F_1 and F_3 columns
def basic_impute_scoring(data, validation_index):
    """Score constant-value imputation (median, mean, zero) per column.

    Parameters
    ----------
    data : pd.DataFrame
        Table holding every column named in ``validation_index``.
    validation_index : dict
        Maps column name -> array of index labels used as validation rows.

    Returns
    -------
    dict
        Maps column name -> ``[median_rmse, mean_rmse, zero_rmse]``: the RMSE
        between the validation values and, respectively, the column median,
        the column mean and 0. Median and mean are computed over the whole
        column (pandas skips NaNs), validation rows included.
    """
    def _rmse_against_constant(values, constant):
        # RMSE between an array and a single constant prediction.
        return float(np.sqrt(np.mean((values - constant) ** 2)))

    scoring_dict = dict()
    for col_name, index in validation_index.items():
        # `index` holds index labels, so select by label; `.iloc` is only
        # equivalent when `data` has a default RangeIndex.
        val_data = data.loc[index, col_name].to_numpy(dtype=float)
        column = data[col_name]

        median_score = _rmse_against_constant(val_data, column.median())
        mean_score = _rmse_against_constant(val_data, column.mean())
        zero_score = _rmse_against_constant(val_data, 0.0)

        scoring_dict[col_name] = [median_score, mean_score, zero_score]
    return scoring_dict

In [None]:
# Rows are the three methods (in the order returned by the scorer); melt to one
# row per (method, column) with its RMSE in `value`.
basic_impute = (
    pd.DataFrame(basic_impute_scoring(data, validation_index_F1_F3))
    .assign(label=['median', 'mean', 'zeros'])
    .melt('label', var_name='col')
)
basic_impute

In [None]:
# For each column keep the method with the lowest RMSE, as (label, col) pairs.
best_method_rows = basic_impute.groupby('col')['value'].idxmin()
impute_configs = basic_impute.loc[best_method_rows, ['label', 'col']].values
impute_configs

In [None]:
# Fill value per method, computed from the current contents of the column.
fill_value_for = {
    'zeros': lambda series: 0,
    'median': lambda series: series.median(),
    'mean': lambda series: series.mean(),
}

for method, col in impute_configs:
    # Unknown method names leave the column untouched.
    if method in fill_value_for:
        missing_rows = imputed_data[col].isnull()
        imputed_data.loc[missing_rows, col] = fill_value_for[method](imputed_data[col])

# Generate submission file
- Generate function is taken from [here](https://www.kaggle.com/code/ygorana/tps-june-2022-ultra-fast-submissions-50s)

In [None]:
def generate_submission(source_df: pd.DataFrame, output_df: pd.DataFrame) -> pd.DataFrame:
    """Collect the values of ``output_df`` at the positions that are NaN in ``source_df``.

    Returns a single-column (``value``) frame whose index, named ``row-col``,
    holds ``'<row label>-<column name>'`` strings, sorted by (row, column).
    """
    # Long-format mask of the source: index (row, col), keeping only NaN cells.
    null_mask_long = source_df.isna().melt(ignore_index=False, var_name='col', value_name='isNull')
    null_cells = null_mask_long.query('isNull == True')
    missing_keys = null_cells.set_index(['col'], append=True).index

    # Long-format values of the output, indexed the same way.
    values_long = output_df.melt(ignore_index=False, var_name='col')
    values_long = values_long.set_index(['col'], append=True)

    # Pick exactly the originally-missing cells, ordered by (row, col).
    picked = values_long.loc[missing_keys].sort_index()

    # Collapse the (row, col) pairs into flat string labels.
    picked.index = pd.Index([f'{r}-{c}' for r, c in picked.index], name='row-col')
    return picked

In [None]:
# Build the submission rows, then turn the `row-col` index into a regular column.
submission_by_key = generate_submission(data, imputed_data)
submission = submission_by_key.reset_index()

In [None]:
# Write the submission to the working directory without the positional index.
submission.to_csv(path_or_buf='submission.csv', index=False)