In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Submission template; its index holds the "row-col" ids that must be filled in.
df_submission = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jun-2022/sample_submission.csv',
    index_col='row-col',
)
df_submission

In [None]:
# Allow up to 100 columns in rich DataFrame displays, then load the data set
# indexed by its row_id column.
pd.options.display.max_columns = 100
df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jun-2022/data.csv',
    index_col='row_id',
)
df

# Check the correlation

In [None]:
from matplotlib import pylab as plt
import seaborn as sns

# Large canvas so that the per-cell annotations stay readable; the heatmap is
# drawn on the axes created here rather than on the implicit "current" axes.
fig, ax = plt.subplots(figsize=(40, 40))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.1f', ax=ax)

* It is better to use the average to replace the missing values in F_1 and F_3.
* F_4 has no correlation with F_1, F_2, and F_3

# Modeling with F_4 features only

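I took an approach similar to masked language modeling (MLM): some of the known values are randomly hidden, and the model is trained to reconstruct them from the remaining ones.  
A single model is enough, but it needs a lot of parameters to reach a high score.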

In [None]:
import itertools

def data_generator(df, batch_size, is_train=True):
    """Endlessly yield ``(inputs, target)`` batches built from the ``F_4_*`` columns.

    Each batch is ``([values, mask, flg_nan], values)`` where every array has
    shape ``(rows_in_batch, number_of_F_4_columns)``:

    * ``values``  - the F_4 values with NaN replaced by 0 (``np.nan_to_num``).
    * ``mask``    - 1.0 where the value is shown to the model, 0.0 where it is
      hidden. Missing entries are always hidden. When ``is_train`` is true an
      additional ~15% of entries are hidden at random (uses the global
      ``np.random`` state).
    * ``flg_nan`` - 1.0 where the original value was NaN when ``is_train`` is
      true; all zeros otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; only columns whose name starts with ``'F_4_'`` are used.
        The caller's frame is never modified.
    batch_size : int
        Maximum number of rows per batch (the last batch of a pass may be
        smaller).
    is_train : bool, default True
        If true, rows are reshuffled at the start of every pass and random
        masking is applied. If false, rows keep their original order.

    Raises
    ------
    ValueError
        If ``df`` has no rows (the generator could never yield a batch).
    """
    # Boolean-mask selection returns a new frame, so the caller's df is untouched.
    x_float = df.loc[:, df.columns.str.startswith('F_4_')]
    if len(x_float) == 0:
        raise ValueError('data_generator received an empty DataFrame')
    n_batches = (len(x_float) - 1) // batch_size + 1

    while True:
        if is_train:
            # DataFrame.sample returns a *new* shuffled frame; the result has to
            # be kept, otherwise every pass iterates the rows in the same order.
            x_float = x_float.sample(frac=1)
        for i in range(n_batches):
            _x_float = x_float.iloc[i * batch_size:(i + 1) * batch_size].values
            if is_train:
                # Hide roughly 15% of the entries at random.
                mask = (np.random.rand(*_x_float.shape) > 0.15).astype(float)
            else:
                mask = np.ones(_x_float.shape)
            flg_nan = np.isnan(_x_float).astype(float)
            mask = mask * (1 - flg_nan)  # if flg_nan is 1 then mask is 0.
            if not is_train:
                # Outside training the NaN flag is zeroed out.
                flg_nan = np.zeros(_x_float.shape)
            filled = np.nan_to_num(_x_float)
            yield [filled, mask, flg_nan], np.nan_to_num(_x_float)

In [None]:
import tensorflow as tf
# Fix TensorFlow's global random seed before the model is built and trained.
tf.random.set_seed(42)

In [None]:
# Number of F_4_* columns; used as the width of every model input and of the
# final Dense layer.
float_dim = df.columns.str.startswith('F_4_').sum()

# --- Architecture -----------------------------------------------------------
# NOTE(review): the trailing numbers look like the larger values used for the
# full-scale run - TODO confirm.
n_d = 16  # 2048
n_depth = 2  # 6
n_bottleneck = 16  # 512
n_mul_growth = 1      # multiplicative width growth per depth step
n_add_growth = n_d    # additive width growth per depth step
is_concat = True      # concatenate all earlier block outputs before each block
dropout_p = 0         # 0 disables the Dropout layers

# --- Training ---------------------------------------------------------------
optimizer = 'nadam'
reduce_lr_patience = 3  # 15
early_stopping_patience = 5  # 30


# --- Inputs -------------------------------------------------------------------
# Three inputs of width float_dim, in the order the data generator yields them:
#   float_input   - F_4 values with NaN replaced by 0
#   mask_input    - 1 where the value is visible to the network, 0 where hidden
#   flg_nan_input - 1 where the value was originally missing (training only)
float_input = tf.keras.layers.Input(
    shape=(float_dim,), name='float_input'
)
mask_input = tf.keras.layers.Input(
    shape=(float_dim,), name='mask_input'
)
# Hidden entries are zeroed before they reach the network.
embeds_output = tf.keras.layers.Multiply()([float_input, mask_input])

flg_nan_input = tf.keras.layers.Input(
    shape=(float_dim,), name='flg_nan_input'
)
inputs = [float_input, mask_input, flg_nan_input]


xs = [embeds_output]
x = embeds_output

# Main Network
# Each step: (optional) concat of everything produced so far -> (optional)
# bottleneck Dense -> Dense -> BatchNorm -> (optional) Dropout.
for i in range(n_depth):
    if is_concat:
        x = tf.keras.layers.Concatenate()(xs)
    if n_bottleneck:
        x = tf.keras.layers.Dense(n_bottleneck, activation='relu')(x)
    x = tf.keras.layers.Dense(n_d * (n_mul_growth ** i) + (n_add_growth * i), activation='relu')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    if dropout_p:
        x = tf.keras.layers.Dropout(dropout_p)(x)
    xs.append(x)

if is_concat:
    x = tf.keras.layers.Concatenate()(xs)
x = tf.keras.layers.Dense(float_dim)(x)
x = tf.keras.layers.Multiply()([x, tf.ones((float_dim,))-flg_nan_input])  # if missing value, then output is 0. (Also input is 0.)
a1 = tf.keras.layers.Multiply()([float_input, mask_input])  # if mask is 1, then output == input.
a2 = tf.keras.layers.Multiply()([x, tf.ones((float_dim,))-mask_input])  # if mask is 0, then output == x.
output = tf.keras.layers.Add()([a1, a2])

model = tf.keras.models.Model(inputs=inputs, outputs=output)

# `metrics` must be a list (or dict): newer Keras releases reject a bare Metric
# object. The metric keeps its default name, so the callbacks monitoring
# 'val_root_mean_squared_error' are unaffected.
model.compile(
    optimizer=optimizer,
    loss='mean_squared_error',
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

In [None]:
# Print the layer-by-layer structure and parameter counts of the compiled model.
model.summary()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows (fixed seed); df_test is used as the validation set
# during training.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# All three callbacks watch the same quantity: the validation RMSE.
monitored_metric = 'val_root_mean_squared_error'

# Stop when the metric stops improving and roll back to the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor=monitored_metric,
    patience=early_stopping_patience,
    restore_best_weights=True,
)

# Halve the learning rate when the metric plateaus.
reduce_lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor=monitored_metric,
    factor=0.5,
    patience=reduce_lr_patience,
)

# Keep only the best weights on disk.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='weight.hdf5',
    monitor=monitored_metric,
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)

In [None]:
# Seed NumPy so that the random masking done inside data_generator is repeatable.
np.random.seed(42)
batch_size = 4096
gen_train = data_generator(df_train, batch_size=batch_size)
# The validation generator also runs with the default is_train=True, i.e. its
# batches are randomly masked as well.
gen_test = data_generator(df_test, batch_size=batch_size)

# Model.fit accepts Python generators directly; Model.fit_generator is
# deprecated and no longer available in newer Keras releases.
model.fit(
    gen_train,
    steps_per_epoch=(len(df_train)-1)//batch_size + 1,
    epochs=800,
    validation_data=gen_test,
    # Five passes over the validation rows per epoch - presumably to average
    # the metric over several random maskings (TODO confirm).
    validation_steps=5 * ((len(df_test)-1)//batch_size + 1),
    callbacks=[early_stopping, reduce_lr_on_plateau, model_checkpoint],
)

# Predict F_4 missing values

In [None]:
import tensorflow as tf
# Reset TensorFlow's global random seed before the prediction cells.
tf.random.set_seed(42)

In [None]:
# Drop the training/validation frames and their generators; they are not needed
# for prediction.
del df_train, df_test, gen_train, gen_test

In [None]:
import gc
# Force a garbage-collection pass; intended to reclaim the memory of the
# training objects deleted just before.
gc.collect()

In [None]:
np.random.seed(42)
batch_size = 4096

# is_train=False: rows stay in their original order and only the genuinely
# missing entries are masked, so the model fills exactly those.
gen_pred = data_generator(df, batch_size=batch_size, is_train=False)

# Enough steps for exactly one pass over every row of df.
n_pred_steps = (len(df) - 1) // batch_size + 1
pred = model.predict(gen_pred, steps=n_pred_steps)

In [None]:
# Sanity check: should be (len(df), number of F_4_* columns).
pred.shape

# Impute F_1, F_3, F_4

In [None]:
# Work on a copy of the float columns only, then overwrite the F_4_* block with
# the model output (observed values pass through the model unchanged, missing
# ones are replaced by its predictions).
_df_sub = df.select_dtypes('float').copy()
is_f4_column = _df_sub.columns.str.startswith('F_4_')
_df_sub.loc[:, is_f4_column] = pred
_df_sub

In [None]:
from sklearn.impute import SimpleImputer

# Fill every NaN that is still left with the mean of its column.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(_df_sub)

_df_sub[:] = imp.transform(_df_sub)
_df_sub

# Submission

In [None]:
from tqdm import tqdm 

# Every submission id has the form "<row_id>-<column name>". Resolve all ids to
# positions in _df_sub and fetch the values with a single vectorised lookup
# instead of one .loc read and one .loc write per id.
id_parts = [row_col.split('-') for row_col in df_submission.index]
row_labels = [int(parts[0]) for parts in id_parts]
col_labels = [parts[1] for parts in id_parts]

row_pos = _df_sub.index.get_indexer(row_labels)
col_pos = _df_sub.columns.get_indexer(col_labels)
# get_indexer marks unknown labels with -1, which would silently select the
# last row/column; fail loudly instead, as a label-based lookup would.
if (row_pos < 0).any() or (col_pos < 0).any():
    raise KeyError('submission index refers to a row or column missing from _df_sub')

df_submission['value'] = _df_sub.to_numpy()[row_pos, col_pos]

df_submission.to_csv('submisson_neural.csv')