Thank you @ehekatlact for sharing the idea of using the NA count of each record, and @hiro5299834 and @edrickkesuma for the great improvements. Please upvote the following notebooks:
* https://www.kaggle.com/code/ehekatlact/tps2206-the-na-count-of-each-record-is-critical
* https://www.kaggle.com/code/hiro5299834/tps-jun-2022-pytorch-lightning-with-na-counts
* https://www.kaggle.com/code/edrickkesuma/np-random-top-public-notebook

Since the notebooks above were made using PyTorch, I thought it would be very interesting and fun to create a TensorFlow implementation of this idea. The only modifications I made besides the translation to TensorFlow were:
* Using weight normalization in the dense layers.
* Pre-calculating a matrix of the feature positions to be set to -1, whose rows are randomly shuffled through "tf.keras.utils.Sequence" at each epoch.

I tried to make it as simple as possible. I hope you find it as interesting as I did. Thank you for reading!

In [None]:
import os, random
import numpy as np, pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Tensorflow:
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
from tensorflow.keras import layers as L
from tensorflow_addons.activations import mish
from tensorflow_addons.layers import WeightNormalization
from tensorflow.keras.callbacks import ReduceLROnPlateau,EarlyStopping,ModelCheckpoint

In [None]:
# Load the competition data and the submission template:
data = pd.read_csv(
    '../input/tabular-playground-series-jun-2022/data.csv', index_col=0
)
sample_submission = pd.read_csv(
    '../input/tabular-playground-series-jun-2022/sample_submission.csv', index_col=0
)

# Split the columns into the four feature groups by their name prefix:
f1col, f2col, f3col, f4col = (
    [name for name in data.columns if name.startswith(prefix)]
    for prefix in ('F_1', 'F_2', 'F_3', 'F_4')
)

# Gaps in the F1 and F3 columns are filled with the column mean:
mean_imputed_cols = f1col + f3col
data[mean_imputed_cols] = data[mean_imputed_cols].fillna(data[mean_imputed_cols].mean())

# Copy used at inference time, where missing F4 values are marked with -1
# ("data" itself keeps its F4 NaNs):
data_test = data.copy()
data_test[f4col] = data_test[f4col].fillna(-1)

# How many rows have 0, 1, 2, ... missing F4 values:
data[f4col].isna().sum(axis=1).value_counts()

In [None]:
def get_model(hidden_size=256, n_features=None):
    '''Build and compile the feed-forward regressor used for one F4 column.

    The network is a stack of weight-normalized dense layers with mish
    activations (widths 4x, 4x, 2x and 1x "hidden_size", with batch
    normalization after the first one) followed by a single linear unit.
    It is compiled with Adam (learning rate 0.001), MSE loss and an RMSE
    metric.

    Args:
        hidden_size: base width of the dense layers.
        n_features: number of input features. When None (the default) it is
            "len(f4col)-1", i.e. all F4 columns but one.

    Returns:
        The compiled "tf.keras.Model".
    '''
    if n_features is None:
        n_features = len(f4col)-1

    # Input (the shape is passed as a tuple, which is the documented form;
    # a bare int is not accepted by every Keras version):
    inputF4 = L.Input(shape=(n_features,))

    # Network:
    hidden = WeightNormalization(L.Dense(units=hidden_size*4,activation=mish))(inputF4)
    hidden = L.BatchNormalization()(hidden)
    for width in (hidden_size*4, hidden_size*2, hidden_size):
        hidden = WeightNormalization(L.Dense(units=width,activation=mish))(hidden)
    output = L.Dense(units=1, activation='linear')(hidden)

    # Output:
    model = tf.keras.Model(inputF4, output)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )
    return model

class CustomDataset(tf.keras.utils.Sequence):
    '''Batch generator that overwrites randomly chosen features with -1.

    Every batch is a random selection of rows of "X" and "y". For each row of
    the batch, the feature positions listed in one (independently shuffled)
    row of "noise" are set to -1 in the returned copy of the features.
    '''

    def __init__(self, X, y, noise, batch_size=256, random_state=42):
        '''initialize dataset and noise positions

        Args:
            X: feature array, indexed as X[rows] and X_batch[row, column].
            y: target array, indexed with the same row indexes as X.
            noise: 2-D array; each row holds the column positions of X that
                are set to -1 (presumably integer positions - confirm with caller).
            batch_size: number of rows per batch.
            random_state: seed for the shuffling.

        NOTE: this reseeds the *global* NumPy and TensorFlow generators.
        '''
        # Let the Keras base class set up its own state (newer Keras versions
        # expect this call; it is a no-op where the base defines no __init__).
        super().__init__()
        np.random.seed(random_state)
        tf.random.set_seed(random_state)
        self.X, self.y, self.noise = X, y, noise
        self.batch_size = batch_size
        self.on_epoch_end()

    def __len__(self):
        '''number of batches per epoch (a trailing partial batch is dropped)'''
        return self.X.shape[0] // self.batch_size

    def __getitem__(self, index):
        '''generate batch and Randomly add -1 to the training and validation sets
        in the same amount as expected in the testing set'''
        window = slice(index*self.batch_size, (index+1)*self.batch_size)
        batch_indexes = self.indexes[window]
        batch_noises = self.noiseindexes[window]
        # Indexing with an index array copies, so self.X is never modified.
        X_batch, y_batch = self.X[batch_indexes], self.y[batch_indexes]
        # Use the real batch length rather than assuming a full batch.
        rows = np.arange(len(X_batch))
        for i in range(self.noise.shape[1]):
            X_batch[rows,self.noise[batch_noises,i]] = -1
        return X_batch, y_batch

    def on_epoch_end(self):
        '''shuffle the training set as well as the -1 positions'''
        self.indexes = np.arange(len(self.X))
        self.noiseindexes = np.arange(len(self.noise))
        np.random.shuffle(self.indexes)
        np.random.shuffle(self.noiseindexes)

Change the line below to "True" to train the models instead of loading them from the dataset. I trained the models on my personal computer and uploaded them afterwards, since I had almost run out of GPU time.

In [None]:
# When True, every model is trained and checkpointed to 'model_{cnt}_{col}.hdf5';
# when False, the weights are loaded from '../input/tps22juntf/' instead.
train = False

In [None]:
result = data.copy()

# The row-wise count of missing F4 values does not depend on 'cnt' ("data" is
# never modified below), so it is computed once instead of twice per iteration:
f4_nan_count = data[f4col].isna().sum(axis=1)
rownan0 = f4_nan_count==0

for cnt in range(5):

    # Prepare values which will be set to -1 during training
    # (for every data row, 'cnt' distinct feature positions):
    random.seed(cnt)
    np.random.seed(cnt)
    tf.random.set_seed(cnt)
    os.environ['PYTHONHASHSEED'] = str(cnt)
    noise = np.array([np.random.choice(
        len(f4col)-1, size=cnt, replace=False
    ) for _ in range(data.shape[0])])

    # Rows with exactly "cnt+1" nans: once the target column is dropped they
    # contain 'cnt' values equal to -1, matching the noise added in training.
    rownans = f4_nan_count==cnt+1

    for col in f4col:

        # Get testing set from rows with the amount of nan given by the current 'cnt' value:
        colna = data[col].isna()
        test_rows = rownans&colna
        X_test = data_test.loc[test_rows,f4col].drop(columns=col).values

        # Nothing to impute for this column at this NaN count:
        if not X_test.shape[0]:
            continue

        # Get training and validation sets from zero-nan rows:
        X = data.loc[rownan0,f4col]
        y = X.pop(col)
        X_train, X_valid, y_train, y_valid, noise_train, noise_valid = train_test_split(
            X.values, y.values, noise[rownan0], train_size=0.8, random_state=cnt
        )

        # Training:
        model = get_model()
        if train:
            history = model.fit(
                CustomDataset(X_train, y_train, noise_train),
                validation_data = CustomDataset(X_valid, y_valid, noise_valid),
                epochs=300, verbose=0,
                callbacks=[
                    ReduceLROnPlateau(monitor='val_loss',mode='min',
                        verbose=0,factor=0.5,patience=3),
                    EarlyStopping(mode='min',restore_best_weights=True,
                        verbose=0,min_delta=1e-4,patience=10),
                    ModelCheckpoint(f'model_{cnt}_{col}.hdf5',monitor='val_loss',mode='min',
                        verbose=0,save_best_only=True,save_weights_only=True),
                ]
            )
            model.load_weights(f'model_{cnt}_{col}.hdf5')
        else:
            model.load_weights(f'../input/tps22juntf/model_{cnt}_{col}.hdf5')

        # Performance (the validation set gets the same kind of -1 noise):
        for i in range(cnt):
            X_valid[range(len(X_valid)),noise_valid[:,i]] = -1
        y_pred = model.predict(X_valid)
        # RMSE is taken as sqrt(MSE): the 'squared' keyword of
        # mean_squared_error is not available in every scikit-learn version.
        rmse = np.sqrt(mean_squared_error(y_valid,y_pred))
        print(f'FFNN: cnt={cnt+1}, col={col}, RMSE={rmse}')

        # Inference (predict returns one column per output unit; flatten it
        # before writing it into a single DataFrame column):
        result.loc[test_rows,col] = model.predict(X_test).ravel()
        K.clear_session()

In [None]:
# Submission:
# Every index entry of the template is '<row id>-<column name>'.
parsed = [key.split('-') for key in sample_submission.index]
row_ids = np.array([int(row) for row, _ in parsed], dtype=np.int64)
col_names = np.array([col for _, col in parsed])

# Look the values up one column at a time instead of one cell at a time;
# label-based .loc still raises KeyError for an unknown row id or column.
values = np.empty(len(parsed), dtype=float)
for col in np.unique(col_names):
    in_col = col_names == col
    values[in_col] = result.loc[row_ids[in_col], col].to_numpy()
sample_submission['value'] = values

sample_submission.to_csv('submission.csv')
sample_submission.head()

Thank you for reading! Please let me know if you have any questions or suggestions.