In [None]:
import pandas as pd
import numpy as np
import scipy.stats as aa
import seaborn as sns
import missingno as mn
import tensorflow as tf
import tensorflow_addons as tfa

from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

from tensorflow.keras.layers import Dense, Input, concatenate, Embedding, LSTM
from tensorflow.keras.layers import BatchNormalization, Concatenate
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import mse
from tensorflow.keras.activations import relu
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

## Load data

In [None]:
# Read the competition table; the `row_id` column becomes the frame index.
data = pd.read_csv(
    '../input/tabular-playground-series-jun-2022/data.csv',
    index_col='row_id',
)
data

## change data types

In [None]:
np.asarray(data.dtypes)

In [None]:
cont_cols = data.select_dtypes('float64').columns
dis_cols = data.select_dtypes('int64').columns

In [None]:
data[cont_cols] = data[cont_cols].astype('float32')
np.asarray(data[cont_cols].dtypes)

In [None]:
data[dis_cols] = data[dis_cols].astype('int32')
np.asarray(data[dis_cols].dtypes)

## How many missing values in each column

In [None]:
# Nullity matrix of the whole frame; `labels` is given the list of column names.
# NOTE(review): mn.matrix may open its own figure, leaving this one empty — confirm.
plt.figure(figsize=(24, 8), dpi=200)
mn.matrix(data, labels=data.columns.tolist())
plt.show()

In [None]:
nan_indexes = {}

for col in data.columns:
    nan_indexes[col] = data.loc[data[col].isna(), col].index

In [None]:
data_without_nan = data.dropna().index
data_without_nan

In [None]:
only_one_nan = data[data.isna().sum(axis=1)==1].index
only_one_nan

In [None]:
data.columns[data.isna().sum()>0]

In [None]:
data.loc[:, dis_cols]

In [None]:
# Bar chart: number of missing rows per column, in column order.
missing_counts = [len(rows) for rows in nan_indexes.values()]

plt.figure(figsize=(24, 6), dpi=300)
sns.barplot(x=data.columns.tolist(), y=missing_counts, color='#0D0D0D')
plt.xticks(rotation=90)
plt.xlabel('column')
plt.ylabel('missing count')
plt.show()

In [None]:
# One panel per column: strip plot of log(row label + 1) for the rows where
# that column is NaN. The 10 x 8 grid is flattened so panels can be indexed 0..79.
fig, axes = plt.subplots(nrows=10, ncols=8, figsize=(32, 36), dpi=150)
axes = axes.ravel()

for i, (col, missing_id) in enumerate(nan_indexes.items()):
    panel = axes[i]
    sns.stripplot(x=np.log(missing_id + 1), linewidth=1, ax=panel)
    panel.set_xlabel('column position')
    panel.set_ylabel(col)

plt.show()

In [None]:
# Strip plot of the number of NaNs per row; a uniform jitter drawn from
# [0, 0.1) is added so identical counts do not sit on a single point.
plt.figure(figsize=(12, 6), dpi=150)

jitter = np.random.uniform(low=0, high=.1, size=data.shape[0])
row_nan_counts = data.isna().sum(axis=1).to_numpy()

sns.stripplot(x=row_nan_counts + jitter, linewidth=1)
plt.xlabel('missing counts')
plt.ylabel('rows')

plt.show()

In [None]:
plt.figure(figsize=[28,28], dpi=300)
sns.heatmap(data.corr(), cbar=False, cmap='RdGy', fmt='.1f', square=True, annot=True)
plt.show()

In [None]:
plt.figure(figsize=[24,24], dpi=300)
sns.heatmap(data[data.columns[data.isna().sum()>0]].corr(), cbar=False, cmap='RdGy', fmt='.1f', square=True, annot=True)
plt.show()

In [None]:
def simple_model(shape=79):
    """Build and compile a fully connected regression network.

    Four swish-activated Dense layers (256, 128, 64 and 32 units), each
    followed by BatchNormalization, feed a single linear output unit. The
    model is compiled with a default-configured Adam optimizer, MSE loss and
    a RootMeanSquaredError metric.

    Parameters
    ----------
    shape : int or shape tuple, default 79
        Per-sample input shape. A bare integer ``n`` is interpreted as
        ``(n,)``; any other value is forwarded to ``Input`` untouched.

    Returns
    -------
    Sequential
        A newly created, compiled model with freshly initialised weights.
    """
    # `Input` is documented to take a shape tuple; wrap a plain feature count
    # so the call does not rely on integers being accepted.
    input_shape = (shape,) if isinstance(shape, int) else shape

    layers = [Input(shape=input_shape)]
    for units in (256, 128, 64, 32):
        layers.append(Dense(units, activation='swish'))
        layers.append(BatchNormalization())
    layers.append(Dense(1))

    nn = Sequential(layers)

    nn.compile(optimizer=Adam(),
               loss=mse,
               metrics=[RootMeanSquaredError()])

    return nn

In [None]:
def complex_model(shape_cont=54, shape_cat=25, vocab_size=25):
    """Build and compile a two-input regression network.

    The first input goes through Dense(512, swish) + BatchNormalization.
    The second input is fed to two parallel Embedding -> LSTM branches
    (embedding widths 16 and 32, LSTM units 32 and 64). The three branch
    outputs are concatenated and passed through a Dense head (256 -> 128 ->
    64 -> 1) in which later layers also receive the earlier concatenated
    activations. The model is compiled with Adam (learning rate 0.01), MSE
    loss and a RootMeanSquaredError metric.

    Parameters
    ----------
    shape_cont : int or shape tuple, default 54
        Per-sample shape of the first (continuous) input. A bare integer
        ``n`` is interpreted as ``(n,)``.
    shape_cat : int or shape tuple, default 25
        Per-sample shape of the second (discrete) input. A bare integer
        ``n`` is interpreted as ``(n,)``.
    vocab_size : int, default 25
        ``input_dim`` of the Embedding layers. The discrete inputs are used
        as embedding indices, so they are expected to lie in
        ``[0, vocab_size)``.
        NOTE(review): 25 also happens to be the default number of discrete
        columns; confirm it really covers the largest value in the data.

    Returns
    -------
    Model
        A newly created, compiled model taking ``[continuous, discrete]``.
    """
    # `Input` is documented to take shape tuples; wrap plain feature counts.
    cont_shape = (shape_cont,) if isinstance(shape_cont, int) else shape_cont
    cat_shape = (shape_cat,) if isinstance(shape_cat, int) else shape_cat

    input_cont = Input(shape=cont_shape)
    input_dis = Input(shape=cat_shape)

    x = Dense(512, activation='swish')(input_cont)
    x = BatchNormalization()(x)

    # Branch outputs to be concatenated: the dense branch plus two
    # Embedding -> LSTM branches of increasing width.
    cat_hids = [x]

    for i in range(1, 3):
        embedded = Embedding(vocab_size, 16*i)(input_dis)
        cat_hids.append(LSTM(32*i)(embedded))

    merger = Concatenate()(cat_hids)

    x = Dense(256, activation='swish')(merger)
    x = BatchNormalization()(x)
    # Each following Dense layer also sees the earlier concatenated activations.
    merger_1 = Dense(128, activation='swish')(Concatenate()([merger, x]))
    x = BatchNormalization()(merger_1)
    x = Dense(64, activation='swish')(Concatenate()([merger, merger_1, x]))
    x = BatchNormalization()(x)
    x = Dense(1)(x)

    nn = Model([input_cont, input_dis], x)

    nn.compile(optimizer=Adam(learning_rate=0.01),
               loss=mse,
               metrics=[RootMeanSquaredError()])

    return nn

In [None]:
# Draw the architecture diagram of the dense model. The model is passed first:
# a positional argument placed after a keyword argument is a SyntaxError.
tf.keras.utils.plot_model(simple_model(79), to_file='semple-model.png', rankdir='TB')

In [None]:
# Draw the architecture diagram of the two-input model. The model is passed
# first: a positional argument placed after a keyword argument is a SyntaxError.
tf.keras.utils.plot_model(complex_model(54,25), to_file='complex-model.png', rankdir='TB')

In [None]:
# Multiply the learning rate by 0.1 once validation loss has not improved for
# 4 epochs.
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.1,
    patience=4,
    verbose=1,
)

# Stop training once validation loss has not improved for 10 epochs, and
# restore the weights of the best epoch.
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=10,
    verbose=1,
    restore_best_weights=True,
)

## imputation 1 : fill only rows with one missing value

In [None]:
nan_cols = data.drop(data.loc[:, 'F_2_0':'F_2_24'].columns, axis=1).columns
nan_cols

In [None]:
data_without_nas = data.iloc[np.asarray([*only_one_nan, *data_without_nan])]
data_without_nas

In [None]:
one_nan = {}

for col in data.columns:
    one_nan[col] = data_without_nas.loc[data_without_nas[col].isna(), col].index

In [None]:
data_without_nas = data_without_nas.fillna(value=0.0)

In [None]:
data_without_nas.isna().sum()

In [None]:
pd.options.display.max_rows = 80
data_without_nas.describe().T

In [None]:
# For every imputable column: fit a fresh dense model that predicts the column
# from the 79 others, then overwrite the placeholder values with predictions.
for i, target in enumerate(nan_cols):
    print('-'*50)
    print(f'{target} -- {i+1}/{len(nan_cols)} -- train done -- imputation', end='\r')
    print()

    missing_rows = one_nan[target]
    if len(missing_rows) == 0:
        # Nothing to impute in this column; skip the training entirely.
        continue

    features = data_without_nas.drop(target, axis=1)
    # Rows whose target is only the 0.0 placeholder carry no label information
    # and must not be used for training.
    observed = ~data_without_nas.index.isin(missing_rows)

    model_ = simple_model(79)

    model_.fit(x=features.loc[observed].values,
               y=data_without_nas.loc[observed, target].values,
               epochs=50,
               batch_size=4096,
               validation_split=0.2,
               callbacks=[early_stop, reduce_lr])

    # The model ends in Dense(1), so the prediction has one column; flatten it
    # to one value per row before writing it back.
    predictions = model_.predict_on_batch(features.loc[missing_rows].values)
    data_without_nas.loc[missing_rows, target] = np.ravel(predictions)

    print('imputed')

In [None]:
data_without_nas

In [None]:
data_without_nas.to_csv('only-one-na-in-row.csv')

In [None]:
data_without_nas.describe().T

## impute rows that have at most three missing values

In [None]:
more_nan = data.drop(data_without_nas.index)[data.drop(data_without_nas.index).isna().sum(axis=1)<=3].index
more_nan

In [None]:
data_without_nas = data.iloc[np.asarray([*data_without_nas.index,*more_nan])]
data_without_nas

In [None]:
more_nan_id = {}

for col in data.columns:
    more_nan_id[col] = data_without_nas.loc[data_without_nas[col].isna(), col].index

In [None]:
data_without_nas = data_without_nas.fillna(value=0.0)

In [None]:
data_without_nas.isna().sum()

In [None]:
# For every imputable column: fit a fresh dense model that predicts the column
# from the 79 others, then overwrite the placeholder values with predictions.
for i, target in enumerate(nan_cols):
    print('-'*50)
    print(f'{target} -- {i+1}/{len(nan_cols)} -- train done -- imputation', end='\r')
    print()

    missing_rows = more_nan_id[target]
    if len(missing_rows) == 0:
        # Nothing to impute in this column; skip the training entirely.
        continue

    features = data_without_nas.drop(target, axis=1)
    # Rows whose target is only the 0.0 placeholder carry no label information
    # and must not be used for training.
    observed = ~data_without_nas.index.isin(missing_rows)

    model_ = simple_model(79)

    model_.fit(x=features.loc[observed].values,
               y=data_without_nas.loc[observed, target].values,
               epochs=50,
               batch_size=4096,
               validation_split=0.2,
               callbacks=[early_stop, reduce_lr])

    # The model ends in Dense(1), so the prediction has one column; flatten it
    # to one value per row before writing it back.
    predictions = model_.predict_on_batch(features.loc[missing_rows].values)
    data_without_nas.loc[missing_rows, target] = np.ravel(predictions)

    print('imputed')

In [None]:
data_without_nas

In [None]:
data_without_nas.to_csv('less-than-3-na-in-row.csv')

## impute rows that have at most six missing values

In [None]:
more_nan_1 = data.drop(data_without_nas.index)[data.drop(data_without_nas.index).isna().sum(axis=1)<=6].index
more_nan_1

In [None]:
data_without_nas = data.iloc[np.asarray([*data_without_nas.index,*more_nan_1])]
data_without_nas

In [None]:
more_nan_1_id = {}

for col in data.columns:
    more_nan_1_id[col] = data_without_nas.loc[data_without_nas[col].isna(), col].index

In [None]:
data_without_nas = data_without_nas.fillna(value=0.0)

In [None]:
data_without_nas.isna().sum()

In [None]:
# For every imputable column: fit a fresh two-input model on the remaining
# continuous columns plus the discrete columns, then overwrite the placeholder
# values with its predictions.
for i, target in enumerate(nan_cols):
    print('-'*50)
    print(f'{target} -- {i+1}/{len(nan_cols)} -- train done -- imputation', end='\r')
    print()

    missing_rows = more_nan_1_id[target]
    if len(missing_rows) == 0:
        # Nothing to impute in this column; skip the training entirely.
        continue

    cont_features = data_without_nas[cont_cols].drop(target, axis=1)
    dis_features = data_without_nas[dis_cols]
    # Rows whose target is only the 0.0 placeholder carry no label information
    # and must not be used for training.
    observed = ~data_without_nas.index.isin(missing_rows)

    model_ = complex_model(54,25)
    model_.fit(x=[cont_features.loc[observed].values, dis_features.loc[observed].values],
               y=data_without_nas.loc[observed, target].values,
               epochs=50,
               batch_size=4096,
               validation_split=0.2,
               callbacks=[early_stop, reduce_lr])

    predictions = model_.predict_on_batch([cont_features.loc[missing_rows].values,
                                           dis_features.loc[missing_rows].values])
    # The target slots hold the 0.0 placeholder, so assigning the flattened
    # prediction gives the same values as adding it.
    data_without_nas.loc[missing_rows, target] = np.ravel(predictions)

    print('imputed')

In [None]:
data_without_nas

In [None]:
data_without_nas.to_csv('less-than-6-na-in-row.csv')

## impute the rest of the rows with missing values

In [None]:
all_nan = data.drop(data_without_nas.index)[data.drop(data_without_nas.index).isna()].index
all_nan

In [None]:
data_without_nas = data.iloc[np.asarray([*data_without_nas.index,*all_nan])]
data_without_nas

In [None]:
all_nan_id = {}

for col in data.columns:
    all_nan_id[col] = data_without_nas.loc[data_without_nas[col].isna(), col].index

In [None]:
data_without_nas = data_without_nas.fillna(value=0.0)

In [None]:
data_without_nas.isna().sum()

In [None]:
# For every imputable column: fit a fresh two-input model on the remaining
# continuous columns plus the discrete columns, then overwrite the placeholder
# values with its predictions.
for i, target in enumerate(nan_cols):
    print('-'*50)
    print(f'{target} -- {i+1}/{len(nan_cols)} -- train done -- imputation', end='\r')
    print()

    missing_rows = all_nan_id[target]
    if len(missing_rows) == 0:
        # Nothing to impute in this column; skip the training entirely.
        continue

    cont_features = data_without_nas[cont_cols].drop(target, axis=1)
    dis_features = data_without_nas[dis_cols]
    # Rows whose target is only the 0.0 placeholder carry no label information
    # and must not be used for training.
    observed = ~data_without_nas.index.isin(missing_rows)

    model_ = complex_model(54,25)
    model_.fit(x=[cont_features.loc[observed].values, dis_features.loc[observed].values],
               y=data_without_nas.loc[observed, target].values,
               epochs=50,
               batch_size=4096,
               validation_split=0.2,
               callbacks=[early_stop, reduce_lr])

    # `missing_rows` holds row labels and the working set is not in label
    # order, so the prediction inputs must be selected with .loc; .iloc would
    # read the features of unrelated rows.
    predictions = model_.predict_on_batch([cont_features.loc[missing_rows].values,
                                           dis_features.loc[missing_rows].values])
    # The target slots hold the 0.0 placeholder, so assigning the flattened
    # prediction gives the same values as adding it.
    data_without_nas.loc[missing_rows, target] = np.ravel(predictions)

    print('imputed')

In [None]:
data_without_nas

In [None]:
data_without_nas.to_csv('rest-of-all-na-in-row.csv')

## submission file

In [None]:
# Collect one ['<row>-<column>', value] pair for every cell that was NaN in the
# raw data. The values of a column are fetched with a single label lookup
# instead of one scalar .loc call per cell.
imputed_values = []

for key, rows in nan_indexes.items():
    column_values = data_without_nas.loc[rows, key].to_numpy()
    imputed_values.extend(
        [f'{index}-{key}', value] for index, value in zip(rows, column_values)
    )

In [None]:
imputed_values = pd.DataFrame(imputed_values, columns=['row-col', 'value']).sort_values(by='row-col')
imputed_values

In [None]:
imputed_values.isna().sum()

In [None]:
imputed_values.to_csv('submission_14.csv', index=False)