# Importing Libraries and Loading datasets

In [None]:
import os
import random
import numpy as np
import pandas as pd

import tensorflow as tf
tf.config.threading.set_intra_op_parallelism_threads(6)
tf.config.threading.set_inter_op_parallelism_threads(2)

from tensorflow import keras
from tensorflow.keras import layers, callbacks

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold

In [None]:
# Read the competition table and the sample submission from the Kaggle input
# directory. `data` is the table whose missing cells are filled in below;
# `submission` is the frame whose "value" column receives the predictions.
data = pd.read_csv("../input/tabular-playground-series-jun-2022/data.csv")
submission = pd.read_csv("../input/tabular-playground-series-jun-2022/sample_submission.csv")

# Explore Data

In [None]:
# Preview the first rows of the table.
data.head()

In [None]:
# Per-column summary statistics (count, mean, std, quantiles, ...).
data.describe()

In [None]:
# List every column name of the table.
print(f"Columns: \n{list(data.columns)}")

# Basic Data Check

In [None]:
# (rows, columns) of the table.
print(f'Data shape: {data.shape}')

## Missing values

In [None]:
# Count the NaN cells in each column. The result must stay a per-column
# Series: reducing it further (e.g. with `.any().sum()`) yields a single
# scalar, which cannot be filtered by the boolean mask used below.
missing_values = data.isna().sum()
# Report only the columns that actually contain missing cells.
print('Missing values in data: {0}'.format(missing_values[missing_values > 0]))

## Duplicates

In [None]:
# Number of rows that are exact repeats of an earlier row.
duplicates = data.duplicated().sum()
print(f'Duplicates in data: {duplicates}')

# Modelling

In [None]:
# Training hyper-parameters shared by load_model() and run_model().
N_SPLITS = 3            # number of KFold splits (one network is trained per split)
EPOCHS = 30             # maximum number of epochs per network
BATCH_SIZE = 2048       # mini-batch size passed to model.fit
ACTIVATION = 'swish'    # activation used by every hidden Dense layer

my_seed = 1


def seedAll(seed):
    """Seed every random number generator this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds Python's
    ``random`` module, NumPy's global generator and TensorFlow's global
    generator with the same value.

    NOTE(review): ``PYTHONHASHSEED`` is normally read at interpreter start-up,
    so setting it here presumably only affects processes launched afterwards.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)


seedAll(my_seed)

def load_model(X):
    """Build a fresh, compiled regression network and its training callbacks.

    Args:
        X: Feature table; only ``X.shape[1]`` (the number of feature columns)
            is used, to size the input of the first layer.

    Returns:
        A ``(model, callbacks)`` pair. ``model`` is a compiled
        ``keras.Sequential`` with Dense layers of 256, 128, 64 and 32 units
        (activation ``ACTIVATION``) followed by a single linear output unit.
        ``callbacks`` is ``[early_stopping, reduce_lr]``.
    """
    # Stop when the validation loss has not improved for 20 epochs and roll
    # back to the best weights seen.
    stopper = callbacks.EarlyStopping(
        monitor="val_loss",
        patience=20,
        restore_best_weights=True,
    )

    # Halve the learning rate after 5 epochs without validation improvement.
    lr_schedule = callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
    )

    # Layers are created in the same order as they are stacked.
    n_features = X.shape[1]
    first_layer = layers.Dense(256, activation=ACTIVATION, input_shape=[n_features])
    hidden_layers = [
        layers.Dense(units, activation=ACTIVATION) for units in (128, 64, 32)
    ]
    output_layer = layers.Dense(1, activation='linear')
    network = keras.Sequential([first_layer, *hidden_layers, output_layer])

    # Plain SGD on mean squared error; RMSE is tracked for reporting.
    rmse = tf.keras.metrics.RootMeanSquaredError()
    network.compile(optimizer='sgd', loss='mse', metrics=[rmse])

    return network, [stopper, lr_schedule]

def run_model(X, X_test, y):
    """Train one network per KFold split and average their predictions.

    Args:
        X: Training features; indexed positionally with ``.iloc``.
        X_test: Features of the rows to predict.
        y: Training target, positionally aligned with ``X``.

    Returns:
        The element-wise mean over all splits of ``model.predict(X_test)``.
    """
    splitter = KFold(n_splits=N_SPLITS, random_state=my_seed, shuffle=True)
    fold_predictions = []

    for fit_rows, holdout_rows in splitter.split(X, y):
        # A new, untrained network (and new callbacks) for every split.
        network, fit_callbacks = load_model(X)
        network.fit(
            X.iloc[fit_rows], y.iloc[fit_rows],
            # The held-out split drives early stopping and the LR schedule.
            validation_data=(X.iloc[holdout_rows], y.iloc[holdout_rows]),
            batch_size=BATCH_SIZE,
            epochs=EPOCHS,
            callbacks=fit_callbacks,
            verbose=0,  # silence the per-epoch training log
        )
        fold_predictions.append(network.predict(X_test))

    return np.mean(fold_predictions, axis=0)

def arrange(y_col):
    """Split the global ``data`` table into train/predict sets for one column.

    Rows where ``y_col`` is observed form the training set; rows where it is
    NaN form the set to predict. Missing values in the *feature* columns are
    replaced by the column mean.

    The training target is taken from the raw table rather than from the
    imputed one. Taking it from the imputed table would add every row with a
    missing target to the training set, labelled with the column mean, which
    are exactly the rows the model is later asked to predict.

    Args:
        y_col: Name of the column of ``data`` to use as the target.

    Returns:
        A tuple ``(X, X_test, y)``:
        ``X`` holds the mean-imputed features of the rows with an observed
        target (index reset to 0..n-1). ``X_test`` holds the mean-imputed
        features of the rows with a missing target (original index kept).
        ``y`` holds the observed target values, positionally aligned with ``X``.
    """
    target_missing = data[y_col].isna()

    # The imputer is fitted on the full table so the output keeps the same
    # column layout; the imputed target column is discarded just below.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    df = pd.DataFrame(
        imputer.fit_transform(data),
        columns=data.columns,
        index=data.index,
    )

    # "row_id" is dropped when present (errors='ignore' tolerates its absence).
    features = df.drop([y_col, "row_id"], axis=1, errors='ignore')

    X_test = features.loc[target_missing]

    # Train only on rows whose target was really observed.
    X = features.loc[~target_missing].reset_index(drop=True)
    y = data.loc[~target_missing, y_col].reset_index(drop=True)
    return X, X_test, y

In [None]:
# Credits to https://www.kaggle.com/code/cabaxiom/tps-jun-22-lightgbm-just-regression/notebook
# Fill the "value" column of `submission` one target column at a time:
#   * group "1" and "3" columns get a constant prediction, the mean of the
#     training target;
#   * group "4" columns get a neural-network regression on the other "F_4"
#     columns.
for y_col in data.columns:
    # The third character of the column name is the group digit
    # ("F_4_0" -> "4"). Slicing, unlike indexing, cannot raise on names
    # shorter than three characters.
    group = y_col[2:3]
    if group not in ("1", "3", "4"):
        continue

    X, X_test, y = arrange(y_col)
    if len(X_test) == 0:
        # No missing cell in this column: nothing to predict, and calling
        # predict() on an empty frame would fail.
        continue

    if group == "4":
        x_col = [i for i in data.columns if "F_4" in i and i != y_col]
        preds = run_model(X[x_col], X_test[x_col], y)
    else:
        preds = np.full(len(X_test), y.mean())

    # run_model returns an (n, 1) array; flatten it so one value is written
    # per selected submission row.
    target_rows = submission["row-col"].str.endswith(y_col)
    submission.loc[target_rows, "value"] = np.ravel(preds)

# Submission

In [None]:
# Write the filled-in submission to disk without the index column, then
# display the frame as the cell output.
submission.to_csv('submission.csv', index=False)
submission