<a id="load-libraries"></a>
# Load the libraries

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

import lightgbm as lgbm

import optuna

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

from tqdm import tqdm

import warnings
# Silence every warning category for the rest of the notebook.
warnings.filterwarnings(action='ignore')

# Display settings: never truncate columns, render floats with two decimals.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.2f}'.format

<a id="read-data"></a>
# Read the data

In [None]:
# Load the competition table and the sample submission (same order as before:
# data first, then sample submission).
data_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jun-2022/data.csv',
        '../input/tabular-playground-series-jun-2022/sample_submission.csv',
    )
)
print(data_df.shape)
data_df.head()

<a id="reduce-memory"></a>
# Reduce the memory usage of the data

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Columns are modified in place and the same frame is returned. Only
    columns whose dtype is one of the signed integer / float dtypes listed in
    ``numerics`` are touched; everything else is left as is.

    Note that float columns are narrowed based on their value *range* only,
    so casting to ``float16`` / ``float32`` can lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink (mutated in place).
    verbose : bool, default True
        When true, print the final memory usage and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with downcast columns.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    # Candidate dtypes, narrowest first; the first one whose range covers the
    # column wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's min/max
                # still fits (e.g. -128 and 127 are valid int8 values).
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                bounds = np.finfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit float32 (or min/max are NaN): keep full width.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

# The helper downcasts columns in place and returns the frame it was given;
# `verbose` defaults to True, so the memory report is still printed.
data_df = reduce_memory_usage(df=data_df)
submission_df = reduce_memory_usage(df=submission_df)

<a id="ann-modeling"></a>
# Define the ANN model

In [None]:
def ann_model(train_data, test_data):
    """Fit a small dense network for the current ``column`` and store its predictions.

    Relies on notebook-level globals: ``column`` (name of the feature being
    predicted), ``scaler`` and ``imputer`` (re-fitted here on the training
    features), ``best_scores`` (receives the validation RMSE for ``column``)
    and ``submission`` (extended with the predictions for ``test_data``).

    Parameters
    ----------
    train_data : pandas.DataFrame
        Rows used for fitting; must contain ``'row_id'`` and ``column``.
    test_data : pandas.DataFrame
        Rows to predict; must contain ``'row_id'`` and ``column``.
        The frame is not modified.

    Returns
    -------
    None
        Results are written to the ``best_scores`` and ``submission`` globals.
    """
    global submission

    X = train_data.drop('row_id', axis=1)
    target = X.pop(column)

    # Build the test feature matrix without mutating the caller's frame.
    row_id = test_data['row_id']
    test_features = test_data.drop(columns=[column, 'row_id'])

    # Scale first, then mean-impute whatever is still missing in the
    # remaining feature columns; the test rows reuse the fitted transformers.
    X = imputer.fit_transform(scaler.fit_transform(X))
    test_features = imputer.transform(scaler.transform(test_features))

    # NOTE(review): the split is unseeded, so scores vary between runs.
    X_train, X_valid, y_train, y_valid = train_test_split(X, target)

    lr_start = 0.01
    reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.7, patience = 4, verbose = 1)
    early_stop = EarlyStopping(monitor = 'val_loss', patience = 12, verbose = 1, mode = 'min', restore_best_weights = True)

    model = keras.Sequential([
        layers.Dense(128, kernel_regularizer=keras.regularizers.l2(40e-6), activation='swish', input_shape=(X.shape[1],)),
        layers.BatchNormalization(axis=1),
        layers.Dense(64, kernel_regularizer=keras.regularizers.l2(40e-6), activation='swish'),
        layers.BatchNormalization(axis=1),
        layers.Dense(32, kernel_regularizer=keras.regularizers.l2(40e-6), activation='swish'),
        layers.BatchNormalization(axis=1),
        layers.Dense(1, activation='linear'),
    ])

    model.compile(
        optimizer = keras.optimizers.Adam(learning_rate = lr_start),
        loss = keras.losses.MeanSquaredError(),
        metrics = [keras.metrics.RootMeanSquaredError()],
    )

    model.fit(X_train,
              y_train,
              validation_data = (X_valid, y_valid),
              epochs          = 32,
              verbose         = 2,
              batch_size      = 2048,
              shuffle         = True,
              callbacks       = [reduce_lr, early_stop]
            )

    y_val_pred = model.predict(X_valid, batch_size = 2048, verbose = 2)
    # RMSE, so the score is comparable with the RMSE values the other
    # per-column strategies store in `best_scores`.
    score = mean_squared_error(y_valid, y_val_pred) ** 0.5

    best_scores[column] = [score]

    test_preds = model.predict(test_features)
    column_submission = pd.DataFrame({'row-col': row_id.astype(str) + '-' + column, 'value': test_preds[:, 0]})
    # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same
    # row order and index labels.
    submission = pd.concat([submission, column_submission])

<a id="lgbm-modeling"></a>
# Define the LGBM model

In [None]:
def lgbm_model(train_data, test_data):
    """Tune and fit a LightGBM regressor for the current ``column`` and store its predictions.

    Relies on notebook-level globals: ``column`` (name of the feature being
    predicted), ``best_scores`` (receives the validation RMSE of the final
    model) and ``submission`` (extended with the predictions for
    ``test_data``).

    Parameters
    ----------
    train_data : pandas.DataFrame
        Rows used for fitting; must contain ``'row_id'`` and ``column``.
    test_data : pandas.DataFrame
        Rows to predict; must contain ``'row_id'`` and ``column``.
        The frame is not modified.

    Returns
    -------
    None
        Results are written to the ``best_scores`` and ``submission`` globals.
    """
    global submission

    X = train_data.drop('row_id', axis=1)
    target = X.pop(column)

    # Build the test feature matrix without mutating the caller's frame.
    row_id = test_data['row_id']
    test_features = test_data.drop(columns=[column, 'row_id'])

    # NOTE(review): the split is unseeded, so scores vary between runs.
    X_train, X_valid, y_train, y_valid = train_test_split(X, target)

    lgbm_train = lgbm.Dataset(X_train, label=y_train)
    lgbm_eval = lgbm.Dataset(X_valid, y_valid, reference=lgbm_train)

    def objective(trial, lgbm_train, lgbm_eval):
        """Train one candidate configuration and return its validation RMSE."""
        # Constant settings are also registered through suggest_categorical so
        # that they are part of `study.best_params`, which is passed unchanged
        # to the final training run below.
        params = {
#          "device_type": trial.suggest_categorical("device_type", ['gpu']),
         'boosting_type': trial.suggest_categorical('boosting_type',['gbdt']),
         "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
         "num_leaves": trial.suggest_int("num_leaves", 20, 200, step=10),
         "max_depth": trial.suggest_int("max_depth", 3, 10),
         "lambda_l1": trial.suggest_float("lambda_l1", 0.01, 100, log=True),
         "lambda_l2": trial.suggest_float("lambda_l2", 0.01, 100, log=True),
         "bagging_fraction": trial.suggest_float(
             "bagging_fraction", 0.5, 0.95, step=0.05
         ),
         "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
         "feature_fraction": trial.suggest_float(
             "feature_fraction", 0.5, 0.95, step=0.05
         ),

         'task': trial.suggest_categorical('task', ['prediction',]),
         'objective': trial.suggest_categorical('objective', ['regression',]),
         'metric': trial.suggest_categorical('metric', ['rmse',]),
         'verbosity': trial.suggest_categorical('verbosity', [-1]),
             }

        model = lgbm.train(
            params,
            lgbm_train,
            2000,
            callbacks=[
                lgbm.early_stopping(stopping_rounds=10),
                lgbm.log_evaluation(2000),
            ],
            valid_sets=[lgbm_eval],
        )

        return model.best_score['valid_0']['rmse']

    study = optuna.create_study(direction='minimize', study_name='LGBM')
    study.optimize(lambda trial: objective(trial, lgbm_train, lgbm_eval), n_trials=15)

    best_params = study.best_params

    # Final fit with the best configuration: more rounds and a more patient
    # early-stopping window than during the search.
    best_model = lgbm.train(
        best_params,
        lgbm_train,
        5000,
        callbacks=[
            lgbm.early_stopping(stopping_rounds=100),
            lgbm.log_evaluation(5000),
        ],
        valid_sets=[lgbm_eval],
    )

    best_scores[column] = [best_model.best_score['valid_0']['rmse']]

    test_preds = best_model.predict(test_features)
    column_submission = pd.DataFrame({'row-col': row_id.astype(str) + '-' + column,
                                      'value': test_preds})
    # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same
    # row order and index labels.
    submission = pd.concat([submission, column_submission])

<a id="ensemble-modeling"></a>
# Train Ensembles

In [None]:
%%time

miss_cols = data_df.columns[data_df.isnull().any()]
submission = pd.DataFrame(columns=['row-col', 'value'])
scaler = StandardScaler()
imputer = SimpleImputer(strategy='mean')
best_scores   = {}

for column in tqdm(miss_cols):
    
    ann_cols = ['F_4_5', 'F_4_1', 'F_4_7', 'F_4_0', 'F_4_6']
    lgbm_cols = ['F_4_8', 'F_1_13', 'F_4_12', 'F_3_19', 'F_1_7', 'F_3_21', 'F_4_11', 'F_1_12', 'F_4_13', 'F_4_9', 'F_4_3', 'F_4_2', 'F_4_10', 'F_4_14', 'F_4_4']
#     fillna_cols = ['F_3_22', 'F_3_2', 'F_3_20', 'F_3_9', 'F_3_1', 'F_3_15', 'F_3_24', 'F_3_7', 'F_1_1', 'F_1_4', 'F_1_9', 'F_3_12', 'F_1_0', 'F_3_8', 'F_3_13', 'F_1_5', 'F_1_14', 'F_3_0', 'F_1_8', 'F_1_3', 'F_3_5', 'F_3_18', 'F_3_4', 'F_1_2', 'F_3_16', 'F_3_6', 'F_3_11', 'F_3_3', 'F_3_14', 'F_3_17', 'F_1_10', 'F_1_11', 'F_3_10', 'F_1_6', 'F_3_23']
    
    train_data = data_df.loc[(data_df[column].notnull())]
    test_data = data_df.loc[(data_df[column].isnull())]
        
    if column in ann_cols:
        ann_model(train_data, test_data)
        
    elif column in lgbm_cols:
        lgbm_model(train_data, test_data)
        
    else:
        X_train = train_data[column]
        X_valid = test_data[column]
        y_valid = X_train.sample(n=int(X_valid.shape[0]))
        row_id = test_data.pop('row_id')

        X_train_mean = X_train.mean()
        X_valid.fillna(X_train_mean, inplace=True)
        rmse_score = mean_squared_error(y_valid, X_valid)**0.5
        best_scores[column] = [rmse_score]
        submission = submission.append(pd.DataFrame({'row-col': row_id.astype(str) + '-' + column,
                                   'value': X_valid}))

<a id="ploting"></a>
# Plot the results

In [None]:
# `best_scores` maps column name -> [score], giving a one-row frame (index 0);
# order the columns by that row's values, lowest score first.
scores_df = pd.DataFrame(best_scores).sort_values(by=0, axis=1)
display(scores_df)

# One bar per column, with the column names rotated so they stay legible.
fig, ax = plt.subplots(figsize=(20, 8))
sns.barplot(data=scores_df, ax=ax)
plt.xticks(rotation = 90)
plt.show()

<a id="submission"></a>
# Make a submission

In [None]:
# Write the accumulated predictions without the index column, then show the
# frame as the cell output.
submission.to_csv(path_or_buf='submission.csv', index=False)
submission