# Introduction
MLP starter with a bpp(s) feature and a custom loss. Formatting the data for a tabular-style modeling task is time-consuming, so it has already been done in my previous kernel (https://www.kaggle.com/code1110/openvaccine-is-bpp-s-the-most-important).

# Libraries

In [None]:
import gc
import os
import random

import lightgbm as lgb
import numpy as np
import pandas as pd
import seaborn as sns
import itertools
from pathlib import Path
from tqdm import tqdm

from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from typing import List, NoReturn, Union, Tuple, Optional, Text, Generic, Callable, Dict
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, QuantileTransformer
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, mean_squared_error, mean_absolute_error, f1_score

# model
import lightgbm as lgb

# keras
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import callbacks
from tensorflow.keras import optimizers
from tensorflow.keras import models
from tensorflow.keras import losses
from tensorflow.keras import utils
from tensorflow.keras import layers
import tensorflow.keras.backend as kb

# visualize
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
sns.set_context("talk")
style.use('seaborn-colorblind')

import warnings
warnings.filterwarnings('ignore')

# CONFIG

In [None]:
# Global run configuration, consumed further down in the notebook.
SEED = 42  # passed to seed_everything() and to RunModel(seed=...)
NFOLD = 7  # number of CV folds, passed to RunModel(n_splits=...)
SCALER = 'MinMax' # Standard  (RunModel.fit recognises 'MinMax' and 'Standard')

# Load premade data

In [None]:
# Read the train/test feature tables (feather files) from the input directory
# of the kernel referenced in the introduction.
train_data = pd.read_feather('../input/openvaccine-is-bpp-s-the-most-important/train_fe.feather')
test_data = pd.read_feather('../input/openvaccine-is-bpp-s-the-most-important/test_fe.feather')

In [None]:
# Sanity check of the training table: print its dimensions, then show the first rows.
print(train_data.shape)
train_data.head()

In [None]:
# Same sanity check for the test table.
print(test_data.shape)
test_data.head()

In [None]:
# Load the sample submission; its target columns are overwritten with the
# model predictions at the end of the notebook.
submission = pd.read_csv('/kaggle/input/stanford-covid-vaccine/sample_submission.csv')
print(submission.shape)
submission.head()

# Modeling pipeline

In [None]:
# Target columns: the five columns predicted jointly. Their count fixes the
# default `num_scored` of mcrmse/MCRMSE and the width of the NN output layer.
target_cols = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']

In [None]:
def rmse(y_actual, y_pred):
    """Root mean squared error between `y_actual` and `y_pred` (NumPy / sklearn version)."""
    mse = mean_squared_error(y_actual, y_pred)
    return np.sqrt(mse)

def mcrmse(y_actual, y_pred, num_scored=len(target_cols)):
    """
    Mean column-wise RMSE over the first `num_scored` columns.

    Both inputs are indexed as `[:, col]`, so they must be 2-D arrays with at
    least `num_scored` columns. Each column's RMSE contributes with weight
    1 / num_scored.
    """
    total = 0
    for col in range(num_scored):
        col_rmse = rmse(y_actual[:, col], y_pred[:, col])
        total = total + col_rmse / num_scored

    return total

# def MCRMSE(y_true, y_pred):
#     print(y_true.shape)
#     print(y_pred.shape)
#     colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
#     return tf.reduce_mean(tf.sqrt(colwise_mse), axis=1)

def RMSE(y_actual, y_pred, eps=1e-6):
    """
    Tensor version of RMSE, used as a building block of the Keras loss.

    `eps` is added to the MSE before the square root is taken.
    """
    squared_error = tf.keras.losses.mean_squared_error(y_actual, y_pred)
    shifted = squared_error + eps
    return kb.sqrt(shifted)

def MCRMSE(y_actual, y_pred, num_scored=len(target_cols)):
    """
    Keras-compatible loss: mean of the per-column RMSE values.

    Columns 0..num_scored-1 of both tensors are sliced out one at a time,
    scored with RMSE, and accumulated with weight 1 / num_scored.
    """
    total = 0
    for col in range(num_scored):
        col_loss = RMSE(y_actual[:, col], y_pred[:, col])
        total = total + col_loss / num_scored

    return total

In [None]:
import random
from collections import Counter, defaultdict
from sklearn import model_selection

# ---- GroupKFold ----
class GroupKFold(object):
    """
    GroupKFold with random shuffle with a sklearn-like structure.

    The unique values of a group column are split with a plain KFold, and every
    row is assigned to the side (train / validation) its group value fell on,
    so a group never appears on both sides of a fold.

    NOTE: this class shadows `sklearn.model_selection.GroupKFold` imported at
    the top of the file, and its `split` signature differs: `group` is the
    *name* of a column of `X`, not an array of group labels.
    """

    def __init__(self, n_splits=4, shuffle=True, random_state=42):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, group=None):
        """Return the number of folds (arguments are accepted for API symmetry only)."""
        return self.n_splits

    def split(self, X, y, group):
        """
        Yield `(train_idx, val_idx)` positional index arrays, one pair per fold.

        :X: data frame containing the column named by `group`
        :y: unused, kept for a sklearn-like signature
        :group: name of the grouping column in `X`
        """
        # KFold rejects a non-None random_state when shuffle is False, so only
        # forward the seed when it is actually used.
        kf = model_selection.KFold(
            n_splits=self.n_splits,
            shuffle=self.shuffle,
            random_state=self.random_state if self.shuffle else None,
        )
        group_values = X[group]
        unique_ids = group_values.unique()
        for tr_group_idx, va_group_idx in kf.split(unique_ids):
            # split at group level, then map back to row positions
            tr_group, va_group = unique_ids[tr_group_idx], unique_ids[va_group_idx]
            train_idx = np.where(group_values.isin(tr_group))[0]
            val_idx = np.where(group_values.isin(va_group))[0]
            yield train_idx, val_idx

# ---- StratifiedGroupKFold ----
class StratifiedGroupKFold(object):
    """
    StratifiedGroupKFold with random shuffle with a sklearn-like structure.

    Whole groups are assigned greedily to folds so that the per-label share of
    samples is as even as possible across folds. `group` is the *name* of a
    column of `X`; `y` must hold non-negative integer labels (they are used as
    array indices).
    """

    def __init__(self, n_splits=4, shuffle=True, random_state=42):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, group=None):
        """Return the number of folds (arguments are accepted for API symmetry only)."""
        return self.n_splits

    def split(self, X, y, group):
        """
        Yield `(train_idx, test_idx)` lists of row positions, one pair per fold.

        :X: data frame containing the column named by `group`
        :y: iterable of non-negative integer labels, aligned with the rows of `X`
        :group: name of the grouping column in `X`
        """
        labels_num = np.max(y) + 1
        y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
        y_distr = Counter()
        groups = X[group].values
        for label, g in zip(y, groups):
            y_counts_per_group[g][label] += 1
            y_distr[label] += 1

        # Score only labels that actually occur. A label id missing from `y`
        # has a zero total, and dividing by it yields NaN, which makes every
        # "<" comparison below False and sends all groups to fold 0.
        present_labels = sorted(y_distr)

        y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
        groups_per_fold = defaultdict(set)

        def eval_y_counts_per_fold(y_counts, fold):
            # Tentatively add the group to `fold`, measure how uneven the label
            # shares become across folds, then undo the tentative assignment.
            y_counts_per_fold[fold] += y_counts
            std_per_label = [
                np.std([y_counts_per_fold[k][label] / y_distr[label] for k in range(self.n_splits)])
                for label in present_labels
            ]
            y_counts_per_fold[fold] -= y_counts
            return np.mean(std_per_label)

        groups_and_y_counts = list(y_counts_per_group.items())
        if self.shuffle:
            random.Random(self.random_state).shuffle(groups_and_y_counts)

        # Place the most label-imbalanced groups first; the sort is stable, so
        # the (optionally shuffled) order breaks ties.
        for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
            best_fold = None
            min_eval = None
            for fold in range(self.n_splits):
                fold_eval = eval_y_counts_per_fold(y_counts, fold)
                if min_eval is None or fold_eval < min_eval:
                    min_eval = fold_eval
                    best_fold = fold
            y_counts_per_fold[best_fold] += y_counts
            groups_per_fold[best_fold].add(g)

        all_groups = set(groups)
        for fold in range(self.n_splits):
            test_groups = groups_per_fold[fold]
            train_groups = all_groups - test_groups

            train_idx = [pos for pos, g in enumerate(groups) if g in train_groups]
            test_idx = [pos for pos, g in enumerate(groups) if g in test_groups]

            yield train_idx, test_idx

In [None]:
def get_oof_ypred(model, x_val, x_test, modelname="nn", task="regression"):
    """
    get oof and target predictions

    :model: fitted model exposing `predict` (and `predict_proba` for the
        sklearn-style classifiers)
    :x_val: validation inputs of the current fold
    :x_test: test inputs
    :modelname: 'lgb', 'xgb', 'catb', 'linear', 'knn' or 'nn'
    :task: 'binary', 'multiclass', 'regression' or 'custom'
    :return: tuple (oof_pred, y_pred) of predictions on x_val and x_test
    :raises ValueError: if `task` is not one of the supported values
    """
    if task not in ("binary", "multiclass", "regression", "custom"):
        # Previously an unknown task fell through every branch and failed at
        # the return statement with an UnboundLocalError.
        raise ValueError(f"unknown task: {task!r}")

    # models whose class probabilities come from predict_proba
    sklearns = ["xgb", "catb", "linear", "knn"]
    if task == "multiclass":
        sklearns.append("lgb")

    if task in ("binary", "multiclass") and modelname in sklearns:
        oof_pred = model.predict_proba(x_val)
        y_pred = model.predict_proba(x_test)
        if task == "binary":
            # keep only the positive-class probability
            oof_pred = oof_pred[:, 1]
            y_pred = y_pred[:, 1]
        return oof_pred, y_pred

    oof_pred = model.predict(x_val)
    y_pred = model.predict(x_test)

    # NN specific: flatten the single-output predictions to 1-D
    if modelname == "nn" and task in ("binary", "regression"):
        oof_pred = oof_pred.ravel()
        y_pred = y_pred.ravel()

    return oof_pred, y_pred

In [None]:
import math
import random
from typing import List, NoReturn, Union, Tuple, Optional, Text, Generic, Callable, Dict

def seed_everything(seed : int) -> None:
    """
    Seed Python's `random`, NumPy and TensorFlow, and export PYTHONHASHSEED.

    :seed: integer seed applied to every generator
    """
    random.seed(seed)
    np.random.seed(seed)
    # NOTE: the hash seed of the running interpreter is fixed at start-up, so
    # this only reaches processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    
# Seed all random number generators once, before any model is built.
seed_everything(SEED)

def nn_model(cls, train_set, val_set):
    """
    NN hyperparameters and models: build, compile and fit a Keras MLP for one fold.

    :cls: despite the name, the RunModel *instance* (called as
        `nn_model(self, ...)` from RunModel.train_model). Reads `seed`,
        `params`, `categoricals`, `features`, `train_df`, `target` and `task`;
        fills in `params` with defaults when it is empty.
    :train_set: dict with keys 'X' and 'y' used for fitting
    :val_set: dict with keys 'X' and 'y' used as validation data
    :return: tuple (fitted model, zero vector of length len(cls.features))
    :raises ValueError: for an unsupported task or optimizer type
    """

    # set seed for tf
    seed_everything(cls.seed)

    # adapted from https://github.com/ghmagazine/kagglebook/blob/master/ch06/ch06-03-hopt_nn.py
    if not cls.params:
        params = {
            'input_dropout': 0.0,
            'hidden_layers': 3,
            'hidden_units': 256,
            'embedding_out_dim': 4,
            'hidden_activation': 'relu', 
            'hidden_dropout': 0.04,
            'gauss_noise': 0.01,
            'norm_type': 'batch', # layer
            'optimizer': {'type': 'adam', 'lr': 1e-3},
            'batch_size': 256,
            'epochs': 80
        }
        cls.params = params

    # NN model architecture
    inputs = []
    n_neuron = cls.params['hidden_units']

    # embedding for categorical features 
    if len(cls.categoricals) > 0:
        embeddings = []
        embedding_out_dim = cls.params['embedding_out_dim']
        for i in cls.categoricals:
            # one scalar input per categorical; vocabulary size is max(|value|) + 1
            input_ = layers.Input(shape=(1,))
            embedding = layers.Embedding(int(np.absolute(cls.train_df[i]).max() + 1), embedding_out_dim, input_length=1)(input_)
            embedding = layers.Reshape(target_shape=(embedding_out_dim,))(embedding)
            inputs.append(input_)
            embeddings.append(embedding)
        # all numeric features enter through one dense layer, appended last
        input_numeric = layers.Input(shape=(len(cls.features) - len(cls.categoricals),))
        embedding_numeric = layers.Dense(n_neuron, activation=cls.params['hidden_activation'])(input_numeric)
        inputs.append(input_numeric)
        embeddings.append(embedding_numeric)
        x = layers.Concatenate()(embeddings)

    else: # no categorical features
        inputs = layers.Input(shape=(len(cls.features), ))
        x = layers.Dense(n_neuron, activation=cls.params['hidden_activation'])(inputs)
        x = layers.Dropout(cls.params['hidden_dropout'])(x)
        x = layers.GaussianNoise(cls.params['gauss_noise'])(x)
        if cls.params['norm_type'] == 'batch':
            x = layers.BatchNormalization()(x)
        elif cls.params['norm_type'] == 'layer':
            x = layers.LayerNormalization()(x)
        
    # more layers: widths n/2, n/4, n/6, ... for the remaining hidden layers
    for i in np.arange(cls.params['hidden_layers'] - 1):
        x = layers.Dense(n_neuron // (2 * (i+1)), activation=cls.params['hidden_activation'])(x)
        x = layers.Dropout(cls.params['hidden_dropout'])(x)
        x = layers.GaussianNoise(cls.params['gauss_noise'])(x)
        if cls.params['norm_type'] == 'batch':
            x = layers.BatchNormalization()(x)
        elif cls.params['norm_type'] == 'layer':
            x = layers.LayerNormalization()(x)
    
    # output
    if cls.task == "regression":
        out = layers.Dense(1, activation="linear", name = "out")(x)
        loss = "mse"
    elif cls.task == "binary":
        out = layers.Dense(1, activation='sigmoid', name = 'out')(x)
        loss = "binary_crossentropy"
    elif cls.task == "multiclass":
        out = layers.Dense(len(np.unique(cls.train_df[cls.target].values)), activation='softmax', name = 'out')(x)
        loss = "categorical_crossentropy"
    elif cls.task == "custom":
        out = layers.Dense(len(target_cols), activation='linear', name = 'out')(x)
        loss = MCRMSE
    else:
        raise ValueError(f"unsupported task for the NN model: {cls.task!r}")
            
    model = models.Model(inputs=inputs, outputs=out)

    # compile (`learning_rate` replaces the deprecated `lr` keyword)
    opt_cfg = cls.params['optimizer']
    if opt_cfg['type'] == 'adam':
        optimizer = optimizers.Adam(learning_rate=opt_cfg['lr'], beta_1=0.9, beta_2=0.999, decay=opt_cfg['lr']/100)
    elif opt_cfg['type'] == 'sgd':
        optimizer = optimizers.SGD(learning_rate=opt_cfg['lr'], decay=1e-6, momentum=0.9)
    else:
        raise ValueError(f"unsupported optimizer type: {opt_cfg['type']!r}")
    model.compile(loss=loss, optimizer=optimizer)

    # callbacks (`min_delta` replaces the deprecated `epsilon` keyword of ReduceLROnPlateau)
    early_stop = callbacks.EarlyStopping(patience=8, min_delta=opt_cfg['lr'], restore_best_weights=True, monitor='val_loss')
    lr_schedule = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=8, verbose=1, min_delta=opt_cfg['lr'], mode='min')
    model.fit(train_set['X'], train_set['y'], callbacks=[early_stop, lr_schedule],
              epochs=cls.params['epochs'], batch_size=cls.params['batch_size'],
              validation_data=(val_set['X'], val_set['y']))
        
    fi = np.zeros(len(cls.features)) # no feature importance computed

    return model, fi

In [None]:
class RunModel(object):
    """
    Model Fitting and Prediction Class:
    :INPUTS:
    :train_df: train pandas dataframe (modified in place by NaN filling, scaling and target encoding)
    :test_df: test pandas dataframe (modified in place in the same way)
    :target: target column name (str), or a list of column names for the 'custom' task
    :features: list of feature names
    :categoricals: list of categorical feature names. Note that categoricals need to be in 'features'
    :model: 'lgb', 'xgb', 'catb', 'linear', or 'nn'
    :params: dictionary of hyperparameters. If empty dict {} is given, default hyperparams are used
    :task: 'regression', 'multiclass', 'binary', or 'custom' (multi-target regression scored with mcrmse)
    :n_splits: K in KFold (default is 4)
    :cv_method: 'KFold', 'StratifiedKFold', 'TimeSeriesSplit', 'GroupKFold', 'StratifiedGroupKFold'
    :group: group feature name when GroupKFold or StratifiedGroupKFold are used
    :target_encoding: True or False
    :seed: seed (int)
    :scaler: None, 'MinMax', 'Standard'
    :verbose: bool
    :ATTRIBUTES (set by the constructor, which runs `fit()` immediately):
    :y_pred: test predictions averaged over folds
    :score: out-of-fold metric value
    :model: the fitted model of the last fold (replaces the model name string)
    :oof: out-of-fold predictions
    :y_val: targets aligned with `oof`
    :fi_df: per-fold feature importance data frame
    :EXAMPLE:
    # fit LGB regression model
    model = RunModel(train_df, test_df, target, features, categoricals=categoricals,
            model="lgb", params={}, task="regression", n_splits=4, cv_method="KFold", 
            group=None, target_encoding=False, seed=1220, scaler=None)
    
    # save predictions on train, test data
    np.save("y_pred", model.y_pred)
    np.save("oof", model.oof)
    """

    def __init__(self, train_df : pd.DataFrame, test_df : pd.DataFrame, target : Union[str, List], features : List, categoricals: List=[],
                model : str="lgb", params : Dict={}, task : str="regression", n_splits : int=4, cv_method : str="KFold", 
                group : Optional[str]=None, target_encoding=False, seed : int=1220, scaler : Optional[str]=None, verbose=True):

        # display info
        print("##############################")
        print(f"Starting training model {model} for a {task} task:")
        print(f"- train records: {len(train_df)}, test records: {len(test_df)}")
        print(f"- target column is {target}")
        print(f"- {len(features)} features with {len(categoricals)} categorical features")
        if target_encoding:
            print(f"- target encoding: Applied")
        else:
            print(f"- target encoding: NOT Applied")
        print(f"- CV strategy : {cv_method} with {n_splits} splits")
        if group is None:
            print(f"- no group parameter is used for validation")
        else:
            print(f"- {group} as group parameter")
        if scaler is None:
            print("- No scaler is used")
        else:
            print(f"- {scaler} scaler is used")
        print("##############################")

        # class initializing setups
        self.train_df = train_df
        self.test_df = test_df
        self.target = target
        # Private copies: fit() removes the group column from these lists, which
        # must not leak into the caller's lists or the shared `[]` default.
        self.features = list(features)
        self.categoricals = list(categoricals)
        self.model = model
        self.params = params
        self.task = task
        self.n_splits = n_splits
        self.cv_method = cv_method
        self.group = group
        self.target_encoding = target_encoding
        self.seed = seed
        self.scaler = scaler
        self.verbose = verbose
        # NOTE: from here on self.model is the fitted model, no longer the model name
        self.y_pred, self.score, self.model, self.oof, self.y_val, self.fi_df = self.fit()

    def train_model(self, train_set, val_set):
        """
        employ a model

        Dispatches on the model name to the module-level `<name>_model`
        function and returns (fitted model, feature importance).
        """
        # compile model
        if self.model == "lgb": # LGB             
            model, fi = lgb_model(self, train_set, val_set)

        elif self.model == "xgb": # xgb
            model, fi = xgb_model(self, train_set, val_set)

        elif self.model == "catb": # catboost
            model, fi = catb_model(self, train_set, val_set)

        elif self.model == "linear": # linear model
            model, fi = lin_model(self, train_set, val_set)

        elif self.model == "nn": # neural network
            model, fi = nn_model(self, train_set, val_set)

        else:
            raise ValueError(f"unknown model: {self.model!r}")
        
        return model, fi # fitted model and feature importance

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """
        dataset converter

        Returns (train_set, val_set): lgb.Dataset objects for non-multiclass
        LightGBM, otherwise dicts with keys 'X' and 'y' (with one-hot encoded
        'y' for a multiclass NN).
        """
        if (self.model == "lgb") & (self.task != "multiclass"):
            train_set = lgb.Dataset(x_train, y_train, categorical_feature=self.categoricals)
            val_set = lgb.Dataset(x_val, y_val, categorical_feature=self.categoricals)
            
        elif (self.model == "nn") & (self.task == "multiclass"):
            ohe = OneHotEncoder(sparse=False, categories='auto')
            train_set = {'X': x_train, 'y': ohe.fit_transform(y_train.values.reshape(-1, 1))}
            val_set = {'X': x_val, 'y': ohe.transform(y_val.values.reshape(-1, 1))}
            
        else:
            train_set = {'X': x_train, 'y': y_train}
            val_set = {'X': x_val, 'y': y_val}
            
        return train_set, val_set

    def calc_metric(self, y_true, y_pred): 
        """
        calculate evaluation metric for each task
        this may need to be changed based on the metric of interest
        """
        if self.task == "multiclass":
            return f1_score(y_true, y_pred, average="macro")
        
        elif self.task == "binary":
            return roc_auc_score(y_true, y_pred) # log_loss
        
        elif self.task == "regression":
            return np.sqrt(mean_squared_error(y_true, y_pred))
        
        elif self.task == "custom":     
            return mcrmse(y_true, y_pred)

        raise ValueError(f"unknown task: {self.task!r}")

    def get_cv(self):
        """
        employ CV strategy

        Returns an iterable of (train_idx, val_idx) pairs over self.train_df.
        """

        # return cv.split
        if self.cv_method == "KFold":
            cv = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.seed)
            return cv.split(self.train_df)
        
        elif self.cv_method == "StratifiedKFold":
            cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.seed)
            return cv.split(self.train_df, self.train_df[self.target])
        
        elif self.cv_method == "TimeSeriesSplit":
            cv = TimeSeriesSplit(max_train_size=None, n_splits=self.n_splits)
            return cv.split(self.train_df)
        
        elif self.cv_method == "GroupKFold":
            # the group argument is the column NAME (custom GroupKFold of this file)
            cv = GroupKFold(n_splits=self.n_splits, shuffle=True, random_state=self.seed)
            return cv.split(self.train_df, self.train_df[self.target], self.group)
        
        elif self.cv_method == "StratifiedGroupKFold":
            cv = StratifiedGroupKFold(n_splits=self.n_splits, shuffle=True, random_state=self.seed)
            return cv.split(self.train_df, self.train_df[self.target], self.group)

        raise ValueError(f"unknown cv_method: {self.cv_method!r}")

    def fit(self):
        """
        perform model fitting        

        Steps: optional target encoding, NaN filling (non-tree models only),
        optional scaling, then K-fold training with out-of-fold predictions and
        fold-averaged test predictions.
        Returns (y_pred, loss_score, last fitted model, oof_pred, y_vals, fi_df).
        """
        # fail early on an unknown scaler, before any data is touched
        if self.scaler is not None and self.scaler not in ("MinMax", "Standard"):
            raise ValueError(f"unknown scaler: {self.scaler!r}")

        # initialize
        y_vals = np.zeros((self.train_df.shape[0], ))
        if self.task  == "multiclass":
            # number of distinct classes (len(self.target) would be the length of the column name)
            n_class = len(np.unique(self.train_df[self.target].values))
            oof_pred = np.zeros((self.train_df.shape[0], n_class))
            y_pred = np.zeros((self.test_df.shape[0], n_class))
            
        elif self.task == 'custom':
            n_class = len(target_cols)
            y_vals = np.zeros((self.train_df.shape[0], n_class))
            oof_pred = np.zeros((self.train_df.shape[0], n_class))
            y_pred = np.zeros((self.test_df.shape[0], n_class))           
                
        else:
            oof_pred = np.zeros((self.train_df.shape[0], ))
            y_pred = np.zeros((self.test_df.shape[0], ))

        # the group column is only used for splitting, never as a model input
        if self.group is not None:
            if self.group in self.features:
                self.features.remove(self.group)
            if self.group in self.categoricals:
                self.categoricals.remove(self.group)
        fi = np.zeros((self.n_splits, len(self.features)))

        # target encoding
        numerical_features = [f for f in self.features if f not in self.categoricals]
        if self.target_encoding:            
            # perform target encoding
            k = 0
            f = 1
            overall_mean = self.train_df[self.target].mean()
            for c in self.categoricals:
                data_tmp = pd.DataFrame({c: self.train_df[c].values, 'target': self.train_df[self.target].values})
                tmp = np.nan * np.ones(self.train_df.shape[0])
                
                cv = self.get_cv()
                for train_idx, val_idx in cv:
                    # target mean
                    target_mean = data_tmp.iloc[train_idx].groupby(c)['target'].mean().reset_index() 
                    
                    # smoothing
                    target_count = data_tmp.iloc[train_idx].groupby(c)['target'].count().reset_index() 
                    target_count['target'] = target_count['target'].apply(lambda x : 1 / (1 + np.exp((-x-k) / f)))
                    target_mean['target'] = target_mean['target'] * target_count['target'] + (1 - target_count['target']) * overall_mean

                    # allocate: map category value -> smoothed mean
                    # (DataFrame.to_dict() is keyed by column name and never matches a category)
                    tmp[val_idx] = self.train_df[c].iloc[val_idx].map(target_mean.set_index(c)['target']).values
                self.train_df[c] = tmp
                
                # replace categorical variable in test
                target_mean = data_tmp.groupby(c)['target'].mean()
                self.test_df.loc[:, c] = self.test_df[c].map(target_mean).values
            
            # no categoricals any more
            numerical_features = self.features.copy()
            self.categoricals = []
        
        # fill nan
        if self.model not in ['lgb', 'catb', 'xgb']:
            # fill NaN (numerical features -> median, categorical features -> mode)
            self.train_df[numerical_features] = self.train_df[numerical_features].replace([np.inf, -np.inf], np.nan)
            self.test_df[numerical_features] = self.test_df[numerical_features].replace([np.inf, -np.inf], np.nan)
            self.train_df[numerical_features] = self.train_df[numerical_features].fillna(self.train_df[numerical_features].median())
            self.test_df[numerical_features] = self.test_df[numerical_features].fillna(self.test_df[numerical_features].median())
            if self.categoricals:
                # guarded: .mode().iloc[0] has no first row for an empty column selection
                self.train_df[self.categoricals] = self.train_df[self.categoricals].fillna(self.train_df[self.categoricals].mode().iloc[0])
                self.test_df[self.categoricals] = self.test_df[self.categoricals].fillna(self.test_df[self.categoricals].mode().iloc[0])
      
        # scaling, if necessary
        if self.scaler is not None:
            # to normal
            pt = QuantileTransformer(n_quantiles=100, random_state=self.seed, output_distribution="normal")
            self.train_df[numerical_features] = pt.fit_transform(self.train_df[numerical_features])
            self.test_df[numerical_features] = pt.transform(self.test_df[numerical_features])

            # standardize
            if self.scaler == "MinMax":
                scaler = MinMaxScaler()
            else:  # "Standard" (validated at the top of fit)
                scaler = StandardScaler()
            self.train_df[numerical_features] = scaler.fit_transform(self.train_df[numerical_features])
            self.test_df[numerical_features] = scaler.transform(self.test_df[numerical_features])

        # Test inputs. The NN takes one input per categorical plus one numeric
        # block, whether or not a scaler is used, mirroring x_train/x_val below.
        if self.model == "nn":
            x_test = [np.absolute(self.test_df[c]) for c in self.categoricals] + [self.test_df[numerical_features]]
        else:
            x_test = self.test_df[self.features]
        
        # fitting with out of fold
        cv = self.get_cv()
        for fold, (train_idx, val_idx) in enumerate(cv):
            # train test split
            x_train, x_val = self.train_df[self.features].iloc[train_idx], self.train_df[self.features].iloc[val_idx]
            y_train, y_val = self.train_df[self.target].iloc[train_idx], self.train_df[self.target].iloc[val_idx]

            if self.model == "nn":
                x_train = [np.absolute(x_train[i]) for i in self.categoricals] + [x_train[numerical_features]]
                x_val = [np.absolute(x_val[i]) for i in self.categoricals] + [x_val[numerical_features]]

            # model fitting
            train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val)
            model, importance = self.train_model(train_set, val_set)
            fi[fold, :] = importance
            y_vals[val_idx] = y_val

            # predictions and check cv score; test predictions are averaged over folds
            oofs, ypred = get_oof_ypred(model, x_val, x_test, self.model, self.task)
            y_pred += ypred.reshape(y_pred.shape) / self.n_splits
            if self.task == "multiclass":
                oof_pred[val_idx, :] = oofs.reshape(oof_pred[val_idx, :].shape)
                print('Partial score of fold {} is: {}'.format(fold, self.calc_metric(y_vals[val_idx], 
                    np.argmax(oof_pred[val_idx, :], axis=1))))
            elif self.task == 'custom':
                oof_pred[val_idx, :] = oofs.reshape(oof_pred[val_idx, :].shape)   
                print('Partial score of fold {} is: {}'.format(fold, self.calc_metric(y_vals[val_idx, :], 
                    oof_pred[val_idx, :])))
            else:
                oof_pred[val_idx] = oofs.reshape(oof_pred[val_idx].shape)
                print('Partial score of fold {} is: {}'.format(fold, self.calc_metric(y_vals[val_idx], 
                    oof_pred[val_idx])))

        # feature importance data frame: one row per (feature, fold) plus the mean over folds
        fi_df = pd.DataFrame()
        for n in np.arange(self.n_splits):
            tmp = pd.DataFrame()
            tmp["features"] = self.features
            tmp["importance"] = fi[n, :]
            tmp["fold"] = n
            fi_df = pd.concat([fi_df, tmp], ignore_index=True)
        gfi = fi_df[["features", "importance"]].groupby(["features"]).mean().reset_index()
        fi_df = fi_df.merge(gfi, on="features", how="left", suffixes=('', '_mean'))

        # outputs
        if self.task == "multiclass":
            loss_score = self.calc_metric(y_vals, np.argmax(oof_pred, axis=1))
        else:
            loss_score = self.calc_metric(y_vals, oof_pred)

        if self.verbose:
            print('Our oof loss score is: ', loss_score)
        return y_pred, loss_score, model, oof_pred, y_vals, fi_df

    def plot_feature_importance(self, rank_range=[1, 50]):
        """
        function for plotting feature importance (the NN model reports all-zero importances)
        :rank_range: [first, last] 1-based ranks of the features to plot, ordered by mean importance
        :return: the sorted slice of the feature importance data frame that was plotted
        :EXAMPLE:
        # fit LGB regression model
        model = RunModel(train_df, test_df, target, features, categoricals=categoricals,
                model="lgb", task="regression", n_splits=4, cv_method="KFold", 
                group=None, seed=1220, scaler=None)
        
        # plot 
        fi_df = model.plot_feature_importance(rank_range=[1, 100])
        
        """
        # plot feature importance; every feature occupies n_splits rows (one per fold)
        _, ax = plt.subplots(1, 1, figsize=(10, 20))
        sorted_df = self.fi_df.sort_values(by = "importance_mean", ascending=False).reset_index().iloc[self.n_splits * (rank_range[0]-1) : self.n_splits * rank_range[1]]
        sns.barplot(data=sorted_df, x ="importance", y ="features", orient='h')
        ax.set_xlabel("feature importance")
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        return sorted_df

# Fitting

In [None]:
# Every test column except the two identifier columns is used as a model feature.
not_use_cols = ['id', 'id_seqpos']
features = [c for c in test_data.columns if c not in not_use_cols]
print(features)

In [None]:
# Columns whose name contains 'sequence' or 'loop_type' are treated as
# categorical (they go through embedding layers in the NN).
categoricals = [f for f in test_data.columns if ('sequence' in f) | ('loop_type' in f)]

In [None]:
# Train the multi-output MLP ("custom" task, MCRMSE loss) with group-wise CV
# on 'id'; constructing RunModel runs the whole fit immediately.
mymodel = RunModel(train_data, test_data, target_cols, features, categoricals=categoricals,
            model="nn", params={}, task="custom", n_splits=NFOLD, cv_method="GroupKFold", 
            group='id', target_encoding=False, seed=SEED, scaler=SCALER)

# OOF, Submit files

In [None]:
# Collect out-of-fold predictions keyed by id_seqpos, and write the
# fold-averaged test predictions into the submission, one column per target.
oof_df = pd.DataFrame(train_data.id_seqpos)
for i, target in enumerate(target_cols):
    oof_df[target] = mymodel.oof[:, i]
    submission[target] = mymodel.y_pred[:, i]

In [None]:
# Persist both tables as CSV files in the working directory.
oof_df.to_csv('oof_df.csv', index=False)
submission.to_csv('submission.csv', index=False)
print('saved!')

The score is also not too bad given that this is a simple MLP model. This approach can be used in an ensemble along with other modeling approaches such as RNNs (Recurrent Neural Networks) and GNNs (Graph Neural Networks).