In [1]:
import torch, transformers, os, sys, gc, time, random, warnings, math, json, glob
import torch.nn as nn
import torch.nn.functional as F

import pandas as pd
import numpy as np
import cuml

from cuml.svm import SVR
from transformers import AutoTokenizer, AutoModel, AutoConfig, DataCollatorWithPadding
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

from torch import Tensor
from sklearn.linear_model import LinearRegression

warnings.filterwarnings("ignore")
%env TOKENIZERS_PARALLELISM=false
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

env: TOKENIZERS_PARALLELISM=false


In [2]:
""" Test Configuration """

# CV Score: 0.4528
class CFG1:
    """ Inference settings: deberta-v3-large student, MeanPooling head, SmoothL1Loss, token_len=1536 """
    # -- backbone, tokenizer and checkpoint files --
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    tokenizer = AutoTokenizer.from_pretrained(model)
    model_list = glob.glob('/kaggle/input/fbp3-meanpooling-max-length-1536/*.pth')

    # -- head / recipe labels --
    pooling = 'MeanPooling'  # options noted by the author: mean, attention, max, weightedlayer, concat, conv1d, lstm
    loss_fn = 'SmoothL1Loss'
    num_freeze = 4
    num_reinit = 2
    n_folds = 5
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

    # -- tokenisation / batching --
    max_len = 1536  # Original: 1536
    batch_size = 8

    # -- runtime --
    seed = 42
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_gpu = 1
    gpu_id = 0
    num_workers = 0

    # -- presumably the blend weight of this model in the ensemble (consumer not shown here) --
    weight = 1.0
    
# CV Score: 0.447
class CFG2:
    """ Inference settings: deberta-v3-large student, GEMPooling head, SmoothL1Loss, token_len=1536 """
    # -- backbone, tokenizer and checkpoint files --
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    tokenizer = AutoTokenizer.from_pretrained(model)
    model_list = glob.glob('/kaggle/input/fbp3-gempooling-max-len-1536-04476/*.pth')

    # -- head / recipe labels --
    pooling = 'GEMPooling'  # options noted by the author: mean, attention, max, weightedlayer, concat, conv1d, lstm
    loss_fn = 'SmoothL1Loss'
    num_freeze = 4
    num_reinit = 2
    n_folds = 5
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

    # -- tokenisation / batching --
    max_len = 1536  # Original: 1536
    batch_size = 8

    # -- runtime --
    seed = 42
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_gpu = 1
    gpu_id = 0
    num_workers = 0

    # -- presumably the blend weight of this model in the ensemble (consumer not shown here) --
    weight = 1.2
        
# CV Score: 0.4545
class CFG3:
    """ Inference settings: deberta-v3-large student, WeightedLayerPooling head, SmoothL1Loss """
    # -- backbone, tokenizer and checkpoint files --
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    tokenizer = AutoTokenizer.from_pretrained(model)
    model_list = glob.glob('/kaggle/input/fbp3-weightedlayerpooling-04545/*.pth')

    # -- head / recipe labels --
    pooling = 'WeightedLayerPooling'  # options noted by the author: mean, attention, max, weightedlayer, concat, conv1d, lstm
    loss_fn = 'SmoothL1Loss'
    num_freeze = 4
    num_reinit = 2
    n_folds = 5
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

    # -- tokenisation / batching --
    # NOTE(review): the previous docstring advertised token_len=1024 while the value here is 1536 - confirm
    # which length these checkpoints were trained with.
    max_len = 1536  # Original: 1536
    batch_size = 8

    # -- runtime --
    seed = 42
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_gpu = 1
    gpu_id = 0
    num_workers = 0

    # -- presumably the blend weight of this model in the ensemble (consumer not shown here) --
    weight = 0.8
    
class CFG4:
    """ Inference settings: Meta Pseudo Label student checkpoint used without further fine-tuning """
    # -- backbone, tokenizer and checkpoint files (the glob targets one exact file) --
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    tokenizer = AutoTokenizer.from_pretrained(model)
    model_list = glob.glob('/kaggle/input/fbp3-meta-pseudo-labels-model/4538_MPL_Student_microsoft-deberta-v3-large_state_dict.pth')

    # -- head / recipe labels --
    pooling = 'MeanPooling'  # options noted by the author: mean, attention, max, weightedlayer, concat, conv1d, lstm
    # NOTE(review): this is a pooling name stored under loss_fn - looks like a copy/paste slip; confirm
    # the intended loss before anything starts reading this field.
    loss_fn = 'WeightedLayerPooling'
    gradient_checkpointing = False
    num_freeze = 4
    num_reinit = 2
    n_fold = 5  # NOTE(review): CFG1-CFG3 spell this attribute `n_folds`
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

    # -- tokenisation / batching --
    max_len = 512
    batch_size = 32

    # -- runtime --
    seed = 42
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_gpu = 1
    gpu_id = 0
    num_workers = 0

    # -- presumably the blend weight of this model in the ensemble (consumer not shown here) --
    weight = 0.6

    
class CFG5:
    """ Inference settings: models fine-tuned starting from the Meta Pseudo Label student """
    # -- backbone, tokenizer and explicitly listed fold checkpoints --
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    tokenizer = AutoTokenizer.from_pretrained(model)
    model_list = [
        '/kaggle/input/fbp3-meta-pseudo-labels-model/SWA_fold0_MeanPooling_microsoft-deberta-v3-large_state_dict.pth',
        '/kaggle/input/fbp3-meta-pseudo-labels-model/fold1_MeanPooling_microsoft-deberta-v3-large_state_dict.pth',
        '/kaggle/input/fbp3-meta-pseudo-labels-model/SWA_fold2_MeanPooling_microsoft-deberta-v3-large_state_dict.pth',
        '/kaggle/input/fbp3-meta-pseudo-labels-model/fold3_MeanPooling_microsoft-deberta-v3-large_state_dict.pth',
        '/kaggle/input/fbp3-meta-pseudo-labels-model/fold4_MeanPooling_microsoft-deberta-v3-large_state_dict.pth',
    ]

    # -- head / recipe labels --
    pooling = 'MeanPooling'  # options noted by the author: mean, attention, max, weightedlayer, concat, conv1d, lstm
    loss_fn = 'SmoothL1Loss'
    gradient_checkpointing = False
    num_freeze = 4
    num_reinit = 2
    n_fold = 10  # NOTE(review): only five checkpoints are listed above - confirm this count
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

    # -- tokenisation / batching --
    max_len = 512
    batch_size = 32

    # -- runtime --
    seed = 42
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_gpu = 1
    gpu_id = 0
    num_workers = 0

    # -- presumably the blend weight of this model in the ensemble (consumer not shown here) --
    weight = 0.8

class CFG6:
    """ Inference settings: non-MPL fine-tuned deberta-v3-large ('0925 ... unscale' checkpoints) """
    # -- backbone, tokenizer and checkpoint files --
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    tokenizer = AutoTokenizer.from_pretrained(model)
    model_list = glob.glob('/kaggle/input/0925-deberta-v3-large-unscale/*.pth')

    # -- head / recipe labels --
    pooling = 'MeanPooling'  # options noted by the author: mean, attention, max, weightedlayer, concat, conv1d, lstm
    loss_fn = 'SmoothL1Loss'
    gradient_checkpointing = False
    num_freeze = 4
    num_reinit = 2
    n_fold = 10
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

    # -- tokenisation / batching --
    max_len = 512
    batch_size = 16

    # -- runtime --
    seed = 42
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_gpu = 1
    gpu_id = 0
    num_workers = 0

    # -- presumably the blend weight of this model in the ensemble (consumer not shown here) --
    weight = 1.2
    
class CFG7:
    """ Inference settings: non-MPL fine-tuned deberta-v3-large ('0926 ... unscale' checkpoints) """
    # -- backbone, tokenizer and checkpoint files --
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    tokenizer = AutoTokenizer.from_pretrained(model)
    model_list = glob.glob('/kaggle/input/0926-deberta-v3-large-unscale/*.pth')

    # -- head / recipe labels --
    pooling = 'MeanPooling'  # options noted by the author: mean, attention, max, weightedlayer, concat, conv1d, lstm
    loss_fn = 'SmoothL1Loss'
    gradient_checkpointing = False
    num_freeze = 4
    num_reinit = 2
    n_fold = 10
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

    # -- tokenisation / batching --
    max_len = 512
    batch_size = 16

    # -- runtime --
    seed = 42
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_gpu = 1
    gpu_id = 0
    num_workers = 0

    # -- presumably the blend weight of this model in the ensemble (consumer not shown here) --
    weight = 1.2
    
class CFG8:
    """ Inference settings: non-MPL fine-tuned deberta-v3-large ('0927 ... unscale' checkpoints) """
    # -- backbone, tokenizer and checkpoint files --
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    tokenizer = AutoTokenizer.from_pretrained(model)
    model_list = glob.glob('/kaggle/input/0927-deberta-v3-large-unscale/*.pth')

    # -- head / recipe labels --
    pooling = 'MeanPooling'  # options noted by the author: mean, attention, max, weightedlayer, concat, conv1d, lstm
    loss_fn = 'SmoothL1Loss'
    gradient_checkpointing = False
    num_freeze = 4
    num_reinit = 2
    n_fold = 10
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

    # -- tokenisation / batching --
    max_len = 512
    batch_size = 16

    # -- runtime --
    seed = 42
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_gpu = 1
    gpu_id = 0
    num_workers = 0

    # -- presumably the blend weight of this model in the ensemble (consumer not shown here) --
    weight = 1.2
    
class CFG9:
    """ Inference settings: non-MPL fine-tuned deberta-v3-large ('0911' checkpoints) """
    # -- backbone, tokenizer and checkpoint files --
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    tokenizer = AutoTokenizer.from_pretrained(model)
    model_list = glob.glob('/kaggle/input/0911-deberta-v3-large/*.pth')

    # -- head / recipe labels --
    pooling = 'MeanPooling'  # options noted by the author: mean, attention, max, weightedlayer, concat, conv1d, lstm
    loss_fn = 'SmoothL1Loss'
    gradient_checkpointing = False
    num_freeze = 4
    num_reinit = 2
    n_fold = 10
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

    # -- tokenisation / batching --
    max_len = 512
    batch_size = 16

    # -- runtime --
    seed = 42
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_gpu = 1
    gpu_id = 0
    num_workers = 0

    # -- presumably the blend weight of this model in the ensemble (consumer not shown here) --
    weight = 1.0
    
class CFG10:
    """ Inference settings: non-MPL fine-tuned deberta-v3-large ('0914 ... fgm' checkpoints) """
    # -- backbone, tokenizer and checkpoint files --
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'
    tokenizer = AutoTokenizer.from_pretrained(model)
    model_list = glob.glob('/kaggle/input/0914-deberta-v3-large-fgm/*.pth')

    # -- head / recipe labels --
    pooling = 'MeanPooling'  # options noted by the author: mean, attention, max, weightedlayer, concat, conv1d, lstm
    loss_fn = 'SmoothL1Loss'
    gradient_checkpointing = False
    num_freeze = 4
    num_reinit = 2
    n_fold = 10
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

    # -- tokenisation / batching --
    max_len = 512
    batch_size = 16

    # -- runtime --
    seed = 42
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_gpu = 1
    gpu_id = 0
    num_workers = 0

    # -- presumably the blend weight of this model in the ensemble (consumer not shown here) --
    weight = 1.0
    
class CFG11:
    """ Inference settings: non-MPL fine-tuned deberta-v3-base ('0911' checkpoints) """
    # -- backbone, tokenizer and checkpoint files --
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-base'
    tokenizer = AutoTokenizer.from_pretrained(model)
    model_list = glob.glob('/kaggle/input/0911-deberta-v3-base/*.pth')

    # -- head / recipe labels --
    pooling = 'MeanPooling'  # options noted by the author: mean, attention, max, weightedlayer, concat, conv1d, lstm
    loss_fn = 'SmoothL1Loss'
    gradient_checkpointing = False
    num_freeze = 4
    num_reinit = 2
    n_fold = 10
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

    # -- tokenisation / batching --
    max_len = 512
    batch_size = 16

    # -- runtime --
    seed = 42
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_gpu = 1
    gpu_id = 0
    num_workers = 0

    # -- presumably the blend weight of this model in the ensemble (consumer not shown here) --
    weight = 1.0
    
class CFG12:
    """ Inference settings: non-MPL fine-tuned deberta-v3-base ('0913 ... fgm' checkpoints) """
    # -- backbone, tokenizer and checkpoint files --
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-base'
    tokenizer = AutoTokenizer.from_pretrained(model)
    model_list = glob.glob('/kaggle/input/0913-deberta-v3-base-fgm/*.pth')

    # -- head / recipe labels --
    pooling = 'MeanPooling'  # options noted by the author: mean, attention, max, weightedlayer, concat, conv1d, lstm
    loss_fn = 'SmoothL1Loss'
    gradient_checkpointing = False
    num_freeze = 4
    num_reinit = 2
    n_fold = 10
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

    # -- tokenisation / batching --
    max_len = 512
    batch_size = 16

    # -- runtime --
    seed = 42
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_gpu = 1
    gpu_id = 0
    num_workers = 0

    # -- presumably the blend weight of this model in the ensemble (consumer not shown here) --
    weight = 1.0
    
class CFG13:
    """ Inference settings: non-MPL fine-tuned deberta-v2-xlarge ('0911' checkpoints) """
    # -- backbone, tokenizer and checkpoint files --
    model = '/kaggle/input/huggingface-automodel-save/deberta-v2-xlarge'
    tokenizer = AutoTokenizer.from_pretrained(model)
    model_list = glob.glob('/kaggle/input/0911-deberta-v2-xlarge/*.pth')

    # -- head / recipe labels --
    pooling = 'MeanPooling'  # options noted by the author: mean, attention, max, weightedlayer, concat, conv1d, lstm
    loss_fn = 'SmoothL1Loss'
    gradient_checkpointing = False
    num_freeze = 4
    num_reinit = 2
    n_fold = 10
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

    # -- tokenisation / batching --
    max_len = 512
    batch_size = 16

    # -- runtime --
    seed = 42
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_gpu = 1
    gpu_id = 0
    num_workers = 0

    # -- presumably the blend weight of this model in the ensemble (consumer not shown here) --
    weight = 1.0
    
class CFG14:
    """ Inference settings: non-MPL fine-tuned deberta-v2-xlarge ('0919' checkpoints) """
    # -- backbone, tokenizer and checkpoint files --
    model = '/kaggle/input/huggingface-automodel-save/deberta-v2-xlarge'
    tokenizer = AutoTokenizer.from_pretrained(model)
    model_list = glob.glob('/kaggle/input/0919-deberta-v2-xlarge/*.pth')

    # -- head / recipe labels --
    pooling = 'MeanPooling'  # options noted by the author: mean, attention, max, weightedlayer, concat, conv1d, lstm
    loss_fn = 'SmoothL1Loss'
    gradient_checkpointing = False
    num_freeze = 4
    num_reinit = 2
    n_fold = 10
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

    # -- tokenisation / batching --
    max_len = 512
    batch_size = 16

    # -- runtime --
    seed = 42
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_gpu = 1
    gpu_id = 0
    num_workers = 0

    # -- presumably the blend weight of this model in the ensemble (consumer not shown here) --
    weight = 1.0

In [3]:
""" Helper Function """

def check_device() -> bool:
    """
    Report whether the Apple-Silicon MPS backend can be used.

    Returns:
        True when torch reports an available MPS device, otherwise False
        (also False on torch builds that expose no `torch.backends.mps` at all)
    """
    # torch.backends.mps.is_available() is the documented availability probe;
    # the getattr guard keeps builds without an MPS backend from raising AttributeError.
    mps_backend = getattr(torch.backends, 'mps', None)
    return bool(mps_backend is not None and mps_backend.is_available())

def check_library(checker: bool) -> tuple:
    """
    Query the cuDNN status when the current device is CUDA.

    Args:
        checker: device flag
            1) checker == True
                - current device is mps, nothing is queried
            2) checker == False
                - current device is cuda with cudnn
    Returns:
        (is_available, is_enabled, version) of cuDNN when checker is False,
        None when checker is True
    """
    if checker:
        return None
    _is_built = torch.backends.cudnn.is_available()
    # the original line read `cudnn.enabledtorch.backends.cudnn.enabled` (duplicated paste) and raised AttributeError
    _is_enable = torch.backends.cudnn.enabled
    version = torch.backends.cudnn.version()
    return _is_built, _is_enable, version

def class2dict(cfg) -> dict:
    """
    Snapshot a config object as a plain dict.

    Every name reported by dir(cfg) that does not start with a double underscore
    is kept (this includes methods and single-underscore names).
    """
    return {
        attr: getattr(cfg, attr)
        for attr in dir(cfg)
        if attr[:2] != '__'
    }


def all_type_seed(cfg, checker: bool) -> None:
    """
    Seed every random number source used by the notebook with cfg.seed.

    Args:
        cfg: object exposing an integer `seed` attribute
        checker: when False the CUDA generators are seeded too and cuDNN is
                 switched to deterministic / non-benchmark mode and then disabled;
                 when True only the Python, NumPy and torch CPU seeds are set
    """
    seed_value = cfg.seed

    # interpreter hash seed + the three CPU-side generators
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    if checker:
        return

    # device == cuda: current GPU first, then every visible GPU
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # cuDNN switches
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False

def seed_worker(worker_id) -> None:
    """
    DataLoader `worker_init_fn`: reseed NumPy and `random` from torch's initial seed.

    The torch seed is folded into 32 bits before use; `worker_id` itself is not read.
    """
    derived_seed = torch.initial_seed() % (2 ** 32)
    for reseed in (np.random.seed, random.seed):
        reseed(derived_seed)
    

# Both helpers are called with checker=True, the branch they treat as "not CUDA":
# check_library(True) returns without querying cuDNN ...
check_library(True)
# ... and all_type_seed only seeds Python / NumPy / torch-CPU, skipping the CUDA + cuDNN block.
# NOTE(review): the CFG classes select 'cuda' when available - confirm that skipping the CUDA seeds is intended.
all_type_seed(CFG3, True)

# Shared torch.Generator seeded from CFG3.seed; test_loop passes it to FBPTest.
g = torch.Generator()
g.manual_seed(CFG3.seed)

<torch._C.Generator at 0x71256bcdfd30>

In [4]:
""" Data Utils """

def load_data(data_path: str) -> pd.DataFrame:
    """
    Read a csv file (train.csv, test.csv, val.csv, ...) into a DataFrame.

    Args:
        data_path: location handed straight to pandas.read_csv
    """
    return pd.read_csv(data_path)

def text_preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace paragraph breaks in the FB3 essays with an explicit marker.

    Each '\n\n' inside df['full_text'] becomes '[PARAGRAPH] ' (per the author's note,
    DeBERTa does not handle '\n\n' well). The frame is modified in place - column
    rewritten, index reset to 0..n-1 - and the same object is returned.
    """
    df['full_text'] = [
        essay.replace('\n\n', '[PARAGRAPH] ')
        for essay in df['full_text'].values.tolist()
    ]
    df.reset_index(drop=True, inplace=True)
    return df

def create_word_normalizer():
    """
    Build and return a `normalize(word)` closure.

    The closure lower-cases the word, lemmatizes it, then stems it.
    NOTE(review): PorterStemmer / WordNetLemmatizer are not imported in the visible
    part of this notebook - presumably they come from nltk; confirm before calling.
    """
    stemmer = PorterStemmer()
    wordnet = WordNetLemmatizer()

    def normalize(word):
        return stemmer.stem(wordnet.lemmatize(word.lower()))

    return normalize

def __normalize_words(titles: list) -> list:
    """
    Normalize a list of words
    1) Drop entries found in the English stop-word list
    2) Run the survivors through the normalizer built by create_word_normalizer

    NOTE(review): `stopwords` is not imported in the visible part of this notebook -
    presumably nltk.corpus.stopwords; confirm before calling.
    """
    english_stops = set(stopwords.words('english'))
    normalize = create_word_normalizer()
    kept = []
    for title in titles:
        if title in english_stops:
            continue
        kept.append(normalize(title))
    return kept

def normalize_words(words: np.ndarray, unique=True) -> list:
    """
    Tokenize and normalize a collection of strings
    1) Join the inputs, lower-case them and split on whitespace / brackets / punctuation
    2) Keep tokens of at least 3 characters that do not start with a digit
    3) Apply __normalize_words (stop-word removal + lemmatize + stem)

    Args:
        words: a single string or an iterable of strings
        unique: when True duplicates are removed before and after normalization
    Returns:
        a set of normalized tokens when `unique` is True, otherwise a list
        (order and duplicates preserved)
    """
    # local import: `re` is used below but is not among the notebook's top-level imports
    import re

    if isinstance(words, str):
        words = [words]
    sep_re = r'[\s\(\){}\[\];,\.]+'
    num_re = r'\d'
    tokens = re.split(sep_re, ' '.join(words).lower())
    tokens = [w for w in tokens if len(w) >= 3 and not re.match(num_re, w)]
    if unique:
        return set(__normalize_words(list(set(tokens))))
    return __normalize_words(tokens)

def collate(inputs):
    """
    Trim a padded batch down to its longest real sequence.

    The largest per-row sum of inputs["attention_mask"] gives the cut-off; every
    tensor in `inputs` is sliced to that many columns. The mapping is updated
    in place and returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key, tensor in inputs.items():
        inputs[key] = tensor[:, :longest]
    return inputs

def get_name(cfg) -> str:
    """ Return cfg.model with every '/' turned into '-'; on ValueError the raw value is returned """
    try:
        return cfg.model.replace('/', '-')
    except ValueError:
        return cfg.model

In [5]:
""" Pooling & Model Utils """

# Generalized Mean (GeM) Pooling
class GEMPooling(nn.Module):
    """
    Generalized Mean (GeM) Pooling over the token axis, ported from computer vision code.
    Token embeddings are raised to the power p before averaging, so larger activations
    weigh more; with p=1 this is plain masked mean pooling.

    [Reference]
    https://paperswithcode.com/method/generalized-mean-pooling
    """
    def __init__(self, auto_cfg):
        super().__init__()

    @staticmethod
    def forward(last_hidden_state, attention_mask, p: int = 4) -> Tensor:
        """
        1) Broadcast the attention mask from [batch_size, max_len] to the shape of last_hidden_state
        2) Zero the padded tokens, raise to the p-th power and sum along max_len
        3) Divide by the number of attended tokens (clamped to at least 1e-9)
        4) Take the p-th root
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        powered_sum = (last_hidden_state * mask).pow(p).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1e-9)
        return (powered_sum / token_count).pow(1 / p)


# WeightedLayerPooling: Use Intermediate Layer's Embedding
class WeightedLayerPooling(nn.Module):
    """
    Weighted Layer Pooling followed by masked Mean Pooling.
    The hidden states from `layer_start` onwards are blended with per-layer weights,
    then the blended token embeddings are averaged over the attended tokens.
    Args:
        auto_cfg: AutoConfig of the backbone, only `num_hidden_layers` is read
        layer_start: first index of the hidden-state stack that takes part, default 12
        layer_weights: per-layer weights; when None a learnable vector of ones with
                       num_hidden_layers + 1 - layer_start entries is created
    """
    def __init__(self, auto_cfg, layer_start: int = 12, layer_weights=None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = auto_cfg.num_hidden_layers
        if layer_weights is None:
            num_pooled = self.num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.tensor([1] * num_pooled, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states, attention_mask) -> Tensor:
        # stack every layer on a new leading axis, keep the layers from layer_start on
        stacked = torch.stack(list(all_hidden_states), dim=0)[self.layer_start:]
        per_layer = self.layer_weights[:, None, None, None]
        blended = (per_layer * stacked).sum(dim=0) / self.layer_weights.sum()

        # masked mean over the token axis; the clamp guards against division by zero
        mask = attention_mask.unsqueeze(-1).expand(blended.size()).float()
        token_sum = (blended * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1e-9)
        return token_sum / token_count

# Attention pooling
class AttentionPooling(nn.Module):
    """
    Learned attention pooling: a small MLP scores every token, padded tokens are
    excluded, and the softmax-weighted sum of token embeddings is returned.

    [Reference]
    <A STRUCTURED SELF-ATTENTIVE SENTENCE EMBEDDING>
    """
    def __init__(self, auto_cfg):
        super().__init__()
        hidden = auto_cfg.hidden_size
        self.attention = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.LayerNorm(hidden),
            nn.GELU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, last_hidden_state, attention_mask) -> Tensor:
        scores = self.attention(last_hidden_state).float()
        # padded positions get -inf so that softmax assigns them zero weight
        padded = (attention_mask == 0).unsqueeze(-1)
        scores = scores.masked_fill(padded, float('-inf'))
        token_weights = torch.softmax(scores, 1)
        return torch.sum(token_weights * last_hidden_state, dim=1)


# Mean Pooling
class MeanPooling(nn.Module):
    """ Masked mean over the token axis: padded tokens do not contribute to the average """
    def __init__(self, auto_cfg):
        super().__init__()

    @staticmethod
    def forward(last_hidden_state, attention_mask) -> Tensor:
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_sum = (last_hidden_state * mask).sum(dim=1)
        # clamp keeps a fully padded row from dividing by zero
        token_count = mask.sum(dim=1).clamp(min=1e-9)
        return token_sum / token_count


# Max Pooling
class MaxPooling(nn.Module):
    """ Element-wise maximum over the token axis; padded tokens are pushed down to -1e4 first """
    def __init__(self, auto_cfg):
        super().__init__()

    @staticmethod
    def forward(last_hidden_state, attention_mask):
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # masked_fill works on a copy, the caller's tensor is left untouched
        masked = last_hidden_state.masked_fill(mask == 0, -1e4)
        return masked.max(dim=1).values


# Min Pooling
class MinPooling(nn.Module):
    """ Element-wise minimum over the token axis; padded tokens are lifted to +1e4 so they never win """
    def __init__(self, auto_cfg):
        super(MinPooling, self).__init__()

    @staticmethod
    def forward(last_hidden_state, attention_mask):
        """
        Args:
            last_hidden_state: token embeddings, indexed [batch, token, feature]
            attention_mask: 1 for real tokens, 0 for padding, indexed [batch, token]
        Returns:
            per-feature minimum over the attended tokens of each sequence
        """
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        # Sentinel mirrors MaxPooling's -1e4. The previous fill value of 1e-4 sat below
        # ordinary positive activations, so padded positions leaked into the minimum.
        embeddings[input_mask_expanded == 0] = 1e4
        min_embeddings, _ = torch.min(embeddings, dim=1)
        return min_embeddings


# Convolution Pooling
class ConvPooling(nn.Module):
    """
    Two stacked 1-D convolutions over the token axis followed by a max over positions.
    Per the author: meant for filtering unwanted features such as Toxicity Text,
    Negative Comment...etc; kernel_size plays the role of a window size.

    [Reference]
    https://www.kaggle.com/code/rhtsingh/utilizing-transformer-representations-efficiently
    """
    def __init__(self, feature_size: int, kernel_size: int, padding_size: int):
        super().__init__()
        self.feature_size = feature_size
        self.kernel_size = kernel_size
        self.padding_size = padding_size
        conv_kwargs = dict(kernel_size=kernel_size, padding=padding_size)
        self.convolution = nn.Sequential(
            nn.Conv1d(feature_size, 256, **conv_kwargs),
            nn.ReLU(),
            nn.Conv1d(256, 1, **conv_kwargs),
        )

    def forward(self, last_hidden_state: Tensor) -> Tensor:
        # Conv1d wants channels first: (batch_size, feature_size, seq_len)
        channels_first = last_hidden_state.permute(0, 2, 1)
        return self.convolution(channels_first).max(dim=2).values


# LSTM Pooling
class LSTMPooling(nn.Module):
    """
    Run an LSTM across the per-layer first-token embeddings and keep its last step.

    Args:
        num_layers: number of hidden layers to read (entries 1..num_layers of the hidden-state stack)
        hidden_size: feature size of each hidden state
        hidden_dim_lstm: hidden size of the LSTM, i.e. feature size of the output

    [Reference]
    https://www.kaggle.com/code/rhtsingh/utilizing-transformer-representations-efficiently
    """
    def __init__(self, num_layers: int, hidden_size: int, hidden_dim_lstm):
        # NOTE: this method was spelled `__int__`, so the constructor never ran
        super().__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hidden_dim_lstm = hidden_dim_lstm
        self.lstm = nn.LSTM(
            self.hidden_size,
            self.hidden_dim_lstm,
            batch_first=True
        )
        self.dropout = nn.Dropout(0.1)

    def forward(self, all_hidden_states: list) -> Tensor:
        """
        Args:
            all_hidden_states: sequence with at least num_layers + 1 tensors indexed
                               [batch, token, feature]; entry 0 is skipped
                               (presumably the embedding output - confirm against caller)
        Returns:
            tensor of shape (batch, hidden_dim_lstm)
        """
        # Take token 0 of layers 1..num_layers. The old range stopped one layer short of
        # what the following view() expected, and .squeeze() dropped the batch axis when
        # batch_size == 1; stacking on dim=1 yields (batch, num_layers, hidden_size) directly.
        hidden_states = torch.stack(
            [all_hidden_states[layer_i][:, 0] for layer_i in range(1, self.num_hidden_layers + 1)],
            dim=1,
        )
        out, _ = self.lstm(hidden_states, None)
        out = self.dropout(out[:, -1, :])
        return out
    
def freeze(module) -> None:
    """
    Turn off gradient tracking for every parameter of `module`.

    [Example]
    freezing embeddings and first 2 layers of encoder
    1) freeze(model.embeddings)
    2) freeze(model.encoder.layer[:2])
    """
    for weight in module.parameters():
        weight.requires_grad_(False)

def get_freeze_parameters(module) -> list:
    """
    Collect the names of all parameters of `module` that have requires_grad == False.

    [Example]
    freezed_parameters = get_freeze_parameters(model)
    """
    return [
        name
        for name, weight in module.named_parameters()
        if not weight.requires_grad
    ]

def init_weights(auto_cfg, module) -> None:
    """
    Re-initialize one module in place, depending on its type.

    - nn.Linear: weights ~ N(0, auto_cfg.initializer_range), bias (if any) zeroed
    - nn.Embedding: weights ~ N(0, auto_cfg.initializer_range), padding row (if any) zeroed
    - nn.LayerNorm: bias zeroed, weight set to 1
    Any other module type is left untouched.
    """
    if isinstance(module, nn.Linear):
        module.weight.data.normal_(mean=0.0, std=auto_cfg.initializer_range)
        if module.bias is not None:
            module.bias.data.zero_()
        return

    if isinstance(module, nn.Embedding):
        module.weight.data.normal_(mean=0.0, std=auto_cfg.initializer_range)
        pad_row = module.padding_idx
        if pad_row is not None:
            module.weight.data[pad_row].zero_()
        return

    if isinstance(module, nn.LayerNorm):
        module.bias.data.zero_()
        module.weight.data.fill_(1.0)

def reinit_topk(model, num_layers) -> None:
    """
    Re-initialize the last-k transformer Encoder layers.
    Encoder Layer: Embedding, Attention Head, LayerNorm, Feed Forward
    Args:
        model: The target transformer model; must expose `encoder.layer` and `_init_weights`.
        num_layers: The number of layers to be re-initialized; 0 or a negative
                    value re-initializes nothing.
    """
    # Guard: `layer[-0:]` is the WHOLE list, so num_layers == 0 used to reset every layer.
    if num_layers <= 0:
        return
    model.encoder.layer[-num_layers:].apply(model._init_weights)
    
def postprocess(pseudo_label):
    """
    Post-process a teacher model's prediction (pseudo label): snap every value to
    the nearest grade on the grid 1.0, 1.5, ..., 5.0.

    Args:
        pseudo_label: tensor of predicted scores, overwritten in place
    Returns:
        the same tensor object, now holding only grid values
    """
    # Build the grid on the input's device so GPU predictions work as well.
    grid_dtype = pseudo_label.dtype if pseudo_label.is_floating_point() else torch.float
    label_dict = torch.arange(1, 5.5, 0.5, device=pseudo_label.device, dtype=grid_dtype)
    # argmin returns exactly one index per value; the former `== min(...)` + nonzero lookup
    # produced two indices for values exactly halfway between grades and then failed on assignment.
    nearest = (pseudo_label.unsqueeze(-1) - label_dict).abs().argmin(dim=-1)
    pseudo_label.copy_(label_dict[nearest])
    return pseudo_label


class RMSELoss(nn.Module):
    """
    Root Mean Squared Error: sqrt(MSE + eps).

    Args:
        reduction: forwarded to nn.MSELoss
        eps: added under the square root so an MSE of exactly 0 stays well-defined
    """
    def __init__(self, reduction, eps=1e-6):
        super().__init__()
        self.mse = nn.MSELoss(reduction=reduction)
        self.eps = eps

    def forward(self, yhat, y) -> Tensor:
        squared_error = self.mse(yhat, y)
        return torch.sqrt(squared_error + self.eps)

class MCRMSELoss(nn.Module):
    """
    Mean Column-wise RMSE: the RMSE of each of the first `num_scored` columns, averaged.

    Args:
        reduction: forwarded to RMSELoss
        num_scored: number of target columns that are scored
    """
    def __init__(self, reduction, num_scored=6):
        super().__init__()
        self.RMSE = RMSELoss(reduction=reduction)
        self.num_scored = num_scored

    def forward(self, yhat, y):
        # each column contributes rmse / num_scored; an empty range yields the int 0
        return sum(
            self.RMSE(yhat[:, col], y[:, col]) / self.num_scored
            for col in range(self.num_scored)
        )

In [6]:
""" Test Dataset Class """

class TestDataset(Dataset):
    """
    Inference-time dataset: tokenizes one essay per item.

    Args:
        cfg: config exposing `tokenizer` and `max_len`
        df: frame whose column at position 1 holds the text
            (presumably 'full_text' of test.csv - confirm against caller)
    """
    def __init__(self, cfg, df):
        super().__init__()
        self.cfg = cfg
        self.df = df

    def tokenizing(self, text):
        """ Tokenize `text`, padded/truncated to cfg.max_len, and turn every field into a tensor """
        encoded = self.cfg.tokenizer(
            text,
            max_length=self.cfg.max_len,
            padding='max_length',
            truncation=True,
            return_tensors=None,
            add_special_tokens=True,
        )
        # convert in place so the tokenizer's own container type is what gets returned
        for field in list(encoded.keys()):
            encoded[field] = torch.tensor(encoded[field])
        return encoded

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return self.tokenizing(self.df.iloc[idx, 1])

In [7]:
class TestModel(nn.Module):
    """
    Model class for Baseline Pipeline: pretrained backbone + pooling head + linear regressor.

    Args:
        cfg: config exposing `model` (backbone path), `pooling` (head name) and `num_reinit`
    Raises:
        ValueError: when cfg.pooling is not one of the supported head names
    """
    def __init__(self, cfg):
        super().__init__()
        # Resolve the pooling head first so a typo in cfg.pooling fails here with a clear
        # message instead of leaving `self.pooling` undefined until forward() is called.
        pooling_heads = {
            'MeanPooling': MeanPooling,
            'AttentionPooling': AttentionPooling,
            'WeightedLayerPooling': WeightedLayerPooling,
            'GEMPooling': GEMPooling,
        }
        if cfg.pooling not in pooling_heads:
            raise ValueError(
                f"Unsupported pooling '{cfg.pooling}', expected one of {sorted(pooling_heads)}"
            )

        self.cfg = cfg
        self.auto_cfg = AutoConfig.from_pretrained(
            cfg.model,
            output_hidden_states=True
        )
        self.model = AutoModel.from_pretrained(
            cfg.model,
            config=self.auto_cfg
        )
        self.fc = nn.Linear(self.auto_cfg.hidden_size, 6)
        self.pooling = pooling_heads[cfg.pooling](self.auto_cfg)

        self._init_weights(self.fc)
        # backbone: reset the last cfg.num_reinit encoder layers, then freeze all of its parameters
        reinit_topk(self.model, cfg.num_reinit)
        freeze(self.model)

    def _init_weights(self, module) -> None:
        """ over-ride initializes weights of the given module function (+initializes LayerNorm) """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.auto_cfg.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.auto_cfg.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            # reference from torch.nn.Layernorm with elementwise_affine=True
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()

    def feature(self, inputs: dict):
        """ Run the backbone on the tokenized batch and return its raw output object """
        outputs = self.model(**inputs)
        return outputs

    def forward(self, inputs: dict) -> Tensor:
        """ Pool the backbone output and map it to 6 regression outputs """
        outputs = self.feature(inputs)
        feature = outputs.last_hidden_state
        # WeightedLayerPooling consumes every layer's hidden state, not just the last one
        if self.cfg.pooling == 'WeightedLayerPooling':
            feature = outputs.hidden_states
        embedding = self.pooling(feature, inputs['attention_mask'])
        logit = self.fc(embedding)
        return logit

class NoMPLModel(nn.Module):
    """
    Backbone + MeanPooling + linear head, for the checkpoints trained without Meta Pseudo Labels.

    Args:
        cfg: config exposing `model` and `gradient_checkpointing`
        config_path: optional AutoConfig location; when None the config is read from
                     cfg.model and all four dropout settings are set to 0
        pretrained: load pretrained backbone weights (True) or build the backbone
                    from the config alone (False)
    """
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        if config_path is not None:
            self.config = AutoConfig.from_pretrained(config_path, output_hidden_states=True)
        else:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            for dropout_field in (
                'hidden_dropout',
                'hidden_dropout_prob',
                'attention_dropout',
                'attention_probs_dropout_prob',
            ):
                setattr(self.config, dropout_field, 0.)

        self.model = (
            AutoModel.from_pretrained(cfg.model, config=self.config)
            if pretrained
            else AutoModel.from_config(self.config)
        )
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        self.pool = MeanPooling(self.config)
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """ Type-dependent in-place initialization (Linear / Embedding / LayerNorm) """
        init_std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=init_std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=init_std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """ Mean-pool the first backbone output over the attended tokens """
        backbone_out = self.model(**inputs)
        return self.pool(backbone_out[0], inputs['attention_mask'])

    def forward(self, inputs):
        return self.fc(self.feature(inputs))

In [8]:
""" Make Test Loop's Input """

class FBPTest:
    """
    Helper that bundles test-set loading, DataLoader creation, checkpoint
    loading and the prediction loop for one configuration.

    Args:
        cfg: configuration object; `tokenizer`, `batch_size`, `num_workers`
            and `device` are read from it
        generator: passed to the DataLoader as its `generator` argument
    """
    def __init__(self, cfg, generator):
        self.cfg = cfg
        self.generator = generator
        # load_data / text_preprocess are defined elsewhere in this notebook.
        self.df = text_preprocess(load_data('/kaggle/input/feedback-prize-english-language-learning/test.csv'))
        self.tokenizer = self.cfg.tokenizer

    def make_batch(self):
        """ Build the (unshuffled) test DataLoader """
        test = self.df.reset_index(drop=True)

        # Custom Datasets
        # NOTE(review): the submission cell calls TestDataset(CFG, test) with two
        # arguments -- confirm which TestDataset signature is the current one.
        test_dataset = TestDataset(self.cfg, self.tokenizer, test)

        # DataLoader
        loader_test = DataLoader(
            test_dataset,
            batch_size=self.cfg.batch_size,
            shuffle=False,
            worker_init_fn=seed_worker,
            generator=self.generator,
            num_workers=self.cfg.num_workers,
            pin_memory=True,
            drop_last=False,
        )
        return loader_test

    def model_setting(self, model_path):
        """
        Instantiate TestModel, load the checkpoint at `model_path` and move the
        model to `cfg.device`.

        The checkpoint is deserialised onto the CPU first (the model itself is
        still on the CPU at that point), so it loads regardless of the device
        it was saved from; this matches how the submission cell loads weights.
        """
        model = TestModel(self.cfg)
        state_dict = torch.load(model_path, map_location=torch.device('cpu'))
        # NOTE(review): strict=False silently ignores missing/unexpected keys --
        # verify the checkpoint keys really match TestModel.
        model.load_state_dict(state_dict, strict=False)
        model.to(self.cfg.device)
        return model

    def test_fn(self, loader_test, model):
        """
        Test Function

        Returns:
            list with one numpy array of predictions per batch
        """
        model.eval()
        y_preds = []
        for inputs in tqdm(loader_test):
            # collate is defined elsewhere in this notebook
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(self.cfg.device)
            with torch.no_grad():
                preds = model(inputs)
            y_preds.append(preds.to('cpu').numpy())
        return y_preds

In [9]:
""" Test Loop """
def test_loop(cfg: any, model_path) -> pd.DataFrame:
    test_input = FBPTest(cfg, g)  # init object
    loader_test = test_input.make_batch()
    model = test_input.model_setting(model_path)
    y_preds = test_input.test_fn(loader_test, model)
    del test_input, loader_test, model
    gc.collect()
    return y_preds

In [10]:
def inference_fn(test_loader, model, device):
    """
    Predict every batch of `test_loader` with `model` on `device`.

    Each batch is a mapping of tensors; its values are moved to `device` in
    place and the whole mapping is handed to the model as a single argument.

    Returns:
        numpy array with the per-batch outputs concatenated along axis 0
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    progress = tqdm(test_loader, total=len(test_loader))
    for batch in progress:
        for key in list(batch.keys()):
            batch[key] = batch[key].to(device)
        with torch.no_grad():
            batch_outputs.append(model(batch).to('cpu').numpy())
    return np.concatenate(batch_outputs)

In [11]:
# """ Let's Inference """
# sample_submission_df = pd.read_csv('/kaggle/input/feedback-prize-english-language-learning/sample_submission.csv')
# # test_df = pd.read_csv('/kaggle/input/feedback-prize-english-language-learning/test.csv')
# submission_df = sample_submission_df.copy()
# # final_predictions = np.zeros((len(test_df), len(test_df.iloc[0,1:])))
# final_predictions = 0
# for idx, cfg in enumerate(tqdm(cfg_list)):
# #     cfg_predictions = np.zeros((len(test_df), len(test_df.iloc[0,1:])))
#     cfg_predictions = 0
#     for model_path in tqdm(cfg.model_list):
#         predictions = []
#         predictions.append(test_loop(cfg, model_path))
#         predictions = np.array(predictions).squeeze() #.mean(axis=0)
# #         cfg_predictions += np.add(cfg_predictions, predictions)
#         cfg_predictions += predictions
#     cfg_predictions = cfg_predictions * 1/cfg.n_folds
#     final_predictions += cfg_predictions    
# final_predictions = final_predictions * 1/len(cfg_list)
# submission_df[['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']] = final_predictions
# submission_df.reset_index(drop=True, inplace=True)
# submission_df.to_csv('submission.csv', index=False)

In [12]:
# Stage 1: for every configuration, average the predictions of all its fold
# checkpoints and write them to submission_<n>.csv (n = position in cfg_list + 1).
cfg_list = [CFG1, CFG2, CFG3, CFG4, CFG5, CFG6, CFG7, CFG8, CFG9, CFG10, CFG11, CFG12, CFG13, CFG14]

for _idx, CFG in enumerate(tqdm(cfg_list)):
    # Re-read the inputs for every config: `test` gets extra columns below and
    # both frames are deleted at the end of the iteration.
    test = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')
    submission = pd.read_csv('../input/feedback-prize-english-language-learning/sample_submission.csv')
    # sort by length to speed up inference
    # (token counts come from this config's own tokenizer, without truncation)
    test['tokenize_length'] = [len(CFG.tokenizer(text)['input_ids']) for text in test['full_text'].values]
    test = test.sort_values('tokenize_length', ascending=True).reset_index(drop=True)

    test_dataset = TestDataset(CFG, test)
    # padding='longest' pads each batch only up to its longest sequence.
    test_loader = DataLoader(
        test_dataset,
        batch_size=CFG.batch_size,
        shuffle=False,
        collate_fn=DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding='longest'),
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False
    )
    predictions = []
    for fold in tqdm(CFG.model_list):
        # The first five configs (positions 0-4) use TestModel and their
        # checkpoint file is the state dict itself; the remaining configs use
        # NoMPLModel and keep the state dict under the 'model' key.
        if _idx < 5:
            model = TestModel(CFG)
            state = torch.load(fold, map_location=torch.device('cpu'))
            model.load_state_dict(state)
        else:
            model = NoMPLModel(CFG)            
            state = torch.load(fold, map_location=torch.device('cpu'))
            model.load_state_dict(state['model'])
        prediction = inference_fn(test_loader, model, CFG.device)
        predictions.append(prediction)
        # Free the model before the next fold is loaded.
        del model, state, prediction; gc.collect()
        torch.cuda.empty_cache()
    # Unweighted mean over this config's fold checkpoints.
    predictions = np.mean(predictions, axis=0)
    # Predictions follow the length-sorted `test` order; merging on text_id
    # puts them back into the row order of sample_submission.
    test[CFG.target_cols] = predictions
    submission = submission.drop(columns=CFG.target_cols).merge(test[['text_id'] + CFG.target_cols], on='text_id', how='left')
    display(submission.head())
    submission[['text_id'] + CFG.target_cols].to_csv(f'submission_{_idx + 1}.csv', index=False)
    del test, submission, predictions, test_dataset, test_loader; gc.collect()
    torch.cuda.empty_cache() 

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.852191,2.814463,3.124552,2.925644,2.610543,2.707259
1,000BAD50D026,2.615537,2.498952,2.702053,2.355212,2.077511,2.603427
2,00367BB2546B,3.634399,3.531759,3.656657,3.721699,3.583319,3.433978


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.815866,2.75349,3.090136,2.930807,2.608538,2.639577
1,000BAD50D026,2.677987,2.433232,2.677811,2.417471,2.154791,2.610936
2,00367BB2546B,3.579234,3.460128,3.610293,3.676425,3.51815,3.454332


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.882582,2.747907,3.180613,2.900436,2.666481,2.673689
1,000BAD50D026,2.682862,2.498912,2.758105,2.458223,2.309281,2.580015
2,00367BB2546B,3.507451,3.397943,3.559095,3.514899,3.433376,3.39258


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.871905,2.812961,3.049408,2.916902,2.711879,2.707875
1,000BAD50D026,2.695753,2.542526,2.783838,2.460367,2.182987,2.660824
2,00367BB2546B,3.752302,3.609422,3.701724,3.732062,3.624899,3.474257


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.918238,2.794642,3.080143,2.943886,2.754395,2.717892
1,000BAD50D026,2.750212,2.524597,2.78121,2.435717,2.199605,2.661438
2,00367BB2546B,3.737241,3.521809,3.672317,3.703747,3.570585,3.436521


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.808707,2.722469,2.962899,2.831994,2.62227,2.624713
1,000BAD50D026,2.683296,2.532859,2.752749,2.45387,2.230445,2.712891
2,00367BB2546B,3.403564,3.26293,3.451626,3.40007,3.211327,3.166316


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.905139,2.752544,3.061124,2.887842,2.655701,2.661439
1,000BAD50D026,2.618809,2.476544,2.718756,2.394442,2.14419,2.609311
2,00367BB2546B,3.462291,3.321768,3.590484,3.530361,3.362374,3.153605


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.849788,2.700627,3.027683,2.888731,2.669844,2.622761
1,000BAD50D026,2.682458,2.451743,2.726842,2.40892,2.121978,2.678275
2,00367BB2546B,3.478536,3.295372,3.531762,3.515471,3.339186,3.160689


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.914353,2.736979,3.041348,2.928318,2.723809,2.654375
1,000BAD50D026,2.683547,2.490253,2.732694,2.434463,2.18336,2.691896
2,00367BB2546B,3.50602,3.289566,3.546155,3.524339,3.388935,3.196234


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.930743,2.749926,3.05219,2.940608,2.714919,2.670341
1,000BAD50D026,2.673043,2.483314,2.719683,2.422419,2.165154,2.67963
2,00367BB2546B,3.512811,3.306587,3.550534,3.526885,3.379601,3.199553


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.892292,2.764567,3.033052,2.924136,2.731107,2.655365
1,000BAD50D026,2.634624,2.440516,2.727708,2.377986,2.198006,2.607657
2,00367BB2546B,3.474135,3.367373,3.552581,3.51299,3.378452,3.297749


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.889106,2.760818,3.033244,2.926443,2.730445,2.657448
1,000BAD50D026,2.6285,2.431285,2.723917,2.376415,2.201954,2.603708
2,00367BB2546B,3.462257,3.351451,3.533688,3.494831,3.363663,3.283415


Token indices sequence length is longer than the specified maximum sequence length for this model (897 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.821333,2.717752,2.960445,2.905563,2.694906,2.63946
1,000BAD50D026,2.757675,2.561463,2.74514,2.465197,2.213324,2.758724
2,00367BB2546B,3.507787,3.350252,3.6055,3.552052,3.427625,3.342166


Token indices sequence length is longer than the specified maximum sequence length for this model (897 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.94549,2.733668,3.020585,2.929958,2.711853,2.664115
1,000BAD50D026,2.757414,2.56356,2.786097,2.492231,2.28046,2.686676
2,00367BB2546B,3.633416,3.379511,3.617248,3.536336,3.321223,3.336801


In [13]:
""" Two Stage Model: Pretrained & FineTuned Model with cuML SVR Regression Head """
# Load train and test, tagging each frame with its origin in a "src" column.
dftr = pd.read_csv("/kaggle/input/feedback-prize-english-language-learning/train.csv")
dftr["src"]="train"
dfte = pd.read_csv("/kaggle/input/feedback-prize-english-language-learning/test.csv")
dfte["src"]="test"
print('Train shape:',dftr.shape,'Test shape:',dfte.shape,'Test columns:',dfte.columns)
# Combined frame (train rows first, then test rows).
df = pd.concat([dftr,dfte],ignore_index=True)
target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
# NOTE(review): not the last expression of the cell, so nothing is displayed.
dftr.head()
# Assign every train row to one of FOLDS validation folds, stratified on the
# six target columns.
FOLDS = 25
skf = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
for i,(train_index, val_index) in enumerate(skf.split(dftr,dftr[target_cols])):
    dftr.loc[val_index,'FOLD'] = i
print('Train samples per fold:')
# NOTE(review): result is neither printed nor the last expression of the
# cell, so the per-fold counts announced above are not shown.
dftr.FOLD.value_counts()

def mean_pooling(model_output, attention_mask):
    """
    Masked mean of the last hidden state over the sequence axis (dim 1).

    Args:
        model_output: object exposing `last_hidden_state`; it is detached and
            moved to the CPU before pooling
        attention_mask: mask with one entry per token; it is combined with the
            CPU hidden states as given, so callers pass a CPU tensor

    Returns:
        CPU tensor with one pooled vector per sequence; positions whose mask is
        0 do not contribute, and a fully masked sequence yields zeros
    """
    hidden = model_output.last_hidden_state.detach().cpu()
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    masked_sum = (hidden * mask).sum(1)
    # clamp guards against division by zero for fully masked rows
    token_count = mask.sum(1).clamp(min=1e-9)
    return masked_sum / token_count

# WeightedLayerPooling: Use Intermediate Layer's Embedding
class WeightedLayerPooling(nn.Module):
    """
    Weighted Layer Pooling followed by masked Mean Pooling.

    The hidden states from index `layer_start` onwards are combined into one
    weighted average, and that average is mean-pooled over the tokens selected
    by the attention mask (instead of taking only the [CLS] token).
    The selected hidden states are detached and moved to the CPU first, so the
    result is a CPU tensor and the mask is expected on the CPU as well.

    Args:
        auto_cfg: config object; only `num_hidden_layers` is read
        layer_start: index of the first hidden state to use, default 12
        layer_weights: weights for the selected layers, default None; in that
            case a trainable vector of ones with
            `num_hidden_layers + 1 - layer_start` entries is created
    """
    def __init__(self, auto_cfg, layer_start: int = 12, layer_weights=None) -> None:
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = auto_cfg.num_hidden_layers
        if layer_weights is None:
            n_selected = self.num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(
                torch.tensor([1] * n_selected, dtype=torch.float)
            )
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states, attention_mask) -> Tensor:
        # [n_layers, batch, seq, hidden] -> keep layer_start.., off the graph and on CPU
        layers = torch.stack(list(all_hidden_states), dim=0)
        layers = layers[self.layer_start:, :, :, :].detach().cpu()

        # weighted average across the kept layers
        per_layer = self.layer_weights[:, None, None, None].expand(layers.size())
        blended = (per_layer * layers).sum(dim=0) / self.layer_weights.sum()

        # masked mean over the token axis
        mask = attention_mask.unsqueeze(-1).expand(blended.size()).float()
        token_sum = (blended * mask).sum(1)
        token_count = mask.sum(1).clamp(min=1e-9)  # avoid dividing by zero
        return token_sum / token_count

# Attention pooling
class AttentionPooling(nn.Module):
    """
    Attention pooling: a small scoring network assigns one score per token,
    the scores are softmax-normalised over the sequence axis (masked tokens
    get -inf, i.e. zero weight) and the token embeddings are summed with
    those weights.

    [Reference]
    <A STRUCTURED SELF-ATTENTIVE SENTENCE EMBEDDING>
    """
    def __init__(self, auto_cfg) -> None:
        super().__init__()
        width = auto_cfg.hidden_size
        self.attention = nn.Sequential(
            nn.Linear(width, width),
            nn.LayerNorm(width),
            nn.GELU(),
            nn.Linear(width, 1),
        )

    def forward(self, last_hidden_state, attention_mask) -> Tensor:
        scores = self.attention(last_hidden_state).float()
        # tokens with mask == 0 must not receive any attention weight
        is_padding = (attention_mask == 0).unsqueeze(-1)
        scores = scores.masked_fill(is_padding, float('-inf'))
        token_weights = torch.softmax(scores, 1)
        return (token_weights * last_hidden_state).sum(dim=1)

# Generalized Mean (GeM) Pooling
class GEMPooling(nn.Module):
    """
    Generalized Mean (GeM) Pooling for token embeddings, ported from the
    computer-vision formulation.

    Every (masked) token embedding is raised to the power p, averaged over the
    unmasked tokens and the p-th root of that average is returned, so larger
    activations weigh more than in plain mean pooling.

    p defaults to 4 rather than the 3 used in the original paper: with an even
    exponent the powered values are never negative, so the fractional root at
    the end is always defined.
    [Reference]
    https://paperswithcode.com/method/generalized-mean-pooling
    """
    def __init__(self, auto_cfg) -> None:
        # auto_cfg is accepted for interface parity with the other pooling classes; it is not used
        super().__init__()

    @staticmethod
    def forward(last_hidden_state, attention_mask, p: int = 4) -> Tensor:
        """
        1) Expand the attention mask to the shape of the hidden states
        2) Zero out masked tokens, raise to the power p and sum over the token axis
        3) Count the unmasked tokens (clamped so the division is always defined)
        4) Divide and take the p-th root
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        powered_sum = (last_hidden_state * mask).pow(p).sum(1)
        token_count = mask.sum(1).clamp(min=1e-9)
        return (powered_sum / token_count).pow(1/p)
    
# Batch size of the embedding DataLoaders created at module level in this cell.
BATCH_SIZE = 4

class EmbedDataset(torch.utils.data.Dataset):
    """
    Dataset that tokenizes the "full_text" column of a DataFrame on access.

    Tokenization uses the module-level `tokenizer` and `MAX_LEN`, which are
    looked up at call time; both must be set before an item is requested.
    """
    def __init__(self,df):
        # positional 0..n-1 index so integer item indices address rows directly
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self,idx):
        essay = self.df.loc[idx, "full_text"]
        encoded = tokenizer(
            essay,
            None,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading axis of size 1; drop it per field
        return {name: tensor.squeeze(0) for name, tensor in encoded.items()}

# Embedding loaders over the train and test frames. They are created once,
# with the module-level BATCH_SIZE and without shuffling, and are shared by
# the *_get_embeddings functions below.
ds_tr = EmbedDataset(dftr)
embed_dataloader_tr = torch.utils.data.DataLoader(ds_tr,\
                        batch_size=BATCH_SIZE,\
                        shuffle=False)
ds_te = EmbedDataset(dfte)
embed_dataloader_te = torch.utils.data.DataLoader(ds_te,\
                        batch_size=BATCH_SIZE,\
                        shuffle=False)

# Mutable module-level state:
#   train_config - when False, the *_get_embeddings functions skip the train loader
#   tokenizer    - read by EmbedDataset.__getitem__; set by each *_get_embeddings call
#   MAX_LEN      - padding/truncation length read by EmbedDataset.__getitem__;
#                  overwritten by each *_get_embeddings call
train_config = False
tokenizer = None
MAX_LEN = 640

def meanpooling_get_embeddings(finetuned_weight=None, MODEL_NM='', MAX=640, BATCH_SIZE=4, verbose=True):
    """
    Embed the test set with a backbone + mean pooling and L2-normalise the rows.

    Side effects: the module-level `tokenizer` and `MAX_LEN` are overwritten
    (EmbedDataset reads them when the loaders are iterated).

    Args:
        finetuned_weight: optional checkpoint path loaded into the backbone
        MODEL_NM: path/name of the backbone and its tokenizer
        MAX: token length used for padding/truncation
        BATCH_SIZE: kept for interface compatibility; unused, because the
            loaders are built at module level with the global BATCH_SIZE
        verbose: print the shape of the computed embedding matrices

    Returns:
        numpy array with one embedding row per test sample. When the
        module-level `train_config` is truthy the train embeddings are
        computed (and their shape printed) as well, but not returned.
    """
    global tokenizer, MAX_LEN, train_config
    # Same fallback as the CFG classes use for their `device`.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    auto_cfg = AutoConfig.from_pretrained(MODEL_NM, output_hidden_states=True)
    model = AutoModel.from_pretrained(MODEL_NM, config=auto_cfg)
    tokenizer = AutoTokenizer.from_pretrained( MODEL_NM )
    MAX_LEN = MAX
    if finetuned_weight is not None:
        # NOTE(review): strict=False silently skips keys that do not match the
        # bare AutoModel -- verify the checkpoint keys are not prefixed.
        model.load_state_dict(torch.load(finetuned_weight, map_location=DEVICE), strict=False)
    model = model.to(DEVICE)
    model.eval()

    def _embed(loader):
        """ One normalised embedding row per sample of `loader` """
        feats = []
        for batch in tqdm(loader, total=len(loader)):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            with torch.no_grad():
                model_output = model(input_ids=input_ids, attention_mask=attention_mask)
            sentence_embeddings = mean_pooling(model_output, attention_mask.detach().cpu())
            # Normalize the embeddings
            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
            # Keep the batch axis even for a batch of one sample: squeezing it
            # would make extend() append hidden_size scalars instead of one row.
            feats.extend(sentence_embeddings.detach().cpu().numpy())
        return np.array(feats)

    if train_config:
        all_train_text_feats = _embed(embed_dataloader_tr)
        if verbose:
            print('Train embeddings shape',all_train_text_feats.shape)

    te_text_feats = _embed(embed_dataloader_te)
    if verbose:
        print('Test embeddings shape',te_text_feats.shape)

    return te_text_feats

def weightedlayer_get_embeddings(finetuned_weight=None, MODEL_NM='', MAX=640, BATCH_SIZE=4, verbose=True):
    """
    Embed the test set with a backbone + WeightedLayerPooling (default
    layer_start) and L2-normalise the rows.

    Side effects: the module-level `tokenizer` and `MAX_LEN` are overwritten
    (EmbedDataset reads them when the loaders are iterated).

    Args:
        finetuned_weight: optional checkpoint path loaded into the backbone
        MODEL_NM: path/name of the backbone and its tokenizer
        MAX: token length used for padding/truncation
        BATCH_SIZE: kept for interface compatibility; unused, because the
            loaders are built at module level with the global BATCH_SIZE
        verbose: print the shape of the computed embedding matrices

    Returns:
        numpy array with one embedding row per test sample. When the
        module-level `train_config` is truthy the train embeddings are
        computed (and their shape printed) as well, but not returned.
    """
    global tokenizer, MAX_LEN, train_config
    # Same fallback as the CFG classes use for their `device`.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    auto_cfg = AutoConfig.from_pretrained(MODEL_NM, output_hidden_states=True)
    model = AutoModel.from_pretrained(MODEL_NM, config=auto_cfg)
    tokenizer = AutoTokenizer.from_pretrained( MODEL_NM )
    WeightedLayerPool = WeightedLayerPooling(auto_cfg)
    MAX_LEN = MAX

    if finetuned_weight is not None:
        # NOTE(review): strict=False silently skips keys that do not match the
        # bare AutoModel -- verify the checkpoint keys are not prefixed.
        model.load_state_dict(torch.load(finetuned_weight, map_location=DEVICE), strict=False)

    model = model.to(DEVICE)
    model.eval()

    def _embed(loader):
        """ One normalised embedding row per sample of `loader` """
        feats = []
        for batch in tqdm(loader, total=len(loader)):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            with torch.no_grad():
                model_output = model(input_ids=input_ids, attention_mask=attention_mask)
            sentence_embeddings = WeightedLayerPool(
                model_output.hidden_states,
                attention_mask.detach().cpu()
            )
            # Normalize the embeddings
            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
            # Keep the batch axis even for a batch of one sample: squeezing it
            # would make extend() append hidden_size scalars instead of one row.
            feats.extend(sentence_embeddings.detach().cpu().numpy())
        return np.array(feats)

    if train_config:
        all_train_text_feats = _embed(embed_dataloader_tr)
        if verbose:
            print('Train embeddings shape',all_train_text_feats.shape)

    te_text_feats = _embed(embed_dataloader_te)
    if verbose:
        print('Test embeddings shape',te_text_feats.shape)

    return te_text_feats

def four_weightedlayer_get_embeddings(finetuned_weight=None, MODEL_NM='', MAX=640, BATCH_SIZE=4, verbose=True):
    """
    Embed the test set with a backbone + WeightedLayerPooling(layer_start=4)
    and L2-normalise the rows.

    Side effects: the module-level `tokenizer` and `MAX_LEN` are overwritten
    (EmbedDataset reads them when the loaders are iterated).

    Args:
        finetuned_weight: optional checkpoint path loaded into the backbone
        MODEL_NM: path/name of the backbone and its tokenizer
        MAX: token length used for padding/truncation
        BATCH_SIZE: kept for interface compatibility; unused, because the
            loaders are built at module level with the global BATCH_SIZE
        verbose: print the shape of the computed embedding matrices

    Returns:
        numpy array with one embedding row per test sample. When the
        module-level `train_config` is truthy the train embeddings are
        computed (and their shape printed) as well, but not returned.
    """
    global tokenizer, MAX_LEN, train_config
    # Same fallback as the CFG classes use for their `device`.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    auto_cfg = AutoConfig.from_pretrained(MODEL_NM, output_hidden_states=True)
    model = AutoModel.from_pretrained(MODEL_NM, config=auto_cfg)
    tokenizer = AutoTokenizer.from_pretrained( MODEL_NM )
    WeightedLayerPool = WeightedLayerPooling(auto_cfg, layer_start=4)
    MAX_LEN = MAX

    if finetuned_weight is not None:
        # NOTE(review): strict=False silently skips keys that do not match the
        # bare AutoModel -- verify the checkpoint keys are not prefixed.
        model.load_state_dict(torch.load(finetuned_weight, map_location=DEVICE), strict=False)

    model = model.to(DEVICE)
    model.eval()

    def _embed(loader):
        """ One normalised embedding row per sample of `loader` """
        feats = []
        for batch in tqdm(loader, total=len(loader)):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            with torch.no_grad():
                model_output = model(input_ids=input_ids, attention_mask=attention_mask)
            sentence_embeddings = WeightedLayerPool(
                model_output.hidden_states,
                attention_mask.detach().cpu()
            )
            # Normalize the embeddings
            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
            # Keep the batch axis even for a batch of one sample: squeezing it
            # would make extend() append hidden_size scalars instead of one row.
            feats.extend(sentence_embeddings.detach().cpu().numpy())
        return np.array(feats)

    if train_config:
        all_train_text_feats = _embed(embed_dataloader_tr)
        if verbose:
            print('Train embeddings shape',all_train_text_feats.shape)

    te_text_feats = _embed(embed_dataloader_te)
    if verbose:
        print('Test embeddings shape',te_text_feats.shape)

    return te_text_feats

def gempooling_get_embeddings(finetuned_weight=None, MODEL_NM='', MAX=640, BATCH_SIZE=4, verbose=True):
    """Return L2-normalised, GEM-pooled embeddings for the global test dataloader.

    The config, backbone and tokenizer stored at ``MODEL_NM`` are loaded, the
    weights in ``finetuned_weight`` are optionally laid over the backbone, and
    every batch of the module-level ``embed_dataloader_te`` is passed through
    the backbone. ``last_hidden_state`` is pooled on the CPU by a freshly
    constructed ``GEMPooling`` module and each row is L2-normalised.

    Side effects:
        Rebinds the module-level ``tokenizer`` and ``MAX_LEN``
        (presumably read by the dataset behind the global dataloaders --
        TODO confirm).

    Args:
        finetuned_weight: Checkpoint handed to ``torch.load``; ``None`` keeps
            the weights loaded from ``MODEL_NM``.
        MODEL_NM: Location passed to the ``from_pretrained`` calls.
        MAX: Value stored in the global ``MAX_LEN``.
        BATCH_SIZE: Not read by this function (the dataloaders are globals);
            kept so existing callers keep working.
        verbose: Print the shape of the computed embedding arrays.

    Returns:
        ``np.ndarray`` holding one embedding row per test sample.
    """
    global tokenizer, MAX_LEN, train_config
    DEVICE="cuda"
    auto_cfg = AutoConfig.from_pretrained(MODEL_NM, output_hidden_states=True)
    model = AutoModel.from_pretrained(MODEL_NM, config=auto_cfg)
    tokenizer = AutoTokenizer.from_pretrained( MODEL_NM )
    # NOTE(review): the pooling module is created here and nothing in this
    # function loads weights into it; only the backbone receives the checkpoint.
    GEMPool = GEMPooling(auto_cfg)
    MAX_LEN = MAX

    if finetuned_weight is not None:
        # The model is still on the CPU at this point, so the checkpoint is
        # materialised there too instead of parking a second copy on the GPU.
        state_dict = torch.load(finetuned_weight, map_location='cpu')
        # strict=False ignores non-matching keys without raising; report them so
        # a checkpoint that matched nothing does not go unnoticed.
        load_result = model.load_state_dict(state_dict, strict=False)
        if verbose and (load_result.missing_keys or load_result.unexpected_keys):
            print('load_state_dict(strict=False):',
                  len(load_result.missing_keys), 'missing /',
                  len(load_result.unexpected_keys), 'unexpected keys')

    model = model.to(DEVICE)
    model.eval()

    def _pooled_embeddings(loader):
        """Run every batch of ``loader`` through backbone + pooling; one row per sample."""
        rows = []
        for batch in tqdm(loader,total=len(loader)):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            with torch.no_grad():
                model_output = model(input_ids=input_ids,attention_mask=attention_mask)
                pooled = GEMPool(
                    model_output.last_hidden_state.detach().cpu(),
                    attention_mask.detach().cpu()
                )
                # Normalize the embeddings
                pooled = F.normalize(pooled, p=2, dim=1)
            # No squeeze(0) here: for a one-sample batch it would drop the batch
            # axis and extend() would then append the individual floats of that
            # embedding instead of a single row.
            rows.extend(pooled.detach().cpu().numpy())
        return np.array(rows)

    if train_config:
        # NOTE(review): the train embeddings are computed but neither returned
        # nor stored globally; only their shape is reported.
        all_train_text_feats = _pooled_embeddings(embed_dataloader_tr)
        if verbose:
            print('Train embeddings shape',all_train_text_feats.shape)

    te_text_feats = _pooled_embeddings(embed_dataloader_te)
    if verbose:
        print('Test embeddings shape',te_text_feats.shape)

    return te_text_feats

def attentionpooling_get_embeddings(finetuned_weight=None, MODEL_NM='', MAX=640, BATCH_SIZE=4, verbose=True):
    """Return L2-normalised, attention-pooled embeddings for the global test dataloader.

    The config, backbone and tokenizer stored at ``MODEL_NM`` are loaded, the
    weights in ``finetuned_weight`` are optionally laid over the backbone, and
    every batch of the module-level ``embed_dataloader_te`` is passed through
    the backbone. ``last_hidden_state`` is pooled on the CPU by a freshly
    constructed ``AttentionPooling`` module and each row is L2-normalised.

    Side effects:
        Rebinds the module-level ``tokenizer`` and ``MAX_LEN``
        (presumably read by the dataset behind the global dataloaders --
        TODO confirm).

    Args:
        finetuned_weight: Checkpoint handed to ``torch.load``; ``None`` keeps
            the weights loaded from ``MODEL_NM``.
        MODEL_NM: Location passed to the ``from_pretrained`` calls.
        MAX: Value stored in the global ``MAX_LEN``.
        BATCH_SIZE: Not read by this function (the dataloaders are globals);
            kept so existing callers keep working.
        verbose: Print the shape of the computed embedding arrays.

    Returns:
        ``np.ndarray`` holding one embedding row per test sample.
    """
    global tokenizer, MAX_LEN, train_config
    DEVICE="cuda"
    auto_cfg = AutoConfig.from_pretrained(MODEL_NM, output_hidden_states=True)
    model = AutoModel.from_pretrained(MODEL_NM, config=auto_cfg)
    tokenizer = AutoTokenizer.from_pretrained( MODEL_NM )
    # NOTE(review): the pooling module is created here and nothing in this
    # function loads weights into it; whether its output is reproducible
    # between runs depends on how AttentionPooling initialises itself -- confirm.
    AttentionPool = AttentionPooling(auto_cfg)
    MAX_LEN = MAX

    if finetuned_weight is not None:
        # The model is still on the CPU at this point, so the checkpoint is
        # materialised there too instead of parking a second copy on the GPU.
        state_dict = torch.load(finetuned_weight, map_location='cpu')
        # strict=False ignores non-matching keys without raising; report them so
        # a checkpoint that matched nothing does not go unnoticed.
        load_result = model.load_state_dict(state_dict, strict=False)
        if verbose and (load_result.missing_keys or load_result.unexpected_keys):
            print('load_state_dict(strict=False):',
                  len(load_result.missing_keys), 'missing /',
                  len(load_result.unexpected_keys), 'unexpected keys')

    model = model.to(DEVICE)
    model.eval()

    def _pooled_embeddings(loader):
        """Run every batch of ``loader`` through backbone + pooling; one row per sample."""
        rows = []
        for batch in tqdm(loader,total=len(loader)):
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            with torch.no_grad():
                model_output = model(input_ids=input_ids,attention_mask=attention_mask)
                pooled = AttentionPool(
                    model_output.last_hidden_state.detach().cpu(),
                    attention_mask.detach().cpu()
                )
                # Normalize the embeddings
                pooled = F.normalize(pooled, p=2, dim=1)
            # No squeeze(0) here: for a one-sample batch it would drop the batch
            # axis and extend() would then append the individual floats of that
            # embedding instead of a single row.
            rows.extend(pooled.detach().cpu().numpy())
        return np.array(rows)

    if train_config:
        # NOTE(review): the train embeddings are computed but neither returned
        # nor stored globally; only their shape is reported.
        all_train_text_feats = _pooled_embeddings(embed_dataloader_tr)
        if verbose:
            print('Train embeddings shape',all_train_text_feats.shape)

    te_text_feats = _pooled_embeddings(embed_dataloader_te)
    if verbose:
        print('Test embeddings shape',te_text_feats.shape)

    return te_text_feats

# Location of the pretrained model directory.
# NOTE(review): MODEL_NM and FineTunedModel are both assigned again at the top
# of the following cell (with a different MODEL_NM path) before any use that is
# visible in this part of the notebook -- confirm whether this copy is needed.
MODEL_NM = '../input/deberta-v3-large/deberta-v3-large'
class FineTunedModel:
    """Lists of ``*.pth`` checkpoint paths found by ``glob`` at class-definition time."""
    # Each attribute is the list returned by glob.glob for one dataset folder;
    # a folder that is missing yields an empty list, not an error.
    MeanPool_Model = glob.glob('/kaggle/input/0927-deberta-v3-large-unscale/*.pth')
    GEMPool_Model = glob.glob('/kaggle/input/fbp3-gempooling-max-len-1536-04476/*.pth')
    WeightedLayer_Model = glob.glob('/kaggle/input/fbp3-weightedlayerpooling-04545/*.pth')

Train shape: (3911, 9) Test shape: (3, 3) Test columns: Index(['text_id', 'full_text', 'src'], dtype='object')
Train samples per fold:


In [14]:
# ---- Train stage: shared constants --------------------------------------
target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
MODEL_NM = '/kaggle/input/huggingface-automodel-save/deberta-v3-large'


class FineTunedModel:
    """Lists of fine-tuned ``*.pth`` checkpoint paths, collected with ``glob``."""
    MeanPool_Model = glob.glob('/kaggle/input/0927-deberta-v3-large-unscale/*.pth')
    GEMPool_Model = glob.glob('/kaggle/input/fbp3-gempooling-max-len-1536-04476/*.pth')
    WeightedLayer_Model = glob.glob('/kaggle/input/fbp3-weightedlayerpooling-04545/*.pth')


# ---- Test embeddings from the backbone without a fine-tuned checkpoint ----
# The calls stay in this order: each helper rebinds the global tokenizer and
# MAX_LEN before it iterates the dataloader.
te_text_feats1 = meanpooling_get_embeddings(MODEL_NM=MODEL_NM)
te_text_feats2 = weightedlayer_get_embeddings(MODEL_NM=MODEL_NM)
te_text_feats3 = four_weightedlayer_get_embeddings(MODEL_NM=MODEL_NM)
te_text_feats4 = gempooling_get_embeddings(MODEL_NM=MODEL_NM)
te_text_feats5 = attentionpooling_get_embeddings(MODEL_NM=MODEL_NM, MAX=512)

# ---- Baseline train embeddings -------------------------------------------
# The train-side matrix is read from a saved .npy file rather than being
# concatenated from freshly computed per-pooling train embeddings.
all_train_text_feats = np.load('/kaggle/input/fbp3-svr-train-embeddings/baseline_train_embedding.npy')

# Test-side matrix: the five pooling variants side by side (column-wise).
te_text_feats = np.concatenate(
    (te_text_feats1, te_text_feats2, te_text_feats3, te_text_feats4, te_text_feats5),
    axis=1,
)

# Variants 2-4 are rebuilt later in this cell; 1 and 5 are kept for reuse.
del te_text_feats2, te_text_feats3, te_text_feats4

# ---- Inference: SVR heads on the concatenated embeddings -------------------
# Stage 1 and stage 2 share the helpers below; previously the scoring function
# and the whole fold loop were pasted once per stage.
from sklearn.metrics import mean_squared_error


def comp_score(y_true,y_pred):
    """Mean of the per-column RMSEs over the first ``len(target_cols)`` columns."""
    rmse_scores = [
        np.sqrt(mean_squared_error(y_true[:,i],y_pred[:,i]))
        for i in range(len(target_cols))
    ]
    return np.mean(rmse_scores)


def run_svr_cv(train_feats, test_feats):
    """Fit one ``SVR(C=1)`` per target and fold; return ``(test_preds_per_fold, scores)``.

    For every ``fold`` in ``range(FOLDS)`` the rows of the global ``dftr`` whose
    ``FOLD`` differs from ``fold`` are used for fitting and the remaining rows
    for scoring with ``comp_score``. Each fitted model also predicts
    ``test_feats``.

    ``train_feats`` is indexed with the index labels of ``dftr``
    (NOTE(review): this assumes ``dftr`` carries a positional 0..n-1 index --
    confirm where ``dftr`` is built).
    """
    # Fail before the first (slow) fit when the two matrices cannot be used
    # with the same model.
    assert train_feats.shape[1] == test_feats.shape[1], (
        "train/test embedding width mismatch: "
        f"{train_feats.shape[1]} vs {test_feats.shape[1]}"
    )
    n_targets = len(target_cols)  # was a hard-coded 6
    fold_preds = []
    fold_scores = []
    for fold in range(FOLDS):
        print('#'*25)
        print('### Fold',fold+1)
        print('#'*25)

        dftr_ = dftr[dftr["FOLD"]!=fold]
        dfev_ = dftr[dftr["FOLD"]==fold]

        tr_text_feats = train_feats[list(dftr_.index),:]
        ev_text_feats = train_feats[list(dfev_.index),:]

        ev_preds = np.zeros((len(ev_text_feats),n_targets))
        test_preds = np.zeros((len(test_feats),n_targets))
        for i,t in enumerate(target_cols):
            print(t,', ',end='')
            clf = SVR(C=1)
            clf.fit(tr_text_feats, dftr_[t].values)
            ev_preds[:,i] = clf.predict(ev_text_feats)
            test_preds[:,i] = clf.predict(test_feats)
        print()
        score = comp_score(dfev_[target_cols].values,ev_preds)
        fold_scores.append(score)
        print("Fold : {} RSME score: {}".format(fold,score))
        fold_preds.append(test_preds)

    print('#'*25)
    print('Overall CV RSME =',np.mean(fold_scores))
    return fold_preds, fold_scores


def write_svr_submission(fold_preds, csv_path):
    """Average the per-fold test predictions, write them to ``csv_path``, return the frame."""
    sub = dfte.copy()
    sub.loc[:,target_cols] = np.average(np.array(fold_preds),axis=0) #,weights=[1/s for s in scores]
    sub_columns = pd.read_csv("../input/feedback-prize-english-language-learning/sample_submission.csv").columns
    sub = sub[sub_columns]
    sub.to_csv(csv_path,index=None)
    return sub


def average_finetuned_embeddings(embed_fn, checkpoints):
    """Average the test embeddings ``embed_fn`` produces for each checkpoint path."""
    checkpoints = list(checkpoints)
    if not checkpoints:
        # Previously an empty glob left a scalar 0.0 behind, which only failed
        # later inside np.concatenate with an unrelated-looking message.
        raise ValueError("no fine-tuned checkpoints found for " + embed_fn.__name__)
    total = 0
    for checkpoint in tqdm(checkpoints):
        total += embed_fn(checkpoint, MODEL_NM=MODEL_NM)
    # Divide by the number of checkpoints actually loaded (was a hard-coded 5).
    return total / len(checkpoints)


# ---- Stage 1: embeddings from the backbone without a fine-tuned checkpoint ----
# NOTE(review): the ensemble cell further down has the lines that read
# submission_15.csv / submission_16.csv commented out.
preds, scores = run_svr_cv(all_train_text_feats, te_text_feats)
sub = write_svr_submission(preds, "submission_15.csv")

# ---- Stage 2: variants 2-4 recomputed with fine-tuned checkpoints ---------
te_text_feats2 = average_finetuned_embeddings(
    weightedlayer_get_embeddings, FineTunedModel.WeightedLayer_Model
)
te_text_feats3 = average_finetuned_embeddings(
    four_weightedlayer_get_embeddings, FineTunedModel.WeightedLayer_Model
)
# NOTE(review): the GEM-pooling variant is fed the WeightedLayer checkpoints
# while FineTunedModel.GEMPool_Model is never used. This looks like a
# copy-paste slip, but it is kept as is because the saved train_embedding.npy
# must have been built with the same choice -- confirm before switching.
te_text_feats4 = average_finetuned_embeddings(
    gempooling_get_embeddings, FineTunedModel.WeightedLayer_Model
)

all_train_text_feats = np.load('/kaggle/input/fbp3-svr-train-embeddings/train_embedding.npy')
te_text_feats = np.concatenate([
    te_text_feats1,
    te_text_feats2,
    te_text_feats3,
    te_text_feats4,
    te_text_feats5]
, axis=1)

preds, scores = run_svr_cv(all_train_text_feats, te_text_feats)
sub = write_svr_submission(preds, "submission_16.csv")

del te_text_feats2
del te_text_feats3
del te_text_feats4
gc.collect()

print('Our concatenated embeddings have shape', all_train_text_feats.shape )

  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)


  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)


  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)


  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)


  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)
#########################
### Fold 1
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , 
Fold : 0 RSME score: 0.45455565750401217
#########################
### Fold 2
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , 
Fold : 1 RSME score: 0.45118534040919106
#########################
### Fold 3
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , 
Fold : 2 RSME score: 0.46060901547360866
#########################
### Fold 4
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , 
Fold : 3 RSME score: 0.445561795011782
#########################
### Fold 5
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , 
Fold : 4 RSME score: 0.4458096112597782
#########################
### Fold 6
#########################
cohesion , syntax , vocabula

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)


  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)


  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)


  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)


  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)


  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)


  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)


  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)


  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)


  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)


  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)


  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)


  0%|          | 0/1 [00:00<?, ?it/s]

Test embeddings shape (3, 1024)
#########################
### Fold 1
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , 
Fold : 0 RSME score: 0.4546352449487106
#########################
### Fold 2
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , 
Fold : 1 RSME score: 0.45149397206156044
#########################
### Fold 3
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , 
Fold : 2 RSME score: 0.4605507540373644
#########################
### Fold 4
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , 
Fold : 3 RSME score: 0.44575200847249546
#########################
### Fold 5
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , 
Fold : 4 RSME score: 0.44576194994566976
#########################
### Fold 6
#########################
cohesion , syntax , vocabul

In [None]:
# ---- Inference stage ------------------------------------------------------
# NOTE(review): this cell reads all_train_text_feats / te_text_feats as left
# behind by whichever cell ran before it, and it writes submission_9.csv, a
# file name the ensemble cell reads for CFG9 -- confirm this is intended
# before running it.
from sklearn.metrics import mean_squared_error


def comp_score(y_true,y_pred):
    """Mean of the per-column RMSEs over the first ``len(target_cols)`` columns."""
    rmse_scores = [
        np.sqrt(mean_squared_error(y_true[:,i],y_pred[:,i]))
        for i in range(len(target_cols))
    ]
    return np.mean(rmse_scores)


# Fail before the first (slow) fit when the two matrices cannot be used with
# the same model.
assert all_train_text_feats.shape[1] == te_text_feats.shape[1], (
    "train/test embedding width mismatch: "
    f"{all_train_text_feats.shape[1]} vs {te_text_feats.shape[1]}"
)

n_targets = len(target_cols)  # was a hard-coded 6
preds = []
scores = []

for fold in range(FOLDS):
    print('#'*25)
    print('### Fold',fold+1)
    print('#'*25)

    # Rows outside the current fold are fitted on, rows inside it are scored.
    dftr_ = dftr[dftr["FOLD"]!=fold]
    dfev_ = dftr[dftr["FOLD"]==fold]

    tr_text_feats = all_train_text_feats[list(dftr_.index),:]
    ev_text_feats = all_train_text_feats[list(dfev_.index),:]

    ev_preds = np.zeros((len(ev_text_feats),n_targets))
    test_preds = np.zeros((len(te_text_feats),n_targets))
    # One independent SVR per target column.
    for i,t in enumerate(target_cols):
        print(t,', ',end='')
        clf = SVR(C=1)
        clf.fit(tr_text_feats, dftr_[t].values)
        ev_preds[:,i] = clf.predict(ev_text_feats)
        test_preds[:,i] = clf.predict(te_text_feats)
    print()
    score = comp_score(dfev_[target_cols].values,ev_preds)
    scores.append(score)
    print("Fold : {} RSME score: {}".format(fold,score))
    preds.append(test_preds)

print('#'*25)
print('Overall CV RSME =',np.mean(scores))

# Unweighted mean of the per-fold test predictions, written in the column
# order of the sample submission.
sub = dfte.copy()

sub.loc[:,target_cols] = np.average(np.array(preds),axis=0) #,weights=[1/s for s in scores]
sub_columns = pd.read_csv("../input/feedback-prize-english-language-learning/sample_submission.csv").columns
sub = sub[sub_columns]
sub.to_csv("submission_9.csv",index=None)

In [15]:
# ---- Weighted ensemble of the per-config submission files -----------------
test = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')
submission = pd.read_csv('../input/feedback-prize-english-language-learning/sample_submission.csv')

# One entry per ensemble member: (csv file, target columns, weight).
# submission_<k>.csv belongs to CFG<k>; keeping the members in a single list
# means the weighted sum and the sum of weights can no longer drift apart.
ensemble_cfgs = [
    CFG1, CFG2, CFG3, CFG4, CFG5, CFG6, CFG7,
    CFG8, CFG9, CFG10, CFG11, CFG12, CFG13, CFG14,
]
ensemble_members = [
    (f'submission_{idx}.csv', cfg.target_cols, cfg.weight)
    for idx, cfg in enumerate(ensemble_cfgs, start=1)
]
# The SVR submissions are currently left out of the blend; to include them:
# ensemble_members += [
#     ('submission_15.csv', CFG1.target_cols, 1.0),
#     ('submission_16.csv', CFG1.target_cols, 1.0),
# ]

# Accumulate in list order (explicit loop, same left-to-right additions as the
# hand-written formula this replaces).
ens = None
total_weight = 0.0
for csv_path, cols, weight in ensemble_members:
    weighted = pd.read_csv(csv_path)[cols] * weight
    ens = weighted if ens is None else ens + weighted
    total_weight += weight
ens = ens / total_weight

# The assignment below aligns on the row index, so a member file with a
# different number of rows would silently leave NaNs in the submission.
assert len(ens) == len(submission), "ensemble and sample submission differ in row count"

submission[CFG1.target_cols] = ens
display(submission.head())
submission.to_csv('submission.csv', index=False)

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.890979,2.758132,3.062877,2.927794,2.689081,2.664978
1,000BAD50D026,2.664568,2.477626,2.723253,2.404711,2.162099,2.641702
2,00367BB2546B,3.535244,3.381122,3.574189,3.56717,3.406775,3.299109


In [None]:
# """ Result Weight Tuning """
# # Final_Score = (W1 * R1) + (W2 * R2) + (W3 * R3) + (W4 * R4) + (W5 * R5)
# final_submission = sample_submission_df.copy()
# CFG1_submission = pd.read_csv('/kaggle/working/CFG1_submission.csv')
# CFG2_submission = pd.read_csv('/kaggle/working/CFG2_submission.csv')
# #CFG3_submission = pd.read_csv('/kaggle/working/CFG3_submission.csv')
# #CFG4_submission = pd.read_csv('/kaggle/working/CFG4_submission.csv')
# #CFG5_submission = pd.read_csv('/kaggle/working/CFG5_submission.csv')

# result_list = [
#     CFG1_submission,
#     CFG2_submission,
# #    CFG3_submission,
# #    CFG4_submission,
# #    CFG5_submission
# ]

# final_predictions = np.zeros((len(sample_submission_df), len(sample_submission_df.iloc[0,1:])))
# for result in result_list:
#     tmp_predictions = np.array(result.iloc[:, 1:7])
#     final_predictions = np.add(final_predictions, tmp_predictions)
# final_predictions = final_predictions * 1/len(result_list)
# final_submission[['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']] = final_predictions
# final_submission.reset_index(drop=True)
# final_submission.to_csv('submission.csv', index=False)