In [None]:
!pip install wandb
!pip install transformers
!pip install sentencepiece # for auto-tokenizers

In [None]:
import wandb
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim.swa_utils as swa
import tokenizers, transformers
import os, sys, gc, time, random, warnings, math

from transformers import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint

from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup, DataCollatorWithPadding
from sklearn.model_selection import StratifiedGroupKFold
from tqdm.auto import tqdm
from glob import glob
warnings.filterwarnings("ignore")
%env TOKENIZERS_PARALLELISM=false
%matplotlib inline

In [None]:
# WandB login.
# The API key is read from the WANDB_API_KEY environment variable so that it is never
# stored in the notebook or exposed on a shell command line. If the variable is unset,
# `key` is None and wandb falls back to its own credential lookup / prompt.
# NOTE(review): a key was previously committed in this cell; it should be revoked.
secret_value_0 = os.environ.get('WANDB_API_KEY')
wandb.login(key=secret_value_0)

In [None]:
""" Train Configuration Class """

class CFG1:
    """Training configuration for the re-ranker run; every setting is a class attribute."""

    # --- run bookkeeping ---
    wandb = True
    train = True
    competition = 'LECR'
    seed = 42
    cfg_name = 'CFG1'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    gpu_id = 0

    # --- model / tokenizer ---
    reranker = 'paraphrase-multilingual-mpnet-base-v2-exp_fold0_epochs10'
    reranker_tokenizer = AutoTokenizer.from_pretrained(reranker + '/tokenizer')
    pooling = 'attention'  # options: attention, mean, weightedlayer, concat
    max_len = 256
    gradient_checkpointing = True

    # --- data / loop ---
    n_folds = 5  # cross val
    epochs = 1
    batch_size = 128  # 64 to 128
    num_workers = 0

    # --- loss / optimizer ---
    loss_fn = 'BCE'  # options: BCE, RMSE
    optimizer = 'AdamW'  # options: SWA, AdamW
    weight_decay = 1e-6
    encoder_lr = 5e-5
    decoder_lr = 5e-5
    min_lr = 1e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    max_grad_norm = 1.0  # clip_grad_norm
    amp_scaler = True

    # --- scheduler ---
    scheduler = 'cosine'  # options: cosine, linear
    num_cycles = 0.5
    warmup_ratio = 0.1
    batch_scheduler = True

    # --- layer-wise learning-rate decay ---
    llrd = True
    layerwise_lr = 5e-5
    layerwise_lr_decay = 0.9
    layerwise_weight_decay = 0.01
    layerwise_adam_epsilon = 1e-6
    layerwise_use_bertadam = False

In [None]:
""" Configuration class to dict """

def class2dict(f):
    """Return a dict of every attribute of ``f`` whose name does not start with ``__``."""
    visible = [attr for attr in dir(f) if not attr.startswith('__')]
    return {attr: getattr(f, attr) for attr in visible}

In [None]:
""" pytorch reproducibility functions """

def all_type_seed(CFG):
    """Seed every random number generator used in this notebook and pin cuDNN behaviour.

    Args:
        CFG: configuration object exposing an integer ``seed`` attribute.
    """
    seed = CFG.seed
    os.environ['PYTHONHASHSEED'] = str(seed)  # Python hash seed
    random.seed(seed)                         # `random` module
    np.random.seed(seed)                      # NumPy global RNG
    torch.manual_seed(seed)                   # PyTorch CPU RNG
    torch.cuda.manual_seed(seed)              # PyTorch RNG of the current GPU
    torch.cuda.manual_seed_all(seed)          # PyTorch RNG of every GPU

    # Trade speed for reproducibility in the cuDNN backend.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False
    
def seed_worker(worker_id):
    """DataLoader ``worker_init_fn``: reseed NumPy and ``random`` from torch's initial seed."""
    derived_seed = torch.initial_seed() % (2 ** 32)  # fold into the 32-bit range
    for reseed in (np.random.seed, random.seed):
        reseed(derived_seed)
    
# Seed all RNGs once, using the seed stored on CFG1.
all_type_seed(CFG1)
# Dedicated torch generator; TrainInput.make_batch passes it to both DataLoaders.
# Note it is seeded with 0, not with CFG1.seed.
g = torch.Generator()
g.manual_seed(0)

In [None]:
""" Load Data & Preprocess Function """

# Table merged on 'topic_id' inside get_best_threshold; its 'content_ids' column is
# passed to f2_score as y_true.
correlations = pd.read_csv('paraphrase-multilingual-mpnet-base-v2_kfold_0.csv')
# CSV path that TrainInput.__init__ hands to load_data.
dataset = f'CFG1_train.csv'
def load_data(dataset):
    """Read the training CSV and build the ``text`` / ``target`` columns used downstream.

    Args:
        dataset: path (or any source accepted by ``pd.read_csv``) of a CSV containing
            at least the columns ``title1``, ``title2`` and ``target``.

    Returns:
        pd.DataFrame: the loaded frame with missing titles replaced by ``""``, a ``text``
        column equal to ``title1 + '[SEP]' + title2``, ``target`` cast to float and the
        index reset to 0..n-1.
    """
    df = pd.read_csv(dataset)

    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the in-place form operates on an intermediate object and does not update `df`
    # when pandas copy-on-write is active.
    df['title1'] = df['title1'].fillna("")
    df['title2'] = df['title2'].fillna("")

    # Merge topic.title & content.title (both are NaN-free at this point).
    df['text'] = df['title1'] + '[SEP]' + df['title2']

    # preprocess
    df['target'] = df['target'].astype('float')
    df = df.reset_index(drop=True)
    print(' ')
    print('-' * 50)
    print(f"Newdataset.shape: {df.shape}")
    return df

In [None]:
"""
Model Class for Re-Ranker Model(Sentence Transformers)
"""

class NewDataset(Dataset):
    """Map-style dataset that yields ``(tokenized text, label)`` pairs.

    Reads the ``text`` and ``target`` columns of ``df``. Tokenization is delegated to
    ``CFG.reranker_tokenizer`` and padded/truncated to ``CFG.max_len``.
    """

    def __init__(self, df, CFG):
        super().__init__()
        self.cfg = CFG
        self.text = df['text'].values
        self.labels = df['target'].values

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        sample = self.tokenizing(self.text[idx])
        target = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample, target

    def tokenizing(self, text_data):
        """Encode one string and convert every returned field to a ``torch.long`` tensor."""
        encoded = self.cfg.reranker_tokenizer.encode_plus(
            text_data,
            return_tensors=None,  # fields are converted to tensors below
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.cfg.max_len,
        )
        # Overwrite values in place so the tokenizer's own container type is returned.
        for field in list(encoded.keys()):
            encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
        return encoded

In [None]:
""" Pooling Functions """
class AttentionPooling(nn.Module):
    """Pool token states with a learned softmax weighting over the sequence axis.

    A small MLP scores every token; positions where ``attention_mask == 0`` get a
    score of ``-inf`` so that they receive zero weight after the softmax.
    """

    def __init__(self, in_dim):
        super().__init__()
        scorer_layers = [
            nn.Linear(in_dim, in_dim),
            nn.LayerNorm(in_dim),
            nn.GELU(),
            nn.Linear(in_dim, 1),
        ]
        self.attention = nn.Sequential(*scorer_layers)

    def forward(self, last_hidden_state, attention_mask):
        scores = self.attention(last_hidden_state).float()
        is_padding = attention_mask == 0
        scores[is_padding] = float('-inf')
        weights = scores.softmax(dim=1)
        return (weights * last_hidden_state).sum(dim=1)

# Mean Pooling
class MeanPooling(nn.Module):
    """Average the token states over positions where ``attention_mask`` is non-zero."""

    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # Clamp so a fully masked row divides by 1e-9 instead of 0.
        token_counts = mask.sum(1).clamp(min=1e-9)
        return (last_hidden_state * mask).sum(1) / token_counts

# WeightedLayer Pooling
class WeightedLayerPooling(nn.Module):
    """Weighted average of the hidden states of layers ``layer_start`` onward.

    ``forward`` reads ``features['all_layer_embeddings']`` (a sequence of per-layer
    tensors), stores the weighted average under ``features['token_embeddings']`` and
    returns the same ``features`` mapping.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super(WeightedLayerPooling, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            # One learnable weight (initialised to 1) per layer that is kept.
            n_used = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.tensor([1] * n_used, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, features):
        stacked = torch.stack(features['all_layer_embeddings'])
        stacked = stacked[self.layer_start:, :, :, :]

        weights = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
        weights = weights.expand(stacked.size())
        averaged = (weights * stacked).sum(dim=0) / self.layer_weights.sum()

        features.update({'token_embeddings': averaged})
        return features

In [None]:
""" Model classs for Re-Ranker Model """

class ReRankerLM(nn.Module):
    """Pretrained encoder + pooling layer + single-logit linear head.

    Args:
        CFG: configuration object. Reads ``reranker`` (directory prefix holding
            ``/config`` and ``/model``), ``pooling`` and ``gradient_checkpointing``.

    Raises:
        ValueError: if ``CFG.pooling`` is not one of ``attention``, ``mean`` or
            ``weightedlayer``. Failing here avoids an ``AttributeError`` on the first
            forward pass.
    """

    SUPPORTED_POOLING = ('attention', 'mean', 'weightedlayer')

    def __init__(self, CFG):
        super().__init__()
        self.cfg = CFG
        self.auto_cfg = AutoConfig.from_pretrained(CFG.reranker + '/config',
                                                   output_hidden_states = True)
        self.model = AutoModel.from_pretrained(CFG.reranker + '/model',
                                               config = self.auto_cfg)
        self.fc = nn.Linear(self.auto_cfg.hidden_size, 1)
        self._init_weights(self.fc)

        # pooling
        if self.cfg.pooling == 'attention':
            self.pooling = AttentionPooling(self.auto_cfg.hidden_size)
        elif self.cfg.pooling == 'mean':
            self.pooling = MeanPooling()
        elif self.cfg.pooling == 'weightedlayer':
            self.pooling = WeightedLayerPooling(self.auto_cfg.num_hidden_layers, layer_weights = None)
            # WeightedLayerPooling only mixes layers (output is still per token);
            # a parameter-free masked mean reduces it to one vector per sequence.
            self.token_pooling = MeanPooling()
        else:
            raise ValueError(
                f"Unsupported pooling '{self.cfg.pooling}'; "
                f"expected one of {self.SUPPORTED_POOLING}"
            )

        # checkpointing
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

    def _init_weights(self, module):
        """Initialise ``module`` using ``auto_cfg.initializer_range`` as the normal std."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.auto_cfg.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.auto_cfg.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """Return one logit per sequence.

        Args:
            inputs: mapping of encoder keyword arguments; must contain ``attention_mask``.
        """
        outputs = self.model(**inputs)
        attention_mask = inputs['attention_mask']
        if isinstance(self.pooling, WeightedLayerPooling):
            # WeightedLayerPooling takes a features dict holding all layer outputs,
            # not (last_hidden_state, mask) like the other pooling modules.
            features = self.pooling({'all_layer_embeddings': outputs.hidden_states})
            embedding = self.token_pooling(features['token_embeddings'], attention_mask)
        else:
            embedding = self.pooling(outputs.last_hidden_state, attention_mask)
        output = self.fc(embedding)
        return output

In [None]:
""" Loss, Metric Tracker, Collate Function  """

class RMSELoss(nn.Module):
    """Root-mean-squared-error loss: ``sqrt(MSE(yhat, y) + eps)``."""

    def __init__(self, eps=1e-6):
        super().__init__()
        self.mse = nn.MSELoss()
        self.eps = eps  # added under the square root so that MSE == 0 is still handled

    def forward(self, yhat, y):
        squared_error = self.mse(yhat, y)
        return (squared_error + self.eps).sqrt()
    
class AverageMeter(object):
    """Tracks the latest value plus the weighted running sum, count and average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running average."""
        self.val = val
        self.sum = self.sum + val * n
        self.count = self.count + n
        self.avg = self.sum / self.count

def collate(inputs):
    """Trim every tensor in ``inputs`` to the largest attention-mask row sum in the batch.

    The mapping is modified in place and also returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for name in list(inputs.keys()):
        inputs[name] = inputs[name][:, :longest]
    return inputs

In [None]:
""" Trainer Input Class """

class TrainInput():
    """Bundles the data, loaders, model, loss and optimizer needed for one training run.

    Attributes:
        cfg: configuration class (e.g. ``CFG1``).
        df: dataframe returned by ``load_data(dataset)``.
        save_parameter: file name derived from ``cfg.reranker``; returned by
            ``model_setting``.
    """

    def __init__(self, cfg):
        self.cfg = cfg
        self.df = load_data(dataset) # return dataset
        self.save_parameter = f'(best_score) {self.cfg.reranker}_state_dict.pth'

    # LLRD
    def get_optimizer_grouped_parameters(self, model, layerwise_lr, layerwise_weight_decay, layerwise_lr_decay):
        """Build optimizer parameter groups with layer-wise learning-rate decay.

        Args:
            model: module exposing ``model.embeddings`` and ``model.encoder.layer``.
            layerwise_lr: learning rate for the head and for the last encoder layer.
            layerwise_weight_decay: weight decay for parameters that are not
                ``bias`` / ``LayerNorm.weight``.
            layerwise_lr_decay: factor applied to the learning rate for each layer
                further from the output.

        Returns:
            list[dict]: one group for every parameter whose name does not contain
            ``"model"``, followed by a (decay, no-decay) pair of groups per layer,
            ordered from the last encoder layer down to the embeddings.
        """
        no_decay = ["bias", "LayerNorm.weight"]
        # initialize lr for task specific layer
        optimizer_grouped_parameters = [{"params": [p for n, p in model.named_parameters() if "model" not in n],
                                         "weight_decay": 0.0,
                                         "lr": layerwise_lr,
                                        },]
        # initialize lrs for every layer
        layers = [model.model.embeddings] + list(model.model.encoder.layer)
        layers.reverse()
        lr = layerwise_lr
        for layer in layers:
            optimizer_grouped_parameters += [{"params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                                              "weight_decay": layerwise_weight_decay,
                                              "lr": lr,
                                             },
                                             {"params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
                                              "weight_decay": 0.0,
                                              "lr": lr,
                                             },]
            lr *= layerwise_lr_decay
        return optimizer_grouped_parameters

    def make_batch(self, fold):
        """Split ``self.df`` on its ``fold`` column and wrap both parts in DataLoaders.

        Args:
            fold: value of the ``fold`` column to hold out for validation.

        Returns:
            tuple: ``(loader_train, loader_valid, train, valid)`` where ``train`` and
            ``valid`` are the dataframe slices behind the two loaders.
        """
        train = self.df[self.df['fold'] != fold]
        valid = self.df[self.df['fold'] == fold]

        # Custom Dataset
        train_dataset = NewDataset(train, self.cfg)
        valid_dataset = NewDataset(valid, self.cfg)

        # DataLoader
        loader_train = DataLoader(
            train_dataset,
            batch_size = self.cfg.batch_size,
            shuffle = True,
            worker_init_fn=seed_worker,
            generator=g,
            num_workers = self.cfg.num_workers,
            pin_memory = True,
            drop_last = True,
        )

        loader_valid = DataLoader(
            valid_dataset,
            batch_size = self.cfg.batch_size,
            shuffle = False,
            worker_init_fn=seed_worker,
            generator=g,
            num_workers = self.cfg.num_workers,
            pin_memory = True,
            drop_last = False,
        )

        return loader_train, loader_valid, train, valid

    def model_setting(self):
        """Create the model (resumed from a checkpoint), the loss and the optimizer.

        Returns:
            tuple: ``(model, criterion, optimizer, save_parameter)``.
        """
        # model: start from the weights stored in the stage-3 checkpoint file
        model = ReRankerLM(self.cfg)
        # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
        state_dict = torch.load('stage3_multi-lingual-mpnet.pth',
                                map_location=self.cfg.device)
        model.load_state_dict(state_dict)
        model.to(self.cfg.device)

        # Setting Loss_Function
        if self.cfg.loss_fn == 'BCE':
            criterion = nn.BCEWithLogitsLoss()
        else:
            criterion = RMSELoss()

        # optimizer
        grouped_optimizer_params = self.get_optimizer_grouped_parameters(
            model,
            self.cfg.layerwise_lr,
            self.cfg.layerwise_weight_decay,
            self.cfg.layerwise_lr_decay
        )
        optimizer = AdamW(
            grouped_optimizer_params,
            lr = self.cfg.layerwise_lr,
            eps = self.cfg.layerwise_adam_epsilon,
            correct_bias = not self.cfg.layerwise_use_bertadam)
        return model, criterion, optimizer, self.save_parameter

In [None]:
"""
Extract Embedding, Calculate Competiton Metric
"""
def f2_score(y_true, y_pred):
    """Mean per-row F2 score between two Series of whitespace-separated id strings.

    Args:
        y_true: pd.Series of strings; each string is split on whitespace into a set.
        y_pred: pd.Series of strings, aligned row by row with ``y_true``.

    Returns:
        The mean of ``tp / (tp + 0.2 * fp + 0.8 * fn)`` over all rows, rounded to
        4 decimals. A row where both sets are empty (zero denominator) scores 0.0
        instead of NaN, so it cannot turn the whole mean into NaN.
    """
    true_sets = y_true.apply(lambda x: set(x.split()))
    pred_sets = y_pred.apply(lambda x: set(x.split()))
    pairs = list(zip(true_sets, pred_sets))
    tp = np.array([len(t & p) for t, p in pairs])
    fp = np.array([len(p - t) for t, p in pairs])
    fn = np.array([len(t - p) for t, p in pairs])
    denom = tp + 0.2 * fp + 0.8 * fn
    # Divide only where the denominator is positive; all other rows keep the 0.0 default.
    f2 = np.divide(tp, denom, out=np.zeros_like(denom, dtype=float), where=denom > 0)
    return round(f2.mean(), 4)

def get_best_threshold(x_val, val_predictions, correlations, thresholds=None):
    """Search for the probability threshold that maximises the mean F2 score.

    Args:
        x_val: dataframe with ``topics_ids`` and ``content_ids`` columns, one row per
            candidate pair. It is not modified.
        val_predictions: array-like of scores, positionally aligned with ``x_val``.
        correlations: dataframe with ``topic_id`` and ``content_ids`` columns; its
            ``content_ids`` are passed to ``f2_score`` as ``y_true``.
        thresholds: iterable of thresholds to try. Defaults to
            ``np.arange(0.001, 0.1, 0.001)``.

    Returns:
        tuple: ``(best_score, best_threshold)``. ``best_threshold`` is ``None`` (and
        ``best_score`` is 0) when no threshold gives a score greater than 0.
    """
    if thresholds is None:
        thresholds = np.arange(0.001, 0.1, 0.001)
    val_predictions = np.asarray(val_predictions)
    # Loop-invariant: every topic that must appear in the scored frame.
    all_topics = pd.Series(x_val['topics_ids'].unique())

    best_score = 0
    best_threshold = None
    for thres in thresholds:
        # Keep the pairs scored above the threshold. A boolean mask is used instead
        # of writing a 'predictions' column into the caller's (sliced) frame.
        selected = x_val[val_predictions > thres]
        predicted = selected.groupby(['topics_ids'])['content_ids'].unique().reset_index()
        predicted['content_ids'] = predicted['content_ids'].apply(lambda x: ' '.join(x))
        predicted.columns = ['topic_id', 'predictions']
        # Topics with no pair above the threshold get an empty prediction string.
        missing = all_topics[~all_topics.isin(predicted['topic_id'])]
        empty = pd.DataFrame({'topic_id': missing.values, 'predictions': ""})
        scored = pd.concat([predicted, empty], axis = 0, ignore_index = True)
        scored = scored.merge(correlations, how = 'left', on = 'topic_id')
        score = f2_score(scored['content_ids'], scored['predictions'])
        if score > best_score:
            best_score = score
            best_threshold = thres
    return best_score, best_threshold

In [None]:
# Step 3.1 Train & Validation Function
def train_fn(cfg, loader_train, loader_valid, model, criterion, optimizer, scheduler, valid):
    """Run one training pass and one validation pass, then tune the decision threshold.

    Args:
        cfg: configuration; reads ``amp_scaler``, ``device`` and ``max_grad_norm``.
        loader_train: DataLoader yielding ``(inputs, target)`` training batches.
        loader_valid: DataLoader yielding ``(inputs, target)`` validation batches.
        model: network called as ``model(inputs)``.
        criterion: loss called as ``criterion(preds.view(-1), target)``.
        optimizer: optimizer stepped once per training batch.
        scheduler: LR scheduler stepped once per training batch.
        valid: validation dataframe passed to ``get_best_threshold`` together with
            the sigmoid predictions.

    Returns:
        tuple: ``(train_loss_avg, valid_loss_avg, fold_score, fold_thres)``.
    """
    # One flag drives both autocast and the gradient scaler. With amp_scaler=False
    # the scaler becomes a pass-through instead of being left undefined.
    use_amp = bool(cfg.amp_scaler)
    scaler = torch.cuda.amp.GradScaler(enabled = use_amp)

    # Train Stage
    losses = AverageMeter()
    model.train()
    for inputs, target in tqdm(loader_train):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(cfg.device) # train to gpu
        target = target.to(cfg.device) # label to gpu
        batch_size = target.size(0)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled = use_amp):
            preds = model(inputs)
            loss = criterion(preds.view(-1), target)
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        # Unscale before clipping so the norm is measured on the true gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

    # Validation Stage
    valid_losses = AverageMeter()
    preds_list = []
    model.eval()
    with torch.no_grad():
        for inputs, target in tqdm(loader_valid):
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(cfg.device)
            target = target.to(cfg.device)
            batch_size = target.size(0)
            preds = model(inputs)
            valid_loss = criterion(preds.view(-1), target)
            valid_losses.update(valid_loss.item(), batch_size)
            preds_list.append(preds.sigmoid().reshape(-1).to('cpu').numpy())

    predictions = np.concatenate(preds_list, axis = 0)
    # Named best_f2 so the module-level f2_score function is not shadowed.
    best_f2, fold_thres = get_best_threshold(valid, predictions, correlations)
    fold_score = np.mean(best_f2)

    return losses.avg, valid_losses.avg, fold_score, fold_thres

In [None]:
""" Train/Validation Loop """


# Train / validate every configuration in cfg_list, logging to Weights & Biases.
cfg_list = [CFG1]
for cfg in cfg_list:
    # init wandb
    wandb.init(project="[kaggle] LECR",
               name=cfg.reranker,
               config=class2dict(cfg),
               group=cfg.reranker,
               job_type="train",
               entity = "qcqced")
    wandb_config = wandb.config
    print(f'========================= Re-Ranker Model :{cfg.reranker} =========================')
    train_input = TrainInput(cfg) # init object
    model, criterion, optimizer, save_parameter = train_input.model_setting()
    val_score_max, best_thres = -np.inf, 0
    for epoch in range(cfg.epochs):
        fold_list = [i for i in range(cfg.n_folds)]
        epoch_train_loss, epoch_valid_loss, reranker_score, epoch_thres = [], [], [], []
        # Only the slice fold_list[1:2] (i.e. fold 1) is processed in this run.
        for fold in tqdm(fold_list[1:2]):
            print(f'============== {fold}th Fold Train & Validation ==============')
            loader_train, loader_valid, train, valid = train_input.make_batch(fold)

            # scheduler: both step counts must be integers
            total_steps = int(len(train) / cfg.batch_size * cfg.epochs)
            warmup_steps = int(total_steps * cfg.warmup_ratio)
            if cfg.scheduler == 'cosine':
                scheduler = get_cosine_schedule_with_warmup(
                    optimizer,
                    num_warmup_steps=warmup_steps,
                    num_training_steps=total_steps,
                    num_cycles = cfg.num_cycles
                )
            else:
                # The linear schedule has no num_cycles argument; passing one raises TypeError.
                scheduler = get_linear_schedule_with_warmup(
                    optimizer,
                    num_warmup_steps=warmup_steps,
                    num_training_steps=total_steps,
                )

            train_loss, valid_loss, fold_score, fold_thres = train_fn(
                cfg,
                loader_train,
                loader_valid,
                model,
                criterion,
                optimizer,
                scheduler,
                valid
            )
            epoch_train_loss.append(train_loss)
            epoch_valid_loss.append(valid_loss)
            reranker_score.append(fold_score)
            epoch_thres.append(fold_thres)
            wandb.log({'Fold Train Loss': train_loss,
                       'Fold Valid Loss': valid_loss,
                       'Fold Mean F2-Score': fold_score,
                       'Fold Best Threshold': fold_thres,})

            print(f'fold[{fold}/{fold_list[-1]}] Train Loss: {round(train_loss, 4)}')
            print(f'fold[{fold}/{fold_list[-1]}] Valid Loss: {round(valid_loss, 4)}')
            print(f'fold[{fold}/{fold_list[-1]}] Mean F2-Score: {round(fold_score, 4)}')

        epoch_train_loss = np.mean(epoch_train_loss)
        epoch_valid_loss = np.mean(epoch_valid_loss)
        epoch_score = np.mean(reranker_score)
        # get_best_threshold returns None when no threshold scored above 0; skip those
        # entries so np.mean does not raise after a full epoch of training.
        found_thres = [t for t in epoch_thres if t is not None]
        epoch_thres = np.mean(found_thres) if found_thres else float('nan')
        wandb.log({'Train Loss': epoch_train_loss,
                   'Valid Loss': epoch_valid_loss,
                   'Mean F2-Score': epoch_score,
                   'Threshold': epoch_thres,})

        print(f'================= {epoch}th Train & Validation =================')
        print(f'epoch[{epoch+1}/{cfg.epochs}] Train Loss: {round(epoch_train_loss, 4)}')
        print(f'epoch[{epoch+1}/{cfg.epochs}] Valid Loss: {round(epoch_valid_loss, 4)}')
        print(f'epoch[{epoch+1}/{cfg.epochs}] Mean F2-Score: {round(epoch_score, 4)}')
        print(f'epoch[{epoch+1}/{cfg.epochs}] Threshold: {round(epoch_thres, 4)}')

        if val_score_max <= epoch_score:
            print(f'[Update] Valid Score : ({val_score_max:.4f} => {epoch_score:.4f}) Save Parameter')
            torch.save(model.state_dict(),
                       'stage3_multi-lingual-mpnet.pth')
            # Remember the best score so later, worse epochs do not overwrite the checkpoint.
            val_score_max = epoch_score
            best_thres = epoch_thres

    wandb.finish()

[Train History]
- 1st Epoch: 4, 3, 2, 1, 0
- 2nd Epoch: 2, 0, 3, 4, 1

[Train Config History]
- batch size: 128