In [None]:
import os
import re

import numpy as np
import pandas as pd

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torch

from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoConfig
import transformers

import pytorch_lightning as pl
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from sklearn import metrics


In [None]:
# Experiment configuration: every tunable value of the run lives here.

cfg01 = {
    'fold_num': 5,  # Unused
    'seed': 123,
    # Directory holding the pretrained checkpoint (tokenizer + model config).
    'model': '../input/deberta-v3-large/deberta-v3-large',
    'path': '',
    # Tokenised sequences are padded / truncated to this many tokens.
    'max_len': 400,
    'train_bs': 4,
    'valid_bs': 24,
    'accumulate_grad_batches': 4,
    # Number of DataLoader worker processes.
    'workers': 2,
    'gradient_clip_val': 1000,
    'learning_rate': 2e-5,
}

# Root directory under which all input datasets are looked up.
data_folder = '../input/'

In [None]:
# Select the active configuration; the rest of the notebook only reads `cfg`.
cfg = cfg01

# Seed the random number generators before anything stochastic runs.
seed_everything(cfg['seed'])
# Tokenizer matching the pretrained checkpoint named in the config.
tokenizer = AutoTokenizer.from_pretrained(cfg['model'])
# Separator token string, used below to join the text fields of one sample.
sep = tokenizer.sep_token

## Load Data

In [None]:
def get_cpc_texts(root=None):
    """Build a mapping from CPC context code (e.g. ``'A01'``) to a readable title.

    The context codes are collected from the file names found in
    ``<root>/cpc-data/CPCSchemeXML202105``. Their titles are read from the
    per-section listings in ``<root>/cpc-data/CPCTitleList202202``, where a
    title line has the form ``<code>\\t\\t<title>``.

    Args:
        root: Directory containing the ``cpc-data`` folder. Defaults to the
            notebook-level ``data_folder``.

    Returns:
        dict mapping each context code to ``"<section title>. <context title>"``.

    Raises:
        LookupError: if a section or context code has no title line in its
            section listing.
    """
    if root is None:
        root = data_folder

    def find_title(code, listing):
        # Capture the text after "<code>\t\t" with a group. The previous
        # implementation used str.lstrip(pattern), which strips a *set of
        # characters* rather than a prefix and therefore also ate the leading
        # letters of many titles (e.g. "AGRICULTURE" -> "GRICULTURE").
        match = re.search(re.escape(code) + r'\t\t(.+)', listing)
        if match is None:
            raise LookupError(f'no title line found for CPC code {code!r}')
        return match.group(1)

    scheme_dir = os.path.join(root, 'cpc-data', 'CPCSchemeXML202105')
    title_dir = os.path.join(root, 'cpc-data', 'CPCTitleList202202')

    # A context code is one capital letter followed by digits, e.g. "A01".
    code_pattern = re.compile(r'[A-Z]\d+')
    contexts = sorted({
        code
        for file_name in os.listdir(scheme_dir)
        for code in code_pattern.findall(file_name)
    })

    results = {}
    for section in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        listing_path = os.path.join(title_dir, f'cpc-section-{section}_20220201.txt')
        with open(listing_path, encoding='utf-8') as f:
            listing = f.read()
        section_title = find_title(section, listing)

        # Only the contexts that belong to this section are in this listing.
        for context in contexts:
            if context[0] == section:
                results[context] = section_title + ". " + find_title(context, listing)
    return results

# Mapping from CPC context code to its human-readable title.
cpc_texts = get_cpc_texts()

# Competition train / test tables, read from the shared input folder.
df_train = pd.read_csv(f'{data_folder}/us-patent-phrase-to-phrase-matching/train.csv')
df_test = pd.read_csv(f'{data_folder}/us-patent-phrase-to-phrase-matching/test.csv')

### Preprocessing

In [None]:
# Tag every row with the split it came from so the two frames can be processed
# together and separated again afterwards.
df_train['type'] = 'train'
# Fixed: this column was previously misspelled 'tyoe', which left the test rows
# with type == NaN and made the later `type == 'test'` filter return nothing.
df_test['type'] = 'test'
df_test['score'] = np.nan

df_all = pd.concat([df_test, df_train], axis=0)

# Lower-cased CPC title of each row's context code. `str.lower` fails loudly on
# a context code that is missing from `cpc_texts` (it would map to NaN).
df_all['context_text'] = df_all['context'].map(cpc_texts).apply(str.lower)

# ref  : every distinct target seen with the same anchor (first-seen order).
# ref2 : same as ref, without the row's own target.
df_all = df_all.join(df_all.groupby('anchor').target.agg(list).rename('ref'), on='anchor')
df_all['ref2'] = df_all.apply(lambda x: [i for i in x['ref'] if i != x['target']], axis=1)
# dict.fromkeys de-duplicates while keeping first-occurrence order; this gives
# the same result as sorted(set(x), key=x.index) without the quadratic lookups.
df_all['ref2'] = df_all.ref2.apply(lambda x: ', '.join(dict.fromkeys(x)))
df_all['ref'] = df_all.ref.apply(lambda x: ', '.join(dict.fromkeys(x)))

# ref3 : the other targets seen with the same (anchor, context) pair.
df_all = df_all.join(df_all.groupby(['anchor', 'context']).target.agg(list).rename('ref3'), on=['anchor', 'context'])
df_all['ref3'] = df_all.apply(lambda x: ', '.join([i for i in x['ref3'] if i != x['target']]), axis=1)

# anchor_list : the other anchors that appear in the same context.
df_all = df_all.join(df_all.groupby('context').anchor.agg('unique').rename('anchor_list'), on='context')
df_all['anchor_list'] = df_all.apply(lambda x: ', '.join([i for i in x['anchor_list'] if i != x['anchor']]), axis=1)

# Candidate model inputs. text1 is the shared "anchor / target / context" prefix
# that text2..text5 extend with different reference lists.
df_all['text1'] = df_all['anchor'] + sep + df_all['target'] + sep + df_all['context_text']
df_all['text2'] = df_all['text1'] + sep + df_all['ref']
df_all['text3'] = df_all['text1'] + sep + df_all['ref2']
df_all['text4'] = df_all['text1'] + sep + df_all['ref2'] + ', ' + df_all['anchor_list']
df_all['text5'] = df_all['text1'] + sep + df_all['ref3']
df_all['text6'] = 'The similarity between anchor ' + df_all['anchor'] + ' and target ' + df_all['target'] + '. Context is ' + df_all['context_text'] + '. Candidates are ' + df_all['ref3']

# Select the one used
df_all['input'] = df_all.text3

df_all.head(2)

In [None]:
# One-hot encode the score (one `score_<value>` column per distinct score) and
# pack each row's indicator values into a single list stored in `out`.
df_all['out'] = (
    pd.get_dummies(df_all['score'], prefix='score')
    .agg(list, axis=1)
)

# Preview the encoded target for the first two training rows.
df_all.loc[df_all['type'] == 'train', 'out'].head(2)

In [None]:
from sklearn.model_selection import train_test_split

# Split the combined frame back into its train / test parts (copies, so later
# edits do not touch `df_all`).
df_train = df_all.loc[df_all['type'] == 'train'].copy()
df_test = df_all.loc[df_all['type'] == 'test'].copy()

# Hold out 20% of the training rows for validation, with a fixed random state
# so the split is reproducible.
df_train, df_val = train_test_split(
    df_train,
    test_size=.20,
    shuffle=True,
    random_state=41,
)
(df_train.shape, df_val.shape)

In [None]:
class PatentDataset(Dataset):
    """Torch dataset that tokenizes the ``input`` column of a DataFrame on the fly.

    Each item is a dict with:
        ids:    token ids, padded / truncated to ``max_length`` (``torch.long``)
        mask:   attention mask aligned with ``ids`` (``torch.long``)
        target: the row's ``score`` as a float tensor (omitted when ``export=True``)
    """

    def __init__(self, tokenizer, dataset, max_length, export=False):
        """
        Args:
            tokenizer: The tokenizer to be used.
            dataset: DataFrame with an ``input`` column (and a ``score`` column
                unless ``export`` is True).
            max_length: The maximum length of the input, in tokens.
            export: This mode is designed for computing final results on a dataset
                that does not contain the target variable.
        """
        super().__init__()
        self.export = export
        self.tokenizer = tokenizer
        self.df = dataset
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Read from the frame this dataset was built with. The previous version
        # read the global `df_train` here, so validation and test datasets
        # silently returned training texts.
        text = self.df['input'].iloc[index]

        # The deprecated `truncation_strategy` kwarg was dropped: it is ignored
        # when `truncation=True` is given.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # The tokenizer returns a batch of one; keep the single row.
        ids = inputs["input_ids"][0]
        mask = inputs["attention_mask"][0]

        out = dict(
            ids=ids.to(torch.long),
            mask=mask.to(torch.long),
        )

        if not self.export:
            out['target'] = torch.tensor(self.df.score.iloc[index], dtype=torch.float)

        return out

# One dataset per split; all share the tokenizer and the configured max length.
train_dataset = PatentDataset(
    dataset=df_train,
    tokenizer=tokenizer,
    max_length=cfg['max_len'],
)
val_dataset = PatentDataset(
    dataset=df_val,
    tokenizer=tokenizer,
    max_length=cfg['max_len'],
)
# The test split carries no usable score, hence export mode (no `target` key).
test_dataset = PatentDataset(
    dataset=df_test,
    tokenizer=tokenizer,
    max_length=cfg['max_len'],
    export=True,
)

In [None]:
# Batch the three datasets. Training uses the small training batch size;
# validation and test share the larger evaluation batch size.
# NOTE(review): no `shuffle` argument is passed for training, so rows are served
# in the order produced by the earlier train/validation split — confirm intended.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=cfg['train_bs'],
    num_workers=cfg['workers'],
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=cfg['valid_bs'],
    num_workers=cfg['workers'],
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=cfg['valid_bs'],
    num_workers=cfg['workers'],
    shuffle=False,
)

### Model

In [None]:
from pytorch_lightning.utilities.types import STEP_OUTPUT


class Model(pl.LightningModule):
    """Transformer encoder with attention pooling and a single-logit regression head.

    The encoder output is pooled with a learned attention over the sequence
    dimension and projected to one logit per sample. Training minimises
    binary cross-entropy with logits against the similarity score, so the
    prediction for a sample is ``sigmoid(logit)``.
    """

    # Number of training samples accumulated before the running Pearson
    # correlation is reported and the buffers are reset.
    PEARSON_WINDOW = 1200

    def __init__(self, num_classes, num_train_optimization_steps,
                 deeper_layer_to_train=11,
                 learning_rate=1e-5, warmup_steps=0, weight_decay=0.01, adam_epsilon=1e-08,
                 load_pretrained_weights=True):
        """
        Args:
            num_classes: Kept for interface compatibility; the head always has one output.
            num_train_optimization_steps: Total number of optimizer steps, used
                as the length of the linear learning-rate schedule.
            deeper_layer_to_train: First encoder layer to unfreeze when
                ``configure_trained_layers`` is called (it is not called by default).
            learning_rate: AdamW learning rate.
            warmup_steps: Number of warm-up steps of the schedule.
            weight_decay: Weight decay for all parameters except biases and
                LayerNorm weights.
            adam_epsilon: AdamW epsilon.
            load_pretrained_weights: When True (default) the encoder is loaded
                with its pretrained weights. When False only the architecture
                is built, with freshly initialised weights.
        """
        super().__init__()
        self.config = AutoConfig.from_pretrained(cfg['model'], output_hidden_states=True)

        if load_pretrained_weights:
            # `from_config` only builds the architecture with random weights;
            # fine-tuning needs the pretrained weights from `from_pretrained`.
            self.pretrained_model = AutoModel.from_pretrained(cfg['model'], config=self.config)
        else:
            self.pretrained_model = AutoModel.from_config(config=self.config)

        self.fc = nn.Linear(self.config.hidden_size, 1)
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        # `_init_weights` only handles leaf modules, so it must be applied to
        # each sub-module; calling it on the Sequential itself does nothing.
        self.attention.apply(self._init_weights)
        self.save_hyperparameters()

        # Buffers for the running training Pearson correlation (debug aid).
        self.total_true = np.array([])
        self.total_pred = np.array([])

    def _init_weights(self, module):
        """Initialise one leaf module using the encoder config's ``initializer_range``."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def configure_trained_layers(self, deeper_layer_to_train, verbose=1):
        """Freeze encoder parameters up to ``encoder.layer.<deeper_layer_to_train>``.

        Parameters are visited in ``named_parameters()`` order; everything
        before the first parameter of the given layer is frozen, everything
        from it onwards is trainable. ``verbose=1`` prints trainable
        parameters only, ``verbose=2`` prints all of them.
        """
        requires_grad = False
        print(f'deeper_layer_to_train: {deeper_layer_to_train}')
        for name, param in self.pretrained_model.named_parameters():
            if f'encoder.layer.{deeper_layer_to_train}' in name:
                requires_grad = True
            param.requires_grad = requires_grad
            if verbose == 2 or (verbose == 1 and requires_grad):
                print(f'layer {name} is {"NOT " if requires_grad is False else ""}trained.')

    def forward(self, ids, mask):
        """Return one logit per sample as a 1-D tensor."""
        out = self.pretrained_model(ids, attention_mask=mask, return_dict=False)
        # out[0] is the first encoder output; presumably the last hidden state
        # with shape (batch, seq, hidden) — TODO confirm.
        hidden = out[0]
        # Softmax over dim=1 turns the per-position scores into pooling weights.
        weights = self.attention(hidden)
        feature = torch.sum(weights * hidden, dim=1)
        return self.fc(feature).view(-1)

    def configure_optimizers(self):
        """AdamW with weight decay disabled for biases / LayerNorm weights, plus a
        linear warm-up / decay schedule stepped at every optimizer step."""
        no_decay = ['bias', 'LayerNorm.weight']
        decayed, not_decayed = [], []
        for name, param in self.named_parameters():
            if any(nd in name for nd in no_decay):
                not_decayed.append(param)
            else:
                decayed.append(param)
        optimizer_grouped_parameters = [
            {'params': decayed, 'weight_decay': self.hparams.weight_decay},
            {'params': not_decayed, 'weight_decay': 0.0},
        ]

        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        print(f'current learning rate: {self.hparams.learning_rate}')

        scheduler = transformers.get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.hparams.num_train_optimization_steps
        )
        scheduler = {'scheduler': scheduler, 'interval': 'step', 'frequency': 1}

        return [optimizer], [scheduler]

    def training_step(self, batch, batch_idx) -> STEP_OUTPUT:
        loss = self._common_step(batch, batch_idx, 'train')
        return loss

    def validation_step(self, batch, batch_idx):
        self._common_step(batch, batch_idx, 'val')

    def _common_step(self, batch, batch_idx, stage: str):
        """Run one batch, log its metrics under ``stage`` and return the loss."""
        ids, label, mask = self._prepare_batch(batch)
        logits = self(ids=ids, mask=mask)

        loss = F.binary_cross_entropy_with_logits(logits, label)

        label_cpu = label.detach().cpu()
        # The loss is computed on logits, so the actual prediction is the
        # sigmoid of the output. Metrics were previously computed on the raw
        # logits, which are not on the [0, 1] score scale.
        pred_cpu = torch.sigmoid(logits.detach().float()).cpu()

        if stage == 'train':
            self.total_true = np.concatenate([self.total_true, label_cpu.view(-1).numpy()])
            self.total_pred = np.concatenate([self.total_pred, pred_cpu.view(-1).numpy()])

            if len(self.total_true) >= self.PEARSON_WINDOW:
                pearson = np.round(sp.stats.pearsonr(self.total_true, self.total_pred)[0], 3)
                print('batched_pearson', pearson)
                self.log("batched_pearson", pearson, on_step=True, prog_bar=True)
                self.total_true = np.array([])
                self.total_pred = np.array([])

        # Map labels and predictions to the five score classes
        # (0, 0.25, ..., 1 -> 0..4) for the classification-style metrics.
        y_true = torch.round(label_cpu * 4).to(torch.long)
        y_pred = torch.clamp(torch.round(pred_cpu * 4), 0, 4).to(torch.long)

        self._log_metrics(loss, y_true, y_pred, label_cpu, stage)
        return loss

    def _log_metrics(self, loss, y_true, y_pred, raw_y_true, stage: str) -> None:
        """Log loss, accuracy and Pearson for one batch; add a classification
        report (macro / weighted precision, recall, f1) during validation."""
        # Compute metrics
        acc = (y_true == y_pred).float().mean()
        if len(y_true) < 2:
            # pearsonr needs at least two points (e.g. a final batch of size 1).
            pearson = float('nan')
        else:
            pearson = np.round(sp.stats.pearsonr(y_true.numpy(), y_pred.numpy())[0], 3)

        # Log metrics
        self.log(f"Loss/{stage}", loss, on_step=True, prog_bar=True)
        self.log(f"Accuracy/{stage}", acc, on_step=True, prog_bar=True)
        self.log(f"Pearson/{stage}", pearson, on_step=True, prog_bar=True)

        if stage == 'val':  # do classification report
            classification_report = metrics.classification_report(y_true, y_pred, digits=2, output_dict=True)
            macro = classification_report['macro avg']
            weighted = classification_report['weighted avg']
            self.log(f'Macro/{stage}', {'precision': macro['precision'], 'recall': macro['recall'], 'f1': macro['f1-score']}, prog_bar=False, on_step=True)
            self.log(f'Weighted/{stage}', {'precision': weighted['precision'], 'recall': weighted['recall'], 'f1': weighted['f1-score']}, prog_bar=False, on_step=True)

    def predict_step(self, batch, batch_idx: int, dataloader_idx: int = 0):
        """Return the predicted similarity score of every sample in the batch.

        The previous implementation took an argmax over the 1-D batch of logits,
        which yields a single batch index rather than one prediction per sample.
        """
        ids, _, mask = self._prepare_batch(batch, include_target=False)
        output = self(ids=ids, mask=mask)
        return torch.sigmoid(output)

    def test_step(self, batch, batch_idx):
        self._common_step(batch, batch_idx, 'test')

    def _prepare_batch(self, batch, include_target=True):
        """Unpack a batch dict into ``(ids, label, mask)``; ``label`` is None
        when ``include_target`` is False."""
        ids = batch['ids']
        mask = batch['mask']
        if not include_target:
            return ids, None, mask
        label = batch['target']
        return ids, label, mask


In [None]:
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
from torch import nn

# Add the WandbLogger to your PyTorch Lightning Trainer
from pytorch_lightning.callbacks import LearningRateMonitor
import logging
from logging import WARNING
logging.basicConfig(level=WARNING)

# Weights & Biases logging is currently disabled. If it is re-enabled, read the
# API key from the environment instead of pasting it into the notebook:
# import wandb
# wandb.login(key=os.environ['WANDB_API_KEY'])


# Stop when the validation loss has not improved by at least 1e-4 for 3 checks.
early_stop_callback = EarlyStopping(monitor="Loss/val", min_delta=1e-4, patience=3, verbose=False, mode="min")
lr_logger = LearningRateMonitor(logging_interval='step')
# wb_logger = WandbLogger(name=f"{cfg['model']}_local", save_dir='./logs/WB_logs', offline=False, project='PPPM', tags=["deberta_fine_tune", cfg['model']], config=cfg)
tb_logger = TensorBoardLogger(save_dir="./logs/TensorBoard_logs", name="DeBERTa_fine-tuned")


trainer = pl.Trainer(
    accelerator='gpu',
    gradient_clip_val=cfg['gradient_clip_val'],
    auto_lr_find=False,
    callbacks=[lr_logger, early_stop_callback],
    #logger=[tb_logger, wb_logger],
    logger=[tb_logger],
    weights_summary="top",
    max_epochs=1,
    accumulate_grad_batches=cfg['accumulate_grad_batches'],
    precision=16,
    amp_backend="native"
)

# The LR scheduler is stepped once per *optimizer* step. With gradient
# accumulation there is one optimizer step every `accumulate_grad_batches`
# batches, so the schedule length must be derived from that rather than from
# the raw number of batches; otherwise the linear decay never completes.
# `-(-a // b)` is integer ceil-division.
optimizer_steps_per_epoch = -(-len(train_dataloader) // cfg['accumulate_grad_batches'])

hparams = dict(
    num_classes=1, # df_train.score.unique().size,
    deeper_layer_to_train=11,  # Not used
    num_train_optimization_steps=optimizer_steps_per_epoch * trainer.max_epochs,
    learning_rate=cfg['learning_rate']
)

# Set to a checkpoint path to resume from saved weights instead of starting fresh.
checkpoint = None
if checkpoint is not None:
    model = Model.load_from_checkpoint(checkpoint, **hparams)
    print(f'Checkpoint {checkpoint} loaded')
else:
    model = Model(**hparams)

In [None]:
# Silence warnings that would otherwise flood the training output.
import warnings
from sklearn.exceptions import UndefinedMetricWarning

# Installed before the scipy imports below; keep this order.
warnings.filterwarnings("ignore", category=DeprecationWarning) 
# Ignore sklearn's undefined-metric warnings.
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
# `sp` is the name the model's metric code uses to reach scipy.stats.pearsonr.
import scipy as sp
from scipy.stats import PearsonRConstantInputWarning

# Ignore scipy's constant-input warning from pearsonr.
warnings.filterwarnings('ignore', category=PearsonRConstantInputWarning)

In [None]:
# trainer.validate(model, val_dataloader)

In [None]:
# Train the model, evaluating on the validation loader during training.
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)
# 10.2go 12.9go
# NOTE(review): the figures above are presumably observed memory usage in GB
# ("Go") — confirm what they refer to.

In [None]:
# Final evaluation pass of the trained model over the validation loader.
trainer.validate(model, val_dataloader)

In [None]:
#results = trainer.predict(model, test_dataloader)

#df_test['y_pred'] = np.concatenate(results)
#df_test