In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchmetrics import MeanSquaredError
from torch.utils.data import Dataset, DataLoader
from transformers import AutoConfig, AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning import Callback
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

In [None]:
# Read the competition's training and test tables and show their shapes.
train_csv_path = '../input/us-patent-phrase-to-phrase-matching/train.csv'
test_csv_path = '../input/us-patent-phrase-to-phrase-matching/test.csv'
data, test_data = pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)
(data.shape, test_data.shape)

In [None]:
# Preview the first rows of the frame read from train.csv.
data.head()

In [None]:
# Load the CPC titles table and rename its `code` column to `context`,
# the key column used when merging it into the train/test frames.
codes = pd.read_csv('../input/cpc-codes/titles.csv').rename(columns={'code': 'context'})
codes.head()

In [None]:
# Attach the CPC `title` text to each row through its `context` code.
# The left join keeps every original row; a context absent from `codes`
# ends up with a NaN title.
title_lookup = codes[['context', 'title']]
train_data = data.merge(title_lookup, on='context', how='left')
test_data = test_data.merge(title_lookup, on='context', how='left')

In [None]:
# Preview the training frame after the `title` column has been merged in.
train_data.head()

## Configuration Class

In [None]:
class CFG:
    # Central place for every tunable setting; read as class attributes (CFG.<name>).

    # --- data / tokenisation ---
    val_size = 0.2            # fraction of rows held out as the validation split
    max_len = 192             # tokenizer max_length (sequences are padded/truncated to it)
    model_name = '../input/bert-for-patent/bert-for-patents'  # path handed to from_pretrained

    # --- optimisation ---
    batch_size = 16           # batch size of all three DataLoaders
    epochs = 5                # Trainer max_epochs
    lr = 2e-5                 # Adam learning rate

    # The next four are not read by any cell shown in this notebook. Their names
    # match torch's OneCycleLR arguments -- presumably meant for a scheduler that
    # is not configured here (TODO confirm or remove).
    max_lr = 1e-3
    steps_per_epoch = None    # filled in later with len(train_dataloader)
    pct_start = 0.3
    div_factor = 1e2
    final_div_factor = 1e4

    accumulate = 1            # Trainer accumulate_grad_batches

    # --- callbacks ---
    patience = 3              # EarlyStopping patience
    monitor = 'val_loss'      # metric watched by ModelCheckpoint and EarlyStopping

    # --- misc ---
    seed = 42                 # random_state of the train/validation split
    debug = False             # True -> train on the first 200 rows only
    dropout = 0.2             # base rate scaled into the model's five dropout layers

## Dataset

In [None]:
class PhraseSimilarityDataset(Dataset):
    """Dataset of tokenised anchor/target/title text together with its score.

    Args:
        df: DataFrame providing `anchor`, `target`, `title` and `score` columns.
        tokenizer: callable applied as ``tokenizer(text, **tokenizer_params)``
            whose result is indexed with 'input_ids' and 'attention_mask'.

    Each item is ``(input_ids, attention_mask, score)``: two NumPy arrays and a
    float32 scalar tensor.
    """

    # Columns joined (in this order) into the single text given to the tokenizer.
    _TEXT_COLUMNS = ('anchor', 'target', 'title')

    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
        # Pad/truncate every example to the same configured length.
        self.tokenizer_params = dict(
            max_length=CFG.max_len,
            padding='max_length',
            truncation=True,
        )

    def __len__(self):
        return len(self.df.index)

    def __getitem__(self, index):
        # Lower-case each field, then glue the fields together with a literal '[SEP]'.
        pieces = [self.df[column].iloc[index].lower() for column in self._TEXT_COLUMNS]
        encoded = self.tokenizer('[SEP]'.join(pieces), **self.tokenizer_params)

        label = torch.tensor(self.df['score'].iloc[index], dtype=torch.float32)

        input_ids = np.array(encoded['input_ids'])
        attention_mask = np.array(encoded['attention_mask'])
        return input_ids, attention_mask, label
    
class PhraseSimilarityTestset(Dataset):
    """Unlabelled counterpart of the training dataset.

    Args:
        df: DataFrame providing `anchor`, `target` and `title` columns.
        tokenizer: callable applied as ``tokenizer(text, **tokenizer_params)``
            whose result is indexed with 'input_ids' and 'attention_mask'.

    Each item is ``(input_ids, attention_mask)`` as two NumPy arrays.
    """

    # Columns joined (in this order) into the single text given to the tokenizer.
    _TEXT_COLUMNS = ('anchor', 'target', 'title')

    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
        # Pad/truncate every example to the same configured length.
        self.tokenizer_params = dict(
            max_length=CFG.max_len,
            padding='max_length',
            truncation=True,
        )

    def __len__(self):
        return len(self.df.index)

    def __getitem__(self, index):
        # Lower-case each field, then glue the fields together with a literal '[SEP]'.
        pieces = [self.df[column].iloc[index].lower() for column in self._TEXT_COLUMNS]
        encoded = self.tokenizer('[SEP]'.join(pieces), **self.tokenizer_params)

        input_ids = np.array(encoded['input_ids'])
        attention_mask = np.array(encoded['attention_mask'])
        return input_ids, attention_mask

## Model Class

### Model Architecture

Reference notebook: [PPPM train DeBERTa-large baseline with PL](https://www.kaggle.com/code/tianzijing/pppm-train-deberta-large-baseline-with-pl)

In [None]:
class PhraseSimilarityModelImpl(nn.Module):
    """Pretrained encoder + attention pooling + multi-sample-dropout regression head.

    The encoder's per-token hidden states are pooled into one vector with a
    learned attention (Linear -> Tanh -> Linear -> softmax over the sequence
    axis). The pooled vector is passed through five dropout layers of
    increasing rate, each followed by the same linear head, and the five
    outputs are averaged.

    Args:
        model_name: name or path accepted by ``AutoConfig`` / ``AutoModel``
            ``from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model_config = AutoConfig.from_pretrained(model_name)
        # The encoder is loaded exactly once; a second, unused copy would only
        # double the memory footprint and the checkpoint size.
        self.bert = AutoModel.from_pretrained(model_name)

        # Size the layers from the config so encoders of any width work.
        hidden_size = self.model_config.hidden_size
        self.head = nn.Linear(hidden_size, 1, bias=True)

        # Produces one unnormalised score per token; the softmax is applied in
        # forward() so that padding can be masked out first.
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
        )

        self.dropout_0 = nn.Dropout(CFG.dropout / 2.)
        self.dropout_1 = nn.Dropout(CFG.dropout / 1.5)
        self.dropout_2 = nn.Dropout(CFG.dropout)
        self.dropout_3 = nn.Dropout(CFG.dropout * 1.5)
        self.dropout_4 = nn.Dropout(CFG.dropout * 2.)

    def forward(self, text, mask):
        """Return one regression output per example.

        Args:
            text: token ids passed to the encoder as ``input_ids``.
            mask: attention mask (non-zero = real token) passed to the encoder
                and used to exclude padding from the pooling; may be None.

        Returns:
            Tensor with a trailing dimension of size 1 (one value per example).
        """
        encoder_out = self.bert(input_ids=text, attention_mask=mask)
        last_hidden_states = encoder_out[0]  # expected (batch, seq, hidden)

        token_scores = self.attention(last_hidden_states)  # (batch, seq, 1)
        if mask is not None:
            # Padding tokens must not receive pooling weight. Filling with the
            # dtype's minimum (not -inf) keeps an all-padding row finite.
            padding = mask.unsqueeze(-1) == 0
            token_scores = token_scores.masked_fill(padding, torch.finfo(token_scores.dtype).min)
        weights = torch.softmax(token_scores, dim=1)
        feats = torch.sum(weights * last_hidden_states, dim=1)

        # Multi-sample dropout: same head, five dropout rates, averaged.
        dropouts = (self.dropout_0, self.dropout_1, self.dropout_2, self.dropout_3, self.dropout_4)
        outputs = [self.head(drop(feats)) for drop in dropouts]
        return torch.stack(outputs, dim=0).mean(dim=0)

class PhraseSimilarityModel(pl.LightningModule):
    """Lightning wrapper that trains, validates and predicts with a wrapped model.

    Args:
        model: module called as ``model(ids, mask)``.
        criterion: loss called as ``criterion(predictions, targets)``.
        metric: error metric called as ``metric(predictions, targets)``.

    Batches are indexed positionally: ``batch[0]`` token ids, ``batch[1]``
    attention mask and, for train/validation, ``batch[2]`` targets.
    """

    def __init__(self, model, criterion, metric):
        super().__init__()
        self.model = model
        self.criterion = criterion
        self.metric = metric

    def forward(self, text, mask):
        return self.model(text, mask)

    def configure_optimizers(self):
        # Kept on the instance because training_step reads the current LR from it.
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=CFG.lr)
        return self.optimizer

    def _loss_and_error(self, batch):
        """Run the model on a labelled batch and return ``(loss, error)``."""
        token_ids, attention_mask, targets = batch[0], batch[1], batch[2]
        raw_preds = self.model(token_ids, attention_mask)
        # Drop dimension 1 of the model output before comparing with the targets.
        loss = self.criterion(raw_preds.squeeze(1), targets)
        error = self.metric(raw_preds.squeeze(1), targets)
        return loss, error

    def training_step(self, batch, batch_idx):
        loss, error = self._loss_and_error(batch)
        current_lr = self.optimizer.param_groups[0]['lr']
        self.log_dict(
            {'train_loss': loss, 'train_error': error, 'lr': current_lr},
            on_step=False, on_epoch=True, prog_bar=True, logger=True,
        )
        return loss

    def validation_step(self, batch, batch_idx):
        loss, error = self._loss_and_error(batch)
        self.log_dict(
            {'val_loss': loss, 'val_error': error},
            on_step=False, on_epoch=True, prog_bar=True, logger=True,
        )
        return loss

    def predict_step(self, batch, batch_idx):
        # Prediction batches carry no targets; return the raw model output.
        token_ids, attention_mask = batch[0], batch[1]
        return self.model(token_ids, attention_mask)
        

In [None]:
# Debug mode: keep only the first 200 rows for a quick end-to-end run.
if CFG.debug:
    train_data = train_data.iloc[:200]

# Stratified train/validation split on the score values.
# NOTE: `train_data` is rebound to the training split below, so re-running this
# cell without first re-running the merge cell splits the already-split data again.
scores = train_data['score'].values
features = train_data.drop(columns='score')
train_data, val_data, train_labels, val_labels = train_test_split(
    features,
    scores,
    stratify=scores,
    test_size=CFG.val_size,
    random_state=CFG.seed,
)
# Put the labels back so each split frame carries its own `score` column.
train_data['score'] = train_labels
val_data['score'] = val_labels

tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
train_dataset = PhraseSimilarityDataset(train_data, tokenizer)
val_dataset = PhraseSimilarityDataset(val_data, tokenizer)
test_dataset = PhraseSimilarityTestset(test_data, tokenizer)

# Only the training loader shuffles; validation and test keep row order.
loader_kwargs = dict(batch_size=CFG.batch_size)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [None]:
# Store the number of training batches per epoch on the config and display it.
CFG.steps_per_epoch = len(train_dataloader)
CFG.steps_per_epoch

In [None]:
# CSV logs go to ./<last component of the model path>_log/.
logger = CSVLogger(save_dir='./', name=CFG.model_name.split('/')[-1]+'_log')
# Log only the user-defined settings: CFG.__dict__ also contains class internals
# (__module__, __dict__, __weakref__, __doc__) that are not hyper-parameters.
hparams = {key: value for key, value in vars(CFG).items() if not key.startswith('__')}
logger.log_hyperparams(hparams)

# Keep the best checkpoint by the monitored metric plus the last one (weights only).
# The filename placeholders must name metrics that are actually logged: the
# validation step logs `val_loss` and `val_error`.
checkpoint_callback = ModelCheckpoint(monitor=CFG.monitor,
                                      save_top_k=1,
                                      save_last=True,
                                      save_weights_only=True,
                                      filename='{epoch:02d}-{val_loss:.4f}-{val_error:.4f}',
                                      verbose=False,
                                      mode='min')
# Stop when the monitored metric has not improved for CFG.patience checks.
early_stop_callback = EarlyStopping(monitor=CFG.monitor, 
                                    patience=CFG.patience, 
                                    verbose=False, 
                                    mode="min")

trainer = Trainer(
    max_epochs=CFG.epochs,
    gpus=[0],
    accumulate_grad_batches=CFG.accumulate,
    callbacks=[checkpoint_callback, early_stop_callback], 
    logger=logger,
    weights_summary='top',
)

In [None]:
# Seed the Python, NumPy and torch RNGs before any new layer is initialised so
# that head initialisation, batch shuffling and dropout are repeatable across runs.
seed_everything(CFG.seed)

model = PhraseSimilarityModelImpl(CFG.model_name)
# Huber loss is optimised; mean squared error is tracked as the reported error.
criterion = nn.HuberLoss(reduction='mean', delta=1.0)
metric = MeanSquaredError()
driver = PhraseSimilarityModel(model, criterion, metric)

trainer.fit(driver, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

In [None]:
def _epoch_values(frame, column):
    """Return `column` of `frame` with NaN rows dropped and a fresh 0..n-1 index."""
    return frame[column].dropna().reset_index(drop=True)


def _save_curve_plot(curves, ylabel, legend_loc, out_path):
    """Draw the given curves on one new figure and save it to `out_path`.

    Args:
        curves: iterable of ``(series, color, marker, label)`` tuples.
        ylabel: label of the y axis (the x axis is always 'Epoch').
        legend_loc: matplotlib legend location string.
        out_path: file the figure is written to.

    Returns:
        The created matplotlib Figure.
    """
    fig = plt.figure(figsize=(7, 6))
    plt.grid(True)
    for series, color, marker, label in curves:
        plt.plot(series, color=color, marker=marker, label=label)
    plt.ylabel(ylabel, fontsize=24)
    plt.xlabel('Epoch', fontsize=24)
    plt.legend(loc=legend_loc, fontsize=18)
    plt.savefig(out_path)
    return fig


log_dir = trainer.logger.log_dir
metrics = pd.read_csv(f'{log_dir}/metrics.csv')

# Error curves (the series and the output file keep their historical "acc" names).
train_acc = _epoch_values(metrics, 'train_error')
valid_acc = _epoch_values(metrics, 'val_error')
fig = _save_curve_plot(
    [(train_acc, "r", "o", 'train/error'), (valid_acc, "b", "x", 'valid/error')],
    ylabel='Error', legend_loc='lower right', out_path=f'{log_dir}/acc.png',
)

# Loss curves.
train_loss = _epoch_values(metrics, 'train_loss')
valid_loss = _epoch_values(metrics, 'val_loss')
fig = _save_curve_plot(
    [(train_loss, "r", "o", 'train/loss'), (valid_loss, "b", "x", 'valid/loss')],
    ylabel='Loss', legend_loc='upper right', out_path=f'{log_dir}/loss.png',
)

# Learning rate as logged by the training step.
lr = _epoch_values(metrics, 'lr')
fig = _save_curve_plot(
    [(lr, "g", "o", 'learning rate')],
    ylabel='LR', legend_loc='upper right', out_path=f'{log_dir}/lr.png',
)

In [None]:
# Run prediction over the test loader. No model is passed explicitly, so the
# Trainer chooses the weights itself.
# NOTE(review): presumably the model/checkpoint from the preceding fit call -- confirm
# for the installed Lightning version.
predictions = trainer.predict(dataloaders=test_dataloader)

In [None]:
# Flatten the per-batch prediction tensors into a single list of Python floats,
# dropping dimension 1 of each batch first.
preds = [value for batch in predictions for value in batch.squeeze(1).tolist()]

# Scores are assigned by position, so the sample-submission rows are assumed to
# be in the same order as the test loader's rows.
submission_csv = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/sample_submission.csv')
submission_csv['score'] = preds
submission_csv.head()

In [None]:
# Write the submission file to the working directory without the index column.
submission_csv.to_csv('submission.csv', index=False)