In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup
from sklearn.model_selection import StratifiedKFold
from itertools import chain

In [None]:
class config:
    # Central place for every tunable value used by the dataset, model and training loop.

    # Use the GPU when one is visible to PyTorch; otherwise fall back to the CPU so the
    # notebook still runs (slowly) instead of failing at `model.to(device)`.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Hugging Face checkpoint name; used for the tokenizer here and for the backbone in the model.
    model = "anferico/bert-for-patents"
    # Loaded once at class-definition time and shared by every dataset instance.
    tokenizer = AutoTokenizer.from_pretrained(model)
    # Token length each encoded pair is padded / truncated to.
    max_len = 128
    # Number of cross-validation folds.
    folds = 5
    train_batch_size = 16
    valid_batch_size = 16
    epochs = 6
    # Learning rate handed to the optimizer.
    lr = 2e-5


# Dataset

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes (text, target) pairs on demand.

    Args:
        text: Indexable collection of first-sequence strings.
        targets: Indexable collection of second-sequence strings, aligned with ``text``.
        score: Indexable collection of numeric labels, aligned with ``text``.

    Each item is a dict of tensors with keys ``ids``, ``mask``,
    ``token_type_ids`` (all ``torch.long``) and ``score`` (``torch.float``).
    Tokenization uses the module-level ``config.tokenizer`` and ``config.max_len``.
    """

    def __init__(self, text, targets, score):
        self.text, self.targets, self.score = text, targets, score

    def __len__(self):
        # The dataset length is driven by the first-sequence collection.
        return len(self.text)

    def __getitem__(self, item):
        first_seq = self.text[item]
        second_seq = self.targets[item]
        label = self.score[item]

        # Both sequences are encoded together and padded / truncated to a fixed length.
        encoding = config.tokenizer.encode_plus(
            first_seq,
            second_seq,
            padding="max_length",
            max_length=config.max_len,
            truncation=True,
        )

        # Output key -> key in the tokenizer's encoding.
        key_pairs = (
            ("ids", "input_ids"),
            ("mask", "attention_mask"),
            ("token_type_ids", "token_type_ids"),
        )
        sample = {
            out_key: torch.tensor(encoding[enc_key], dtype=torch.long)
            for out_key, enc_key in key_pairs
        }
        sample["score"] = torch.tensor(label, dtype=torch.float)
        return sample

# Engine

In [None]:
def train_fn(data_loader, model, optimizer, device, scheduler):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        data_loader: Iterable of batches; each batch is a dict with tensor
            entries ``ids``, ``mask``, ``token_type_ids`` and ``score``.
        model: Module called as ``model(ids=..., mask=..., token_type_ids=...)``
            that returns logits compatible with a ``(-1, 1)``-shaped target.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch, after the optimizer.

    Returns:
        None. The mean per-batch BCE-with-logits loss is printed.
    """
    # The loss module holds no state, so a single instance serves every batch.
    criterion = nn.BCEWithLogitsLoss()
    input_keys = ('ids', 'mask', 'token_type_ids')

    model.train()
    batch_losses = []
    for batch in data_loader:
        inputs = {key: batch[key].to(device, dtype=torch.long) for key in input_keys}
        target = batch['score'].to(device, dtype=torch.float)

        optimizer.zero_grad()
        logits = model(**inputs)

        # Targets are reshaped to a column so they line up with the logits.
        loss = criterion(logits, target.view(-1, 1))
        loss.backward()

        optimizer.step()
        scheduler.step()

        batch_losses.append(loss.item())
    print(f'train BCE loss is {sum(batch_losses, 0.0)/len(data_loader)}')
        
def valid_fn(data_loader, model, device):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        data_loader: Iterable of batches; each batch is a dict with tensor
            entries ``ids``, ``mask``, ``token_type_ids`` and ``score``.
        model: Module called as ``model(ids=..., mask=..., token_type_ids=...)``
            that returns logits compatible with a ``(-1, 1)``-shaped target.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(final_outputs, final_score)``. ``final_outputs`` holds the
        sigmoid of the logits, one nested list per sample (the result of
        ``tolist()`` on the model output). ``final_score`` is the flat list of
        labels in the same order.

    The mean per-batch BCE-with-logits loss is printed.
    """
    # Stateless loss module: build it once instead of once per batch.
    criterion = nn.BCEWithLogitsLoss()

    model.eval()
    val_loss = 0.0
    final_score = []
    final_outputs = []
    with torch.no_grad():
        for data in data_loader:
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            score = data['score'].to(device, dtype=torch.float)

            outputs = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids,
            )

            # Accumulate a plain Python float, as train_fn does, rather than
            # letting the running total silently become a device tensor.
            val_loss += criterion(outputs, score.view(-1, 1)).item()

            final_score.extend(score.detach().cpu().numpy().tolist())
            # Predictions are reported as probabilities, not raw logits.
            final_outputs.extend(torch.sigmoid(outputs).detach().cpu().numpy().tolist())

    print(f"valid BCE loss : {val_loss/len(data_loader)}")
    return final_outputs, final_score
        

# Model

In [None]:
class PhraseModel(nn.Module):
    """Pretrained transformer backbone followed by dropout and a one-unit linear head.

    The backbone checkpoint is taken from ``config.model``. ``forward`` returns
    raw logits; the training and validation functions apply the sigmoid / BCE
    loss themselves.
    """

    def __init__(self):
        super().__init__()

        model_config = AutoConfig.from_pretrained(config.model)  ## credits https://www.kaggle.com/code/abhishek/tez-training-phrase-matching
        model_config.update(
            {
                "output_hidden_states": True,
                "add_pooling_layer": True,
                "num_labels": 1,
            }
        )
        self.transformer = AutoModel.from_pretrained(config.model, config=model_config)
        # The head reuses the backbone's own dropout rate and hidden width.
        self.dropout = nn.Dropout(model_config.hidden_dropout_prob)
        self.output = nn.Linear(model_config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        """Return one logit per sample.

        Args:
            ids: Token ids, forwarded to the backbone as ``input_ids``.
            mask: Attention mask, forwarded as ``attention_mask``.
            token_type_ids: Segment ids, forwarded as ``token_type_ids``.

        Returns:
            Output of the linear head applied to the backbone's ``pooler_output``.
        """
        # Bind by keyword so each tensor reaches the intended argument regardless
        # of the positional parameter order of the backbone's forward().
        transformer_out = self.transformer(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
        )
        pooled = self.dropout(transformer_out.pooler_output)
        return self.output(pooled)

# Manage the CSV file and folds

In [None]:
# First character of a `context` code -> descriptive title used in the model input text.
context_mapping = {
    "A": "Human Necessities",
    "B": "Operations and Transport",
    "C": "Chemistry and Metallurgy",
    "D": "Textiles",
    "E": "Fixed Constructions",
    "F": "Mechanical Engineering",
    "G": "Physics",
    "H": "Electricity",
    "Y": "Emerging Cross-Sectional Technologies",
}


def _context_title(code):
    # Only the leading character of the code selects the title; an unknown one raises KeyError.
    return context_mapping[code[0]]


df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv")

# The model's first sequence is "<context title> <anchor>".
df["context"] = df["context"].apply(_context_title)
df["text"] = df["context"] + " " + df["anchor"]

# Both source columns are now folded into `text`, so they are dropped.
df = df.drop(columns=["context", "anchor"])

## folds from https://www.kaggle.com/code/abhishek/phrase-matching-folds

# -1 marks rows that have not been assigned to a fold yet.
df['kfold'] = -1

# Shuffle with a fixed seed: without it the row order (and therefore which rows land
# in which fold) changes on every run, even though StratifiedKFold below is seeded.
# The index is reset so that the positional indices returned by `kf.split` can be
# used as labels in `df.loc` further down.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# bin targets: the continuous score is cut into 5 equal-width bins so it can be
# used as a stratification label.
df["bins"] = pd.cut(df["score"], bins=5, labels=False)

kf = StratifiedKFold(n_splits=config.folds, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(kf.split(X=df, y=df.bins.values)):
    print(len(train_idx), len(val_idx))
    # Each row is tagged with the fold in which it serves as validation data.
    df.loc[val_idx, 'kfold'] = fold

# The bins were only needed for stratification.
df = df.drop("bins", axis=1)


In [None]:
# Display the first rows of the prepared frame as a quick sanity check.
df.head()

## Main

In [None]:
for fold in range(config.folds):

    # Every fold starts from a freshly constructed model.
    model = PhraseModel()
    model.to(config.device)

    # Rows tagged with the current fold id are held out for validation; the rest train.
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    df_train = df_train.drop(columns='kfold')
    df_valid = df_valid.drop(columns='kfold')

    train_dataset = CustomDataset(text=df_train.text.values, targets=df_train.target.values, score=df_train.score.values)
    train_loader = DataLoader(train_dataset, batch_size=config.train_batch_size, shuffle=True)

    valid_dataset = CustomDataset(text=df_valid.text.values, targets=df_valid.target.values, score=df_valid.score.values)
    valid_loader = DataLoader(valid_dataset, batch_size=config.valid_batch_size, shuffle=False)

    # Weight decay is applied to every parameter except those whose name contains
    # one of the `no_decay` substrings.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01,
        },
        {
            "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    # train_fn steps the scheduler once per batch, so the schedule length must be the
    # real number of batches (the loader keeps the final partial batch) times the
    # number of epochs. Deriving it from len(df_train) / batch_size undercounts and
    # leaves the last optimizer steps running at a learning rate of zero.
    num_train_steps = len(train_loader) * config.epochs

    optimizer = torch.optim.AdamW(optimizer_parameters, lr=config.lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps,
    )

    # Lower than any attainable Pearson correlation, so the first finite score always wins.
    best_pearson = float("-inf")

    for epoch in range(config.epochs):
        train_fn(train_loader, model, optimizer, config.device, scheduler)
        outputs, labels = valid_fn(valid_loader, model, config.device)

        # valid_fn returns one nested list per sample; flatten before correlating.
        pearson_score = np.corrcoef(list(chain.from_iterable(outputs)), labels)[0][1]

        print(f"pearson Score = {pearson_score}")

        # A checkpoint is written each time the validation correlation improves.
        if pearson_score > best_pearson:
            torch.save(model.state_dict(), f'model-epoch{epoch}-fold-{fold}.pth')
            best_pearson = pearson_score

    # Only the first fold is trained; remove this `break` to run every fold.
    break

In [None]:
# Flatten the validation predictions to a plain list of floats. Going through
# `np.asarray(...).ravel()` accepts both the nested per-sample lists produced by the
# training cell and an already-flat list, so this cell can be re-run safely
# (re-flattening with `chain.from_iterable` would raise on the second run).
outputs = np.asarray(outputs, dtype=float).ravel().tolist()
pearson_score = np.corrcoef(outputs, labels)[0][1]

print(pearson_score)