In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

from scipy.stats import pearsonr
from sklearn.model_selection import StratifiedKFold
from transformers import AutoConfig, AutoTokenizer, AutoModel

In [None]:
%%time
# Read the training pairs and the CPC code/title lookup table from the Kaggle input directory.
train_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv")
title_df = pd.read_csv("../input/cpc-codes/titles.csv")

# Attach the title_df row whose `code` equals each pair's `context`.
# The inner join keeps only rows whose context has a matching code.
train_df = train_df.merge(title_df, how='inner', left_on='context', right_on='code')
# Model input text: anchor, target and context title joined with the literal "[SEP]" string.
train_df['text'] = train_df['anchor'] + '[SEP]' + train_df['target'] + '[SEP]' + train_df['title']
train_df.head()


# Generate auxiliary data

In [None]:
# Contexts (3-character CPC codes) that occur in the training data.
all_train_contexts = train_df.context.unique()

# code -> title lookup for every CPC code; on duplicate codes the last row wins,
# exactly as with the previous row-by-row loop.
context_map = dict(zip(title_df['code'], title_df['title']))

# First three characters of every code; this becomes the `context` column.
code_prefix = title_df['code'].str[:3]

# Keep codes that (a) have a full 3-character prefix, (b) are strictly longer
# than that prefix, and (c) whose prefix is a context seen in training.
keep_mask = (
    (code_prefix.str.len() == 3)
    & (title_df['code'] != code_prefix)
    & (code_prefix.isin(all_train_contexts))
)

title_group_df = (
    title_df.loc[keep_mask, ['code', 'title']]
    .rename(columns={'title': 'pos_title'})   # the kept code's own title is the positive
    .assign(context=code_prefix[keep_mask])
)
# Title of the 3-character context itself; a missing key fails loudly with KeyError.
title_group_df['title'] = title_group_df['context'].apply(lambda k: context_map[k])
title_group_df['section'] = title_group_df['code'].str[0]
# Cap every context at its first 1000 rows (original row order is preserved).
title_group_df = title_group_df.groupby('context').head(1000)

title_group_df.head()

In [None]:
# context -> list of positive titles belonging to it, collected in row order.
all_context_map = {}

for ctx, pos_title in zip(title_group_df['context'], title_group_df['pos_title']):
    all_context_map.setdefault(ctx, []).append(pos_title)

print(len(all_context_map))

In [None]:
def get_fold_map(row):
    """Build the stratification label "<section>-<score>" for one row.

    `row` must expose `.section` (a string) and `.score`; the score is
    converted with `str()` before being joined to the section.
    """
    return "-".join((row.section, str(row.score)))

# One "<section>-<score>" label per row; the fold loop stratifies on this column.
train_df['fold_group'] = train_df.apply(get_fold_map, axis=1)
train_df.head()

# Config

In [None]:
class CFG:
    """Hyper-parameters read by the datasets, the training loop and the fold driver."""

    # --- model / tokenisation ---
    model_name = "microsoft/deberta-v3-large"   # checkpoint for tokenizer, config and backbone
    max_len = 200                               # tokenizer max_length (inputs are padded to this)

    # --- data splitting / batching ---
    nfolds = 5                                  # n_splits for StratifiedKFold
    batch_size = 8                              # used by every DataLoader
    ACC_STEPS = 2                               # micro-batches accumulated per optimizer step

    # --- optimisation ---
    n_epochs = 1
    max_lr = 5e-6                               # initial AdamW learning rate
    min_lr = 1e-6                               # eta_min of the cosine schedule
    weight_decay = 0.01

    # --- logging / validation cadence (in training iterations) ---
    print_every = 800
    eval_every = 800

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

# tokenizer

In [None]:
# Load the tokenizer and the model configuration for the checkpoint named in CFG.model_name.
# `tokenizer` is read as a global by both dataset classes; `model_config.hidden_size`
# sizes the first linear layer of the prediction heads in `Model`.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
model_config = AutoConfig.from_pretrained(CFG.model_name)

In [None]:
# Show the tokenizer and the model config, separated by one blank line.
print(tokenizer, model_config, sep="\n\n")

# Dataset

In [None]:
# Preview the first two auxiliary rows before wrapping them in a Dataset.
title_group_df.iloc[:2]


In [None]:
class TitleGroupDataset(torch.utils.data.Dataset):
    """Auxiliary dataset yielding one (positive, negative) tokenized pair per row.

    Each row of `df` provides a context `title` and a `pos_title` from the same
    context. The negative title is drawn at random from a *different* context
    via the module-level `all_context_map` (context -> list of titles).

    Relies on the module-level `tokenizer`, `CFG` and `all_context_map`.
    """

    def __init__(self, df):
        self.df = df
        # Sorted so that the candidate order, and therefore seeded sampling,
        # does not depend on set iteration order.
        self.all_context = sorted(set(df.context.values))

    def prepare_inputs(self, text):
        """Tokenize `text`, padded/truncated to CFG.max_len, as a dict of long tensors."""
        inputs = tokenizer(text, add_special_tokens=True, max_length=CFG.max_len, padding='max_length', truncation=True)
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    def _sample_negative_context(self, context):
        """Return a uniformly random context that is guaranteed to differ from `context`.

        Raises:
            ValueError: if the dataset holds no context other than `context`,
                because no valid negative can be formed in that case.
        """
        candidates = [c for c in self.all_context if c != context]
        if not candidates:
            raise ValueError(
                "cannot sample a negative context: no context other than {!r} is available".format(context)
            )
        return candidates[np.random.randint(len(candidates))]

    def __getitem__(self, idx):
        """Return `(xpos, xneg)`: tokenized "title[SEP]pos_title" and "title[SEP]neg_title"."""
        row = self.df.iloc[idx]

        title = row.title
        pos_title = row.pos_title

        # Previously two contexts were drawn with replacement, so both could
        # equal the row's own context and the "negative" was really a positive.
        neg_context = self._sample_negative_context(row.context)
        neg_title = np.random.choice(all_context_map[neg_context])

        pos_text = title + "[SEP]" + pos_title
        neg_text = title + "[SEP]" + neg_title

        xpos = self.prepare_inputs(pos_text)
        xneg = self.prepare_inputs(neg_text)

        return (xpos, xneg)

    def __len__(self):
        return len(self.df)

In [None]:
class PatentDataset(torch.utils.data.Dataset):
    """Phrase-pair dataset returning `(tokenized_text, score)` items.

    With `phase == 'train'` the target score is perturbed by uniform noise in
    [-0.025, 0.025] and clipped back into [0, 1]; any other phase returns the
    stored score unchanged. Tokenization uses the module-level `tokenizer`
    and `CFG.max_len`.
    """

    def __init__(self, df, phase='train'):
        self.phase = phase
        # Keep plain arrays so item access does not go through pandas indexing.
        self.context = df.context.values
        self.text = df.text.values
        self.score = df.score.values

    def __len__(self):
        return len(self.text)

    def prepare_inputs(self, text):
        """Tokenize `text` to a fixed length and convert every field to a long tensor."""
        encoded = tokenizer(text, add_special_tokens=True, max_length=CFG.max_len, padding='max_length', truncation=True)
        for field, values in list(encoded.items()):
            encoded[field] = torch.tensor(values, dtype=torch.long)
        return encoded

    def __getitem__(self, idx):
        target = self.score[idx]

        if self.phase == 'train':
            # Small label jitter, applied only while training.
            jitter = np.random.uniform(-0.025, 0.025)
            target = np.clip(target + jitter, 0.0, 1.0)

        encoded = self.prepare_inputs(self.text[idx])
        return (encoded, torch.tensor(target, dtype=torch.float32))

# Model

In [None]:
class BackboneModel(nn.Module):
    """Wraps the pretrained transformer named by CFG.model_name.

    `forward` unpacks the tokenized `inputs` dict into the transformer and
    returns the hidden state at sequence position 0 for every batch item.
    """

    def __init__(self):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(CFG.model_name)

    def forward(self, inputs):
        hidden_states = self.backbone(**inputs).last_hidden_state
        # Keep only the first token's vector of each sequence.
        return hidden_states[:, 0, :]

class Model(nn.Module):
    """Shared backbone with two structurally identical heads.

    `mlp` is the head used by `forward`; `auxilary_mlp` is the head used by
    `get_auxilary_output`. Both map the backbone vector to a single logit.
    Attribute names and layer order are kept unchanged so existing
    checkpoints keep their parameter keys.
    """

    @staticmethod
    def _make_head(in_features):
        """Dropout -> Linear(in, 256) -> LeakyReLU -> BatchNorm -> Dropout -> Linear(256, 1)."""
        return nn.Sequential(
            nn.Dropout(0.15),
            nn.Linear(in_features, 256),
            nn.LeakyReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.1),
            nn.Linear(256, 1),
        )

    def __init__(self):
        super().__init__()
        self.backbone = BackboneModel()
        hidden_size = model_config.hidden_size
        # Auxiliary head is built first, then the main head (same order as before).
        self.auxilary_mlp = self._make_head(hidden_size)
        self.mlp = self._make_head(hidden_size)

    def get_auxilary_output(self, inputs):
        """Logits from the auxiliary head, shape (batch, 1)."""
        return self.auxilary_mlp(self.backbone(inputs))

    def forward(self, inputs):
        """Logits from the main head, shape (batch, 1)."""
        return self.mlp(self.backbone(inputs))

# loss

In [None]:
def compute_loss(y, yhat):
    """Mean binary cross-entropy between soft targets `y` and raw logits `yhat`.

    Args:
        y: target tensor with values in [0, 1]; broadcast against `yhat`.
        yhat: logits (pre-sigmoid scores).

    Returns:
        Scalar tensor holding the mean loss.

    The loss is computed with `logsigmoid`, using
    log(sigmoid(x)) = logsigmoid(x) and log(1 - sigmoid(x)) = logsigmoid(-x).
    The earlier form `log(1e-9 + 1 - sigmoid(x))` saturates in float32 for
    large |x|: the loss is capped near 20.7 and its gradient becomes zero, so
    confidently wrong predictions stop being corrected. For moderate logits
    the two forms agree up to the removed 1e-9 epsilon.
    """
    log_p = nn.functional.logsigmoid(yhat)       # log P(label = 1)
    log_not_p = nn.functional.logsigmoid(-yhat)  # log P(label = 0)
    loss = -(y * log_p + (1 - y) * log_not_p)
    return loss.mean()

# evaluate

In [None]:
def evaluate(val_dataloader, model):
    """Return the Pearson correlation between labels and sigmoid(model logits).

    Puts `model` into eval mode and leaves it there. Every batch is trimmed
    to its longest non-padded sequence before being moved to `device`.
    """
    print("evaluating...")
    targets, predictions = [], []

    model.eval()
    for batch, labels in val_dataloader:
        # Longest real (non-padding) sequence in this batch.
        longest = batch['attention_mask'].sum(dim=-1).max()
        for field in list(batch.keys()):
            batch[field] = batch[field][:, :longest].to(device)

        targets.extend(labels.tolist())
        with torch.no_grad():
            logits = model(batch)
        predictions.extend(logits.view(-1).sigmoid().cpu().tolist())

    corr, _ = pearsonr(targets, predictions)
    return corr

# train epoch

In [None]:
def _aux_side_backward(model, samples, w_pos, w_neg, lam_, num_iters):
    """Run one side (positive or negative) of an auxiliary batch and backprop it.

    The loss is a label-smoothed binary cross-entropy on sigmoid(model(samples)):
    `w_pos` weights the log-probability of class 1 and `w_neg` that of class 0.
    It is scaled by `lam_` and divided by 2, `num_iters` and CFG.ACC_STEPS.
    Returns the scaled loss as a Python float.
    """
    # Trim padding down to the longest real sequence in the batch.
    longest = samples['attention_mask'].sum(dim=-1).max()
    for key, value in list(samples.items()):
        samples[key] = value[:, :longest].to(device)

    logits = model(samples).view(-1)
    probs = logits.sigmoid()
    loss = -w_pos * torch.log(1e-9 + probs) - w_neg * torch.log(1e-9 + 1 - probs)
    loss = lam_ * loss.mean()
    loss = loss / 2 / num_iters / CFG.ACC_STEPS
    loss.backward()
    return loss.item()


def get_group_train_ops(lam_, group_iterator, model):
    """Accumulate gradients from up to three auxiliary (positive, negative) batches.

    Args:
        lam_: weight applied to the auxiliary loss.
        group_iterator: iterator yielding `(pos_samples, neg_samples)` dicts of
            tokenized tensors (as produced by a TitleGroupDataset loader).
        model: module called as `model(samples)` to obtain one logit per item.

    Returns:
        Sum of the scaled auxiliary losses that were backpropagated (0.0 if the
        iterator was already exhausted).

    Only gradients are accumulated here; the optimizer step is left to the
    caller. Positives are trained toward 0.9 and negatives toward 0.1.

    NOTE(review): this calls `model(...)`, not `model.get_auxilary_output(...)`.
    If `model` is a `Model`, the auxiliary loss therefore goes through the main
    head. Confirm that this is intended.
    """
    model.train()
    loss2 = 0.0
    num_iters = 3
    for _ in range(num_iters):
        try:
            pos_samples, neg_samples = next(group_iterator)
        except StopIteration:
            # Auxiliary data is used up for this epoch; nothing more to add.
            # Any other error (OOM, shape mismatch, ...) now propagates instead
            # of being silently swallowed.
            break

        pos_value = _aux_side_backward(model, pos_samples, 0.9, 0.1, lam_, num_iters)
        neg_value = _aux_side_backward(model, neg_samples, 0.1, 0.9, lam_, num_iters)
        loss2 += (pos_value + neg_value)
    return loss2

In [None]:
def train_epoch(fold_num, train_dataloader,val_dataloader,  optimizer, schedular, model):
    """Train `model` for CFG.n_epochs epochs, validating and checkpointing along the way.

    Args:
        fold_num: fold index, used only in the checkpoint file name "model_{fold_num}.pt".
        train_dataloader: yields `(inputs, label)` training batches.
        val_dataloader: passed to `evaluate` for validation.
        optimizer: stepped once every CFG.ACC_STEPS batches (and on the last batch).
        schedular: learning-rate scheduler, stepped together with the optimizer.
        model: module trained in place.

    Returns:
        None. Side effects: saves the best model seen so far (by validation
        Pearson correlation) with `torch.save`, prints progress, and plots the
        loss and evaluation curves.
    """
    # These histories are created once, so the means printed below are running
    # means over all epochs so far, not per-epoch means.
    epoch_loss=[]
    epoch_aux_loss=[]
    evals = []
    
    best_eval = None
    num_iterations = len(train_dataloader)
    
    for e in range(CFG.n_epochs):
        # Fresh, reshuffled auxiliary loader each epoch; batches are pulled from
        # its iterator inside get_group_train_ops.
        group_dataset = TitleGroupDataset(title_group_df)
        group_loader = torch.utils.data.DataLoader(group_dataset,
                                                   batch_size=CFG.batch_size,
                                                   shuffle=True,
                                                   drop_last=True)
        group_iterator = iter(group_loader)
        print("number of Group Iterations:", len(group_loader))
        # Auxiliary-loss weight: 0.1 in epoch 0, 0.5 in epochs 1-2, 0.1 afterwards.
        if e==0:
            lam_ = 0.1
        elif e<=2:
            lam_ = 0.5
        else:
            lam_ = 0.1
        
        model.zero_grad(set_to_none=True)
        for it, (inputs, label) in enumerate(train_dataloader):
            # evaluate() switches the model to eval mode, so re-enable training every step.
            model.train()
            
            # Trim padding to the longest real sequence in the batch before moving to device.
            batch_max_seqlen = inputs['attention_mask'].sum(dim=-1).max()
            for k,v in inputs.items():
                v = v[:, :batch_max_seqlen]
                inputs[k] = v.to(device)
            label = label.to(device)

            yhat = model(inputs)
            yhat = yhat.view(-1)
            
            
            # Main loss, scaled for gradient accumulation.
            loss1 = compute_loss(label, yhat)
            loss1 = loss1 / CFG.ACC_STEPS
            loss1.backward()
            
            # Auxiliary gradients are accumulated into the same optimizer step.
            loss2 = get_group_train_ops(lam_, group_iterator, model)
            epoch_aux_loss.append(loss2)
            
            # Step after every ACC_STEPS batches, and on the final batch so that
            # leftover gradients are not dropped.
            if (1+it)%CFG.ACC_STEPS==0  or (it == num_iterations-1):
                optimizer.step()
                schedular.step()
                model.zero_grad(set_to_none=True)
            
            # NOTE(review): the recorded value is already divided by CFG.ACC_STEPS.
            epoch_loss.append(loss1.item())
            if it%CFG.print_every == 0:
                print("iteration:{} | loss:{:.4f} | aux loss:{:.4f}".format(it, np.mean(epoch_loss), np.mean(epoch_aux_loss)))

            # Periodic validation; keep the checkpoint with the best correlation.
            if (1+it)%CFG.eval_every == 0:
                cur_eval = evaluate(val_dataloader, model)
                evals.append(cur_eval)
                if (best_eval is None) or (cur_eval > best_eval):
                    best_eval = cur_eval
                    # Saves the whole module object, not just its state_dict.
                    torch.save(model, "model_{}.pt".format(fold_num))
                print("eval:{:.4f} | best eval:{:.4f}".format(cur_eval, best_eval))
            
        # End-of-epoch validation with the same best-checkpoint rule.
        cur_eval = evaluate(val_dataloader, model)
        evals.append(cur_eval)
        if (best_eval is None) or (cur_eval > best_eval):
            best_eval = cur_eval
            torch.save(model, "model_{}.pt".format(fold_num))
        
        print("eval:{:.4f} | best eval:{:.4f}".format(cur_eval, best_eval))
        print()
        print("-------------------")
        print("epoch:{} | loss:{:.4f} | auxloss:{:.4f}".format(e, np.mean(epoch_loss), np.mean(epoch_aux_loss)))
    
    # Plot the per-iteration histories collected above.
    print()
    print()
    plt.title("epoch losses....")
    plt.plot(epoch_loss)
    plt.show()
    print()
    
    plt.title("epoch aux losses....")
    plt.plot(epoch_aux_loss)
    plt.show()
    print()
    
    print()
    plt.title("evals")
    plt.plot(evals)
    plt.show()

In [None]:
# Stratified K-fold split on the "<section>-<score>" label built earlier.
skfold = StratifiedKFold(n_splits=CFG.nfolds, random_state=44, shuffle=True)

for foldnum, (train_idx, val_idx) in enumerate(skfold.split(train_df, train_df.fold_group)):
    fold_train_df = train_df.iloc[train_idx]
    fold_val_df = train_df.iloc[val_idx]
    
    # Label jitter is applied only by the 'train' phase dataset.
    train_dataset = PatentDataset( fold_train_df, phase='train' )
    val_dataset   = PatentDataset( fold_val_df , phase='val')
    
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=CFG.batch_size,
                                                   shuffle=True, 
                                                   drop_last=False,
                                                   pin_memory=True
                                                  )
    
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=CFG.batch_size,
                                                 shuffle=False, 
                                                 drop_last=False)

    num_train_batches = len(train_dataloader)
    print("number of train batches:", len(train_dataloader))
    print("number of val batches:", len(val_dataloader))
    print()
    print("-----------------------------------------------")
    
    # Training starts from a previously saved model instead of a fresh Model():
    #model = Model().to(device)
    # NOTE(review): torch.load here unpickles a complete model object. Pickle can
    # execute arbitrary code, so only load files from a trusted source.
    # Presumably the pickled classes must be defined in this session for the load
    # to work, and newer PyTorch releases may require weights_only=False for a
    # full-module pickle. TODO: confirm against the installed torch version.
    model  = torch.load("../input/uspatentdebertasectiongroupmodel/model_0.pt", map_location=device)
    optimizer = torch.optim.AdamW(model.parameters(), weight_decay=CFG.weight_decay, lr=CFG.max_lr)
    # Cosine decay from CFG.max_lr to CFG.min_lr. T_max is the number of optimizer
    # steps (batches * epochs // ACC_STEPS) plus 2.
    schedular = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                           T_max = 2 + (num_train_batches * CFG.n_epochs)//CFG.ACC_STEPS,
                                                           eta_min  = CFG.min_lr)
    
    train_epoch(foldnum, train_dataloader, val_dataloader, optimizer, schedular, model)
    # Only the first fold is trained in this notebook.
    break