In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import pickle
import gc

import torch
import torch.nn as nn

from scipy.stats import pearsonr
from sklearn.model_selection import KFold, GroupKFold
from transformers import AutoConfig, AutoTokenizer, AutoModel

In [None]:
%%time
def read_pickle(filepath):
    """Return the object stored in the pickle file at ``filepath``.

    The file is opened in binary mode and closed again before returning.
    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(filepath, 'rb') as handle:
        return pickle.load(handle)


# Three pickled lookup tables from the unlabeled-titles dataset.
# The keys of cls_map are enumerated further down to build context2id.
cls_map, main_group_map, subcls_map = (
    read_pickle("../input/upppmunlabeled-dataset/cls_map.pkl"),
    read_pickle("../input/upppmunlabeled-dataset/main_group_map.pkl"),
    read_pickle("../input/upppmunlabeled-dataset/subcls_map.pkl"),
)

# Discard the entry stored under the empty-string key (KeyError if absent).
main_group_map.pop('')

unlabel_df = pd.read_csv("../input/upppmunlabeled-dataset/unlabled_titles.csv")
unlabel_df.head()

In [None]:
%%time
test_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")
title_df = pd.read_csv("../input/cpc-codes/titles.csv")

# Attach the title of each row's context code.
# A left join keeps every test row; an inner join would silently drop rows whose
# context has no title and the final submission would then be missing those ids.
# validate='many_to_one' raises if a code appears twice in titles.csv, which would
# otherwise duplicate test rows.
test_df = test_df.merge(
    title_df, how='left', left_on='context', right_on='code', validate='many_to_one'
)

# Fail loudly instead of predicting on rows with no title.
unmatched_contexts = test_df.loc[test_df['code'].isna(), 'context'].unique()
assert len(unmatched_contexts) == 0, (
    f"contexts missing from titles.csv: {list(unmatched_contexts)[:10]}"
)

test_df['text'] = test_df['anchor'] + '[SEP]' + test_df['target'] + '[SEP]' + test_df['title']

test_df.head()

In [None]:
%%time
# Map each code in titles.csv to its title.
# Built column-wise: DataFrame.iterrows() materialises one Series per row and is
# far slower than zipping the two columns. Later duplicates of a code still win,
# exactly as with the row-by-row loop.
title_map = dict(zip(title_df['code'], title_df['title']))
print(len(title_map))

In [None]:
# Give every key of cls_map a consecutive integer id, following the map's iteration order.
context2id = {context_key: index for index, context_key in enumerate(cls_map)}

# Config

In [None]:
class CFG:
    """Notebook-wide settings, read as class attributes (never instantiated here)."""

    # Read by the inference code in this notebook.
    batch_size = 24      # DataLoader batch size used by infer()
    max_len = 200        # max_length handed to the tokenizer in PatentDataset

    # Read only by BackboneModel.__init__ (name passed to AutoModel.from_pretrained).
    model_name = "microsoft/deberta-base"

    # Not referenced elsewhere in the visible notebook; presumably training leftovers.
    n_epochs = 5
    nfolds = 5
    min_lr = 1e-6
    max_lr = 2e-5
    weight_decay = 0.01

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

# Dataset

In [None]:
class PatentDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(id, context_id, tokenized_inputs)`` per row.

    Args:
        df: DataFrame with the columns ``id``, ``anchor``, ``target``, ``context``
            and ``text``.
        model_name: Selects the text layout. ``'deberta_large_v1'`` uses
            ``anchor[SEP]target``; any other value appends ``[SEP]`` and the
            context's title looked up in the global ``title_map``.
        phase: Stored on the instance; not read by this class.
        tokenizer: Callable invoked as ``tokenizer(text, **kwargs)`` that returns a
            mapping of field name to a list of ints.
    """

    def __init__(self, df, model_name='deberta_large_v1', phase='train', tokenizer=None):
        self.phase = phase
        self.model_name = model_name
        self.tokenizer = tokenizer

        self.ids = df['id'].values
        self.anchor = df.anchor.values
        self.target = df.target.values
        self.context = df.context.values

        # Only used for __len__; the model input text is rebuilt in __getitem__.
        self.text = df.text.values

    def prepare_inputs(self, text):
        """Tokenize ``text`` to exactly ``CFG.max_len`` tokens and return long tensors."""
        # truncation=True is required alongside padding='max_length': without it a
        # text longer than max_len is returned at its full length, and the default
        # collate function cannot stack tensors of different lengths into a batch.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=CFG.max_len,
            padding='max_length',
            truncation=True,
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    def __getitem__(self, idx):
        """Return ``(row id, context id as a long tensor, tokenized inputs)`` for row ``idx``."""
        test_id = self.ids[idx]
        context = self.context[idx]

        if self.model_name == 'deberta_large_v1':
            text = self.anchor[idx] + "[SEP]" + self.target[idx]
        else:
            text = self.anchor[idx] + "[SEP]" + self.target[idx] + "[SEP]" + title_map[context]

        # Raises KeyError for a context that is not a key of the global context2id.
        context_id = torch.tensor(context2id[context], dtype=torch.long)
        inputs = self.prepare_inputs(text)
        return (test_id, context_id, inputs)

    def __len__(self):
        """Number of rows in the source DataFrame."""
        return len(self.text)

# Model

In [None]:
class BackboneModel(nn.Module):
    """Transformer encoder with two pooling modes (first token or learned attention)."""

    def __init__(self):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(CFG.model_name)
        # Take the width from the encoder that was actually loaded. The notebook
        # never defines a global ``model_config``, so referring to it raised NameError.
        hidden_size = self.backbone.config.hidden_size
        # Scores every position, then normalises the scores over dim=1.
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )

    def forward(self, inputs, model_name=None):
        """Encode ``inputs`` and pool the last hidden state to one vector per example.

        Args:
            inputs: Mapping unpacked as keyword arguments into the backbone.
            model_name: ``'deberta_large_v1'`` returns the hidden state at position 0;
                any other value returns the attention-weighted sum over dim=1.
        """
        outputs = self.backbone(**inputs)
        if model_name == 'deberta_large_v1':
            return outputs.last_hidden_state[:, 0, :]

        last_hidden_state = outputs[0]
        weights = self.attention(last_hidden_state)
        # NOTE(review): the softmax is taken over all positions, padding included;
        # presumably this matches how the checkpoints were trained — confirm.
        h = torch.sum(weights * last_hidden_state, dim=1)
        return h

In [None]:
class Model(nn.Module):
    """Multi-task model that combines the pooled text vector with a context embedding.

    The main score is the mean of three identically shaped heads (``mlp1``-``mlp3``).
    The link, 3-way strength and 5-way strength heads read the concatenated
    ``[text, context]`` vector; the context head reads the text vector alone.
    """

    @staticmethod
    def _make_head(in_features, out_features):
        """Build one Dropout-Linear-LeakyReLU-BatchNorm-Dropout-Linear head."""
        return nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(in_features, 256),
            nn.LeakyReLU(),

            nn.BatchNorm1d(256),
            nn.Dropout(0.1),
            nn.Linear(256, out_features)
        )

    def __init__(self):
        super().__init__()
        self.backbone = BackboneModel()
        # Width of the wrapped encoder. The notebook never defines a global
        # ``model_config``, so referring to it raised NameError.
        hidden_size = self.backbone.backbone.config.hidden_size

        self.context_embeddings = nn.Embedding(len(context2id), hidden_size)

        # Task specific layers. Creation order is kept as-is so that a seeded
        # construction draws the same initial weights as before.
        self.link_prediction = self._make_head(2 * hidden_size, 1)
        self.strength_prediction = self._make_head(2 * hidden_size, 3)
        self.strength_prediction5 = self._make_head(2 * hidden_size, 5)
        # Only this head sees the text vector without the context embedding.
        self.context_prediction = self._make_head(hidden_size, len(context2id))
        self.mlp1 = self._make_head(2 * hidden_size, 1)
        self.mlp2 = self._make_head(2 * hidden_size, 1)
        self.mlp3 = self._make_head(2 * hidden_size, 1)

    def forward(self, context_id, inputs, model_name):
        """Return ``(y, ylink, ystrength, ystrength5, ycontext)`` as raw head outputs.

        Args:
            context_id: Integer tensor of indices into ``context_embeddings``.
            inputs: Tokenized inputs forwarded to the backbone.
            model_name: Forwarded to the backbone to select its pooling mode.
        """
        hcontext = self.context_embeddings(context_id)
        hinputs = self.backbone(inputs, model_name)
        h = torch.cat([hinputs, hcontext], dim=-1)

        # Main score: plain average of the three sibling heads.
        y1 = self.mlp1(h)
        y2 = self.mlp2(h)
        y3 = self.mlp3(h)
        y = (y1 + y2 + y3) / 3

        ylink = self.link_prediction(h)
        ystrength = self.strength_prediction(h)
        ystrength5 = self.strength_prediction5(h)
        ycontext = self.context_prediction(hinputs)

        return (y, ylink, ystrength, ystrength5, ycontext)

In [None]:
class ModelV2(nn.Module):
    """Backbone followed by five single-layer (Dropout + Linear) task heads."""

    def __init__(self):
        super().__init__()
        self.backbone = BackboneModel()
        # Width of the wrapped encoder. The notebook never defines a global
        # ``model_config``, so referring to it raised NameError.
        hidden_size = self.backbone.backbone.config.hidden_size

        # Task specific layers.
        self.link_prediction = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 1)
        )
        self.strength_prediction = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 3)
        )
        self.strength_prediction5 = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 5)
        )
        self.context_prediction = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, len(context2id))
        )
        self.mlp = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 1)
        )

        # Re-draw the Linear weights from N(0, 1/hidden_size).
        # link_prediction keeps PyTorch's default init, as it did before.
        init_std = 1 / np.sqrt(hidden_size)
        for head in (self.strength_prediction, self.strength_prediction5,
                     self.context_prediction, self.mlp):
            head[1].weight.data.normal_(mean=0.0, std=init_std)

    def forward(self, context_id, inputs, model_name):
        """Return ``(y, ylink, ystrength, ystrength5, ycontext)`` as raw head outputs.

        ``context_id`` is accepted for signature parity with ``Model`` but is not used.
        """
        h = self.backbone(inputs, model_name)
        y = self.mlp(h)

        ylink = self.link_prediction(h)
        ystrength = self.strength_prediction(h)
        ystrength5 = self.strength_prediction5(h)
        ycontext = self.context_prediction(h)

        return (y, ylink, ystrength, ystrength5, ycontext)

In [None]:
class DebertaModelV3(nn.Module):
    """Backbone followed by five single-layer (Dropout + Linear) task heads."""

    def __init__(self):
        super().__init__()
        self.backbone = BackboneModel()
        # Width of the wrapped encoder. The notebook never defines a global
        # ``model_config``, so referring to it raised NameError.
        hidden_size = self.backbone.backbone.config.hidden_size

        # Task specific layers.
        self.link_prediction = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 1)
        )
        self.strength_prediction = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 3)
        )
        self.strength_prediction5 = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 5)
        )
        self.context_prediction = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, len(context2id))
        )
        self.mlp = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 1)
        )

        # Re-draw the Linear weights from N(0, 1/hidden_size).
        # link_prediction keeps PyTorch's default init, as it did before.
        init_std = 1 / np.sqrt(hidden_size)
        for head in (self.strength_prediction, self.strength_prediction5,
                     self.context_prediction, self.mlp):
            head[1].weight.data.normal_(mean=0.0, std=init_std)

    def forward(self, context_id, inputs, model_name):
        """Return ``(y, ylink, ystrength, ystrength5, ycontext)`` as raw head outputs.

        ``context_id`` is accepted for signature parity with ``Model`` but is not used.
        """
        h = self.backbone(inputs, model_name)
        y = self.mlp(h)

        ylink = self.link_prediction(h)
        ystrength = self.strength_prediction(h)
        ystrength5 = self.strength_prediction5(h)
        ycontext = self.context_prediction(h)

        return (y, ylink, ystrength, ystrength5, ycontext)

In [None]:
class RobertaModelV2(nn.Module):
    """Backbone followed by single-layer task heads; no context output is produced."""

    def __init__(self):
        super().__init__()
        self.backbone = BackboneModel()
        # Width of the wrapped encoder. The notebook never defines a global
        # ``model_config``, so referring to it raised NameError.
        hidden_size = self.backbone.backbone.config.hidden_size

        # Task specific layers.
        self.link_prediction = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 1)
        )
        self.strength_prediction = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 3)
        )
        self.strength_prediction5 = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 5)
        )
        # Created and initialised but never called in forward(); kept so the
        # module's parameter layout stays unchanged.
        self.context_prediction = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, len(context2id))
        )
        self.mlp = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 1)
        )

        # Re-draw the Linear weights from N(0, 1/hidden_size).
        # link_prediction keeps PyTorch's default init, as it did before.
        init_std = 1 / np.sqrt(hidden_size)
        for head in (self.strength_prediction, self.strength_prediction5,
                     self.context_prediction, self.mlp):
            head[1].weight.data.normal_(mean=0.0, std=init_std)

    def forward(self, context_id, inputs, model_name=None):
        """Return ``(y, ylink, ystrength, ystrength5, None)``.

        The fifth slot keeps the 5-tuple shape of the other model classes. It used
        to be the bare name ``_``, which is not defined inside this method and only
        resolved through whatever global or IPython ``_`` happened to exist.
        ``context_id`` is accepted for signature parity but is not used.
        """
        h = self.backbone(inputs, model_name)
        y = self.mlp(h)

        ylink = self.link_prediction(h)
        ystrength = self.strength_prediction(h)
        ystrength5 = self.strength_prediction5(h)

        return (y, ylink, ystrength, ystrength5, None)

In [None]:
class RobertaModelV3(nn.Module):
    """Backbone followed by single-layer task heads; no context output is produced."""

    def __init__(self):
        super().__init__()
        self.backbone = BackboneModel()
        # Width of the wrapped encoder. The notebook never defines a global
        # ``model_config``, so referring to it raised NameError.
        hidden_size = self.backbone.backbone.config.hidden_size

        # Task specific layers.
        self.link_prediction = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 1)
        )
        self.strength_prediction = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 3)
        )
        self.strength_prediction5 = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 5)
        )
        # Created and initialised but never called in forward(); kept so the
        # module's parameter layout stays unchanged.
        self.context_prediction = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, len(context2id))
        )
        self.mlp = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 1)
        )

        # Re-draw the Linear weights from N(0, 1/hidden_size).
        # link_prediction keeps PyTorch's default init, as it did before.
        init_std = 1 / np.sqrt(hidden_size)
        for head in (self.strength_prediction, self.strength_prediction5,
                     self.context_prediction, self.mlp):
            head[1].weight.data.normal_(mean=0.0, std=init_std)

    def forward(self, context_id, inputs, model_name=None):
        """Return ``(y, ylink, ystrength, ystrength5, None)``.

        The fifth slot keeps the 5-tuple shape of the other model classes. It used
        to be the bare name ``_``, which is not defined inside this method and only
        resolved through whatever global or IPython ``_`` happened to exist.
        ``context_id`` is accepted for signature parity but is not used.
        """
        h = self.backbone(inputs, model_name)
        y = self.mlp(h)

        ylink = self.link_prediction(h)
        ystrength = self.strength_prediction(h)
        ystrength5 = self.strength_prediction5(h)

        return (y, ylink, ystrength, ystrength5, None)

# Inference

In [None]:
def infer(models, model_name, tokenizer, df=None):
    """Run an ensemble over a DataFrame and return its averaged predictions.

    Args:
        models: Non-empty sized collection of modules called as
            ``model(context_id, inputs, model_name)`` and returning a 5-tuple whose
            first, third and fourth items are the score, 3-way and 5-way logits.
        model_name: Passed to the dataset and the models; also used as the prefix
            of the output column names.
        tokenizer: Tokenizer handed to ``PatentDataset``.
        df: Rows to predict on. Defaults to the global ``test_df``.

    Returns:
        DataFrame with the columns ``id``, ``<model_name>_score``,
        ``<model_name>_pred_strength`` and ``<model_name>_pred_strength5``. The
        score is the mean sigmoid output; the strength columns hold the mean
        softmax probabilities as lists.

    Raises:
        ValueError: If ``models`` is empty (the average would divide by zero).
    """
    n_models = len(models)
    if n_models == 0:
        raise ValueError("infer() needs at least one model to ensemble")
    if df is None:
        df = test_df

    val_dataset = PatentDataset(df, model_name=model_name, phase='val', tokenizer=tokenizer)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=CFG.batch_size,
                                                 shuffle=False,
                                                 drop_last=False)

    # Switching to eval mode once is enough; it does not need repeating per batch.
    for model in models:
        model.eval()

    test_ids = []
    scores = []
    pred_strength = []
    pred_strength5 = []

    print("number of batches:", len(val_dataloader))
    with torch.no_grad():
        for (test_id, context_id, inputs) in val_dataloader:
            context_id = context_id.to(device)
            # Every row is padded to CFG.max_len; cut the batch back to its longest
            # real sequence so the models do not process all-padding columns.
            batch_max_seqlen = int(inputs['attention_mask'].sum(dim=-1).max())
            inputs = {k: v[:, :batch_max_seqlen].to(device) for k, v in inputs.items()}

            test_ids.extend(test_id)
            batch_size = len(test_id)
            ensembled_scores = np.zeros(batch_size)
            ensembled_strength = np.zeros((batch_size, 3))
            ensembled_strength5 = np.zeros((batch_size, 5))

            for model in models:
                (score, _yhat_link, yhat_strength, yhat_strength5, _) = model(context_id, inputs, model_name)
                ensembled_scores += score.view(-1).sigmoid().cpu().numpy()
                ensembled_strength += yhat_strength.softmax(dim=-1).cpu().numpy()
                ensembled_strength5 += yhat_strength5.softmax(dim=-1).cpu().numpy()

            scores += (ensembled_scores / n_models).tolist()
            pred_strength += (ensembled_strength / n_models).tolist()
            pred_strength5 += (ensembled_strength5 / n_models).tolist()

    submission_df = pd.DataFrame.from_dict({
        'id': test_ids,
        model_name + '_score': scores,
        model_name + '_pred_strength': pred_strength,
        model_name + '_pred_strength5': pred_strength5
    })
    return submission_df
    

In [None]:
# RoBERTa-large v1: load one checkpoint per fold and let infer() average them.
roberta_large_tokenizer = AutoTokenizer.from_pretrained(
    "../input/deberta-huggingface-tokenizers/roberta-large.pt"
)
# torch.load unpickles whole model objects here, so only trusted files belong in this list.
roberta_large_models = [
    torch.load(checkpoint_path, map_location=device)
    for checkpoint_path in (
        "../input/usppm-roberta-v1-fold0-model/model_0.pt",
        "../input/usppm-roberta-v1-fold1-model/model_1.pt",
        "../input/usppm-roberta-v1-fold2-model/model_2.pt",
        "../input/usppm-roberta-v1-fold3-model/model_3.pt",
        "../input/usppm-roberta-v1-fold4-model/model_4.pt",
    )
]

sub_roberta_v1 = infer(roberta_large_models, 'roberta_large_v1', roberta_large_tokenizer)

# Release the checkpoints before the next ensemble is loaded.
del roberta_large_models
gc.collect()

sub_roberta_v1.head()

In [None]:
# RoBERTa-large v3: load one checkpoint per fold and let infer() average them.
roberta_large_tokenizer = AutoTokenizer.from_pretrained(
    "../input/deberta-huggingface-tokenizers/roberta-large.pt"
)
# torch.load unpickles whole model objects here, so only trusted files belong in this list.
roberta_large_v3_models = [
    torch.load(checkpoint_path, map_location=device)
    for checkpoint_path in (
        "../input/usppm-roberta-v3-fold0-model/model_0.pt",
        "../input/usppmrobertav3fold1model/model_1.pt",
        "../input/usppmrobertav3fold2model/model_2.pt",
        "../input/usppmrobertav3fold3model/model_3.pt",
        "../input/usppmrobertav3fold4model/model_4.pt",
    )
]

sub_roberta_v3 = infer(roberta_large_v3_models, 'roberta_large_v3', roberta_large_tokenizer)

# Release the checkpoints before the next ensemble is loaded.
del roberta_large_v3_models
gc.collect()

sub_roberta_v3.head()

In [None]:
# DeBERTa-large v1: load one checkpoint per fold and let infer() average them.
# With model_name 'deberta_large_v1' the dataset omits the title and the backbone
# pools the first token.
deberta_tokenizer = AutoTokenizer.from_pretrained(
    "../input/deberta-huggingface-tokenizers/deberta-v3-large-tokenizer.pt"
)
# torch.load unpickles whole model objects here, so only trusted files belong in this list.
deberta_large_models_v1 = [
    torch.load(checkpoint_path, map_location=device)
    for checkpoint_path in (
        "../input/upmm-deberta-large-run1-fold0-model/model_0.pt",
        "../input/usppm-deberta-run1-fold12/model_1.pt",
        "../input/usppm-debertalarge-run2-fold2-model/model_2.pt",
        "../input/usppm-debertalarge-run1-fold34-models/model_3.pt",
        "../input/usppm-debertalarge-run2-fold4-model/model_4.pt",
    )
]

sub_deberta_v1 = infer(deberta_large_models_v1, "deberta_large_v1", deberta_tokenizer)

# Release the checkpoints before the next ensemble is loaded.
del deberta_large_models_v1
gc.collect()

sub_deberta_v1.head()

In [None]:
# DeBERTa-large v2: load one checkpoint per fold and let infer() average them.
deberta_tokenizer = AutoTokenizer.from_pretrained(
    "../input/deberta-huggingface-tokenizers/deberta-v3-large-tokenizer.pt"
)
# torch.load unpickles whole model objects here, so only trusted files belong in this list.
deberta_v3_large_models_v2 = [
    torch.load(checkpoint_path, map_location=device)
    for checkpoint_path in (
        "../input/uspppmdebertaattention-v2-fold0/model_0.pt",
        "../input/uspppmdebertaattention-v2-fold1/model_1.pt",
        "../input/uspppmdebertaattentionv2fold2/model_2.pt",
        "../input/uspppmdebertaattention-v2-fold3/model_3.pt",
        "../input/uspppmdebertaattention-v2-fold4/model_4.pt",
    )
]

sub_deberta_v2 = infer(deberta_v3_large_models_v2, 'deberta_large_v2', deberta_tokenizer)

# Release the checkpoints before the next ensemble is loaded.
del deberta_v3_large_models_v2
gc.collect()

sub_deberta_v2.head()

In [None]:
# DeBERTa-large v3: load one checkpoint per fold and let infer() average them.
deberta_tokenizer = AutoTokenizer.from_pretrained(
    "../input/deberta-huggingface-tokenizers/deberta-v3-large-tokenizer.pt"
)
# NOTE(review): only folds 0 and 1 load from directories named "v3"; folds 2-4
# load from "...attentionv2fold2", "...attention-v2-fold3" and "...attention-v2-fold4".
# Confirm this mix is intended and not a copy-paste leftover.
# torch.load unpickles whole model objects here, so only trusted files belong in this list.
deberta_v3_large_models_v3 = [
    torch.load(checkpoint_path, map_location=device)
    for checkpoint_path in (
        "../input/uspppm-deberta-attention-v3-fold0-model/model_0.pt",
        "../input/uspppmdebertaattentionv3fold1model/model_1.pt",
        "../input/uspppmdebertaattentionv2fold2/model_2.pt",
        "../input/uspppmdebertaattention-v2-fold3/model_3.pt",
        "../input/uspppmdebertaattention-v2-fold4/model_4.pt",
    )
]

sub_deberta_v3 = infer(deberta_v3_large_models_v3, 'deberta_large_v3', deberta_tokenizer)

# Release the checkpoints once their predictions have been collected.
del deberta_v3_large_models_v3
gc.collect()

sub_deberta_v3.head()

In [None]:
# Join the five per-model prediction frames side by side on the row id.
submission_df = (
    sub_roberta_v1
    .merge(sub_roberta_v3, on='id')
    .merge(sub_deberta_v2, on='id')
    .merge(sub_deberta_v1, on='id')
    .merge(sub_deberta_v3, on='id')
)
submission_df.head()


# Blend scores

In [None]:
# List the per-model score columns that are available for blending.
[column for column in submission_df.columns if '_score' in column]

In [None]:
# Weighted blend of the five per-model scores; the weights sum to 1.
# (An earlier two-stage scheme blended the RoBERTa pair 0.49/0.51 and the DeBERTa
# v1/v2 pair 0.44/0.56, then mixed the two 0.275/0.725; it is no longer used.)
pred_scores = (
    0.2344878 * submission_df['deberta_large_v1_score']
    + 0.3483222 * submission_df['deberta_large_v2_score']
    + 0.19213345 * submission_df['deberta_large_v3_score']
    + 0.12141794 * submission_df['roberta_large_v1_score']
    + 0.10363861 * submission_df['roberta_large_v3_score']
)

submission_df['score'] = pred_scores
submission_df.head()

In [None]:
def postprocess1(row):
    """Snap a row's blended score to a grid value using the 5-class strength heads.

    The five models' ``*_pred_strength5`` vectors are averaged. When the largest
    averaged probability is at least 0.9 and ``row.score`` lies inside the window
    of the winning class, the grid value of that class is returned; in every
    other case ``row.score`` is returned unchanged.
    """
    blended = row.score

    # Summed in this fixed order, then divided by the number of models.
    prefixes = ('roberta_large_v1', 'roberta_large_v3',
                'deberta_large_v1', 'deberta_large_v2', 'deberta_large_v3')
    total = np.array(getattr(row, prefixes[0] + '_pred_strength5'))
    for prefix in prefixes[1:]:
        total = total + np.array(getattr(row, prefix + '_pred_strength5'))
    mean_probs = total / 5

    winner = np.argmax(mean_probs)
    if np.max(mean_probs) < 0.9:
        # The ensemble is not confident enough to override the regression score.
        return blended

    # class index -> (lowest score, highest score, value to snap to)
    windows = {
        0: (-np.inf, 0.02, 0.0),
        1: (0.22, 0.28, 0.25),
        2: (0.48, 0.52, 0.5),
        3: (0.73, 0.77, 0.75),
        4: (0.97, np.inf, 1.0),
    }
    window = windows.get(int(winner))
    if window is not None:
        lowest, highest, snapped = window
        if lowest <= blended <= highest:
            return snapped
    return blended

In [None]:
def postprocess2(row):
    """Snap a row's blended score to a grid value using the 3-class strength heads.

    The five models' ``*_pred_strength`` vectors are averaged. When the largest
    averaged probability is at least 0.9 and ``row.score`` lies inside one of the
    windows of the winning class, that window's grid value is returned; in every
    other case ``row.score`` is returned unchanged. Class 0 snaps to 0.0, class 1
    to 0.25 or 0.5, class 2 to 0.75 or 1.0.
    """
    blended = row.score

    # Summed in this fixed order, then divided by the number of models.
    prefixes = ('roberta_large_v1', 'roberta_large_v3',
                'deberta_large_v1', 'deberta_large_v2', 'deberta_large_v3')
    total = np.array(getattr(row, prefixes[0] + '_pred_strength'))
    for prefix in prefixes[1:]:
        total = total + np.array(getattr(row, prefix + '_pred_strength'))
    mean_probs = total / 5

    winner = np.argmax(mean_probs)
    if np.max(mean_probs) < 0.9:
        # The ensemble is not confident enough to override the regression score.
        return blended

    # class index -> candidate (lowest score, highest score, value to snap to) windows
    windows = {
        0: ((-np.inf, 0.02, 0.0),),
        1: ((0.22, 0.28, 0.25), (0.48, 0.52, 0.5)),
        2: ((0.73, 0.77, 0.75), (0.97, np.inf, 1.0)),
    }
    for lowest, highest, snapped in windows.get(int(winner), ()):
        if lowest <= blended <= highest:
            return snapped
    return blended

In [None]:
# Snap the blended score twice, once per strength head (5-class, then 3-class).
# Both passes read the same blended 'score' column.
submission_df['score1'] = submission_df.apply(postprocess1, axis=1)
submission_df['score2'] = submission_df.apply(postprocess2, axis=1)

# The final score is a 60/40 mix of the two snapped variants. This overwrites the
# blended 'score', so re-running the cell would post-process it a second time.
submission_df['score'] = submission_df['score1'] * 0.6 + submission_df['score2'] * 0.4

submission_df.head()

In [None]:
# Write only the id and final score columns, without the DataFrame index.
submission_df.loc[:, ['id', 'score']].to_csv("submission.csv", index=False)