## Analysis and using models from three notebooks

**1.** Deberta v3 large (0.8392)
> [Inference BERT for usPatents](https://www.kaggle.com/code/leehann/inference-bert-for-uspatents)

**2.** Deberta v3 large (0.8338)
> [PPPM / Deberta-v3-large baseline [inference]](https://www.kaggle.com/code/yasufuminakama/pppm-deberta-v3-large-baseline-inference)

**3.** Roberta-large (0.8143)
> [PatentPhrase RoBERTa Inference](https://www.kaggle.com/code/santhoshkumarv/patentphrase-roberta-inference-lb-0-814)

#### Please upvote the original notebooks!

#### The modelling part of this notebook is copied from [here](https://www.kaggle.com/code/renokan/2-deberta-1-roberta-analysis-and-using). Thanks a ton, Anatoly Bureknok! However, the EDA is my own analysis. The model results are also ensembled according to what worked best for me.

It would be great if you could upvote the notebook if this is worth it! Thanks in advance!

In [None]:
import numpy as np
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.style as style
from matplotlib.ticker import FuncFormatter
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
style.use('fivethirtyeight')
from matplotlib.ticker import FuncFormatter
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import os
import math
import re
from collections import Counter
from nltk import word_tokenize, pos_tag
import seaborn as sns
import spacy
import nltk
nlp = spacy.load("en_core_web_lg")

In [None]:
#Loading data directly in kaggle
train = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv")
test = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")

In [None]:
print(f"Train data shape: {train.shape}, test data shape: {test.shape}")

train.head()
train.context.value_counts()
print(f"Max score {train.score.max()} and min score {train.score.min()} ")

In [None]:
train[train.score == 1]

In [None]:
train[train.score == 0]

There are 106 contexts (or subjects) with H01 having the highest number of records.

1154 records have score=1, which means that in these records the anchor and target are an exact match. The 7471 records with score=0, on the other hand, are a complete mismatch. Let's analyze this within a subject.

In [None]:
#How many records with perfect match are there per context group?
context_counts = train[train.score == 1].groupby("context").id.count().reset_index().sort_values("id", ascending = False)
context_counts

In [None]:
context_counts[context_counts["context"]=="H01"]
# H01 has the highest number of records, yet only 34/2186 of them have a perfect match

In [None]:
train[(train.context == "H04")&(train.score == 1)].head()

In [None]:
train[(train.context == "H01")&(train.score == 1)].head()

In [None]:
#Maybe instead of looking at absolute lengths, we should look at relative lengths compared to the anchor
# length_diff = (number of whitespace-separated words in target)
#             - (number of whitespace-separated words in anchor).
# Computed column-wise: filling the column row by row through
# `train.length_diff.iloc[i] = ...` is a chained assignment, which is slow and
# is not guaranteed to write back into `train`.
train["length_diff"] = (
    train["target"].str.split().str.len() - train["anchor"].str.split().str.len()
)

train.head()

In [None]:
train['broad_context'] = train['context'].str.get(0)
train.head(10)

In [None]:
import seaborn as sns
sns.countplot(x="broad_context", data=train).set(title= "Frequency plot of broad contexts")

*B has the most number of records. B stands for Operations and Transport.*

In [None]:
def prepare_column_for_pos(data, column_name = 'anchor'):
    """Add lower-cased and POS-tagged versions of a text column to `data`.

    Two columns are written into `data` in place:
    'lower_<column_name>' holds the lower-cased text and
    'tagged_<column_name>' holds `pos_tag(word_tokenize(text))` of that
    lower-cased text. The same (mutated) frame is returned.
    """
    lowered_col = 'lower_' + column_name
    tagged_col = 'tagged_' + column_name

    def tokenize_and_tag(text):
        return pos_tag(word_tokenize(text))

    data[lowered_col] = data[column_name].apply(str.lower)
    data[tagged_col] = data[lowered_col].apply(tokenize_and_tag)
    return data

train = prepare_column_for_pos(train, column_name = 'anchor')
train = prepare_column_for_pos(train, column_name = 'target')

In [None]:
# Maps fine-grained POS tags onto their base category; tags that are not
# listed here are kept unchanged.
pos_dict = {'NNS' : 'NN',
                 'NNP' : 'NN',
                 'NNPS' : 'NN',
                 'JJR': 'JJ',
                 'JJS': 'JJ',
                 'PRP$' : 'PRP',
                 'RBR' : 'RB',
                 'RBS' : 'RB',
                 'VBD' : 'VB',
                 'VBG' : 'VB',
                 'VBN' : 'VB',
                 'VBP' : 'VB',
                 'VBZ' : 'VB',
                 'WP$' : 'WP'}

def replace_second_element(pos_tup_list):
    """Return the (word, tag) pairs with each tag collapsed via `pos_dict`.

    Every pair of the input is kept, in order. The tag is looked up as a
    whole: an earlier version went through a dict keyed by word, which dropped
    repeated words (and therefore skewed the tag counts), and used substring
    replacement, which turned 'NNPS' into 'NNS'.

    Parameters
    ----------
    pos_tup_list : iterable of (word, tag) pairs

    Returns
    -------
    list of (word, tag) tuples, same length and order as the input.
    """
    return [(word, pos_dict.get(tag, tag)) for word, tag in pos_tup_list]

train['tagged_anchor'] = train['tagged_anchor'].apply(replace_second_element)
train['tagged_target'] = train['tagged_target'].apply(replace_second_element)

train['tagged_target']

In [None]:
from itertools import chain
possible_tags_anchor = sorted(set(list(zip(*chain(*train['tagged_anchor'])))[1]))
possible_tags_anchor

In [None]:
possible_tags_target = sorted(set(list(zip(*chain(*train['tagged_target'])))[1]))
possible_tags_target

In [None]:
from collections import Counter
def add_pos_with_zero_counts(counter, keys_to_add):
    """Make sure every tag in `keys_to_add` is a key of `counter`.

    Missing tags are inserted with a count of 0; tags that are already present
    keep their count. `counter` is modified in place and returned.
    """
    for tag in keys_to_add:
        counter.setdefault(tag, 0)
    return counter

def make_categorical_pos_columns(data, column_name = "anchor", possible_tags=None):
    """Expand the POS tags of one tagged column into per-tag count columns.

    Parameters
    ----------
    data : DataFrame
        Must contain a 'tagged_<column_name>' column whose cells are
        sequences of (token, tag) pairs.
    column_name : str
        Suffix of the tagged column to read and of the columns to create.
    possible_tags : iterable of str, optional
        One output column '<tag>_<column_name>' is created per tag, in this
        order. Defaults to the module-level `possible_tags_anchor`, looked up
        when the function is called.

    Returns
    -------
    DataFrame
        A new frame: `data` with its index reset to 0..n-1, followed by the
        count columns. `data` itself is not modified.

    Notes
    -----
    Counts are looked up tag by tag, so a row containing a tag that is not in
    `possible_tags` (or an empty tag list) no longer breaks the column
    alignment. The old index is dropped instead of being kept as an extra
    'index' / 'level_0' column.
    """
    if possible_tags is None:
        possible_tags = possible_tags_anchor
    possible_tags = list(possible_tags)

    count_rows = []
    for tagged in data['tagged_' + column_name]:
        tag_counts = Counter(tag for _, tag in tagged)
        # Counter returns 0 for tags that do not occur in this row.
        count_rows.append([tag_counts[tag] for tag in possible_tags])

    data_2 = pd.DataFrame(
        count_rows,
        columns=[tag + "_" + column_name for tag in possible_tags],
    )
    return pd.concat([data.reset_index(drop=True), data_2], axis=1)

def compute_difference_of_pos_metric(data, possible_tags_anchor, possible_tags_target=None):
    """Add '<tag>_diff' columns: tag count in target minus tag count in anchor.

    Parameters
    ----------
    data : DataFrame
        Must contain 'tagged_anchor' and 'tagged_target' columns holding
        sequences of (token, tag) pairs.
    possible_tags_anchor : iterable of str
        Tags for which a '<tag>_diff' column is produced, in this order.
    possible_tags_target : iterable of str, optional
        Tags counted on the target side. When omitted they are collected from
        data['tagged_target'] instead of being read from a global variable.

    Returns
    -------
    DataFrame
        The frame returned by `make_categorical_pos_columns` with the diff
        columns appended and every column whose name contains '_anchor' or
        '_target' removed (per-tag counts as well as helper columns such as
        'tagged_anchor').
    """
    col1 = "anchor"
    col2 = "target"
    anchor_tags = list(possible_tags_anchor)
    if possible_tags_target is None:
        possible_tags_target = {
            tag for tagged in data['tagged_' + col2] for _, tag in tagged
        }
    # The target side must cover every anchor tag, otherwise the subtraction
    # below would look up a column that was never created.
    target_tags = sorted(set(possible_tags_target) | set(anchor_tags))

    data = make_categorical_pos_columns(data, column_name = col1, possible_tags=sorted(anchor_tags))
    data = make_categorical_pos_columns(data, column_name = col2, possible_tags=target_tags)
    for col_prefix in anchor_tags:
        data[col_prefix+"_diff"] = data[col_prefix+"_"+col2] - data[col_prefix+"_"+col1]
    # The per-tag count columns all match this pattern, so one drop removes
    # them together with the other '_anchor' / '_target' helper columns.
    data.drop(data.filter(regex='_anchor|_target').columns, axis=1, inplace=True)
    return data



print(train.shape)
train = compute_difference_of_pos_metric(train, possible_tags_anchor)
train.head()

In [None]:
train.NN_diff.value_counts()

In [None]:
corr_df=train[['score',
 'length_diff',
 'DT_diff',
 'IN_diff',
 'JJ_diff',
 'NN_diff',
 'PRP_diff',
 'RB_diff',
 'RP_diff',
 'TO_diff',
 'VB_diff',
 'WP_diff']].corr()
sns.heatmap(corr_df)

In [None]:

def jaccard_similarity(x,y):
    """Return the Jaccard similarity of two collections.

    Both arguments are converted to sets, so duplicates are ignored. The
    result is len(intersection) / len(union), a float in [0, 1].

    Note that a string is turned into the set of its characters, so passing
    two strings compares them character-wise, not word-wise.

    Two empty inputs are treated as identical and give 1.0 instead of raising
    ZeroDivisionError.
    """
    first, second = set(x), set(y)
    union = first | second
    if not union:
        return 1.0
    return len(first & second) / float(len(union))

train['jacc_sim_score'] = train.apply(lambda x: jaccard_similarity(x.target, x.anchor), axis=1)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Left panel: all points; right panel: the same points coloured by broad context.
fig, axes = plt.subplots(1, 2, figsize=(18, 10))
fig.suptitle('Trends of Similarity Score')
sns.scatterplot(ax=axes[0],data=train,  x="score",y="jacc_sim_score").set(title='Jaccard Similarity vs Score')
sns.scatterplot(ax=axes[1],data=train,  x="score",y="jacc_sim_score", hue="broad_context").set(title='Jaccard Similarity vs Score by Context')
# Put the legend out of the figure. Only the right-hand panel has a hue, so it
# is the only one with a legend; it is addressed explicitly instead of through
# plt.legend(), which acts on whatever the current axes happens to be.
axes[1].legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
def ttr(sent):
    """Return the type-token ratio of `sent` as a percentage.

    Every non-word character is replaced by a space, the text is lower-cased
    and tokenized with nltk.word_tokenize; the result is
    (number of distinct tokens) / (number of tokens) * 100.

    Returns 0.0 when the text contains no tokens (e.g. empty or
    punctuation-only input) instead of raising ZeroDivisionError.
    """
    sent = re.sub(r'[^\w]', ' ', sent)
    sent = sent.lower()
    tokens = nltk.word_tokenize(sent)
    if not tokens:
        return 0.0
    return len(set(tokens))/len(tokens)*100

def ttr_ratio(sent1,sent2):
    """Return ttr(sent1) / ttr(sent2).

    Returns NaN when `sent2` has no tokens, because the ratio is undefined
    in that case.
    """
    ttr1 = ttr(sent1)
    ttr2 = ttr(sent2)
    if ttr2 == 0:
        return float('nan')
    return ttr1/ttr2

train['ttr_comp_score'] = train.apply(lambda x: ttr_ratio(x.target, x.anchor), axis=1)

In [None]:
sns.scatterplot(data=train,  x="score",y="ttr_comp_score", hue="broad_context").set(title='TTR (Text Complexity) vs Score')
# Put the legend out of the figure
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
def calculate_shannon_entropy(string,base = 2.0):
    """Return the Shannon entropy of the symbols of `string`.

    Each distinct symbol contributes -p * log(p) / log(base), where p is its
    relative frequency in `string`. An empty input gives 0.

    Parameters
    ----------
    string : sequence of hashable symbols (typically a str)
    base : logarithm base, 2.0 by default
    """
    total = len(string)
    # One pass over the input; Counter keeps symbols in first-seen order.
    pkvec = [count / total for count in Counter(string).values()]

    H = -sum(pk * math.log(pk) / math.log(base) for pk in pkvec)
    return H

def shannon_entr_ratio(str1,str2):
    """Return log(entropy(str1) / entropy(str2) + 1).

    Returns NaN when the entropy of `str2` is 0 (empty input or a single
    repeated symbol), where the ratio is undefined; this used to raise
    ZeroDivisionError.
    """
    entr1 = calculate_shannon_entropy(str1)
    entr2 = calculate_shannon_entropy(str2)
    if entr2 == 0:
        return np.nan
    return np.log((entr1/entr2)+1)

train['entr_comp_score'] = train.apply(lambda x: shannon_entr_ratio(x.target, x.anchor), axis=1)

In [None]:
sns.scatterplot(data=train,  x="score",y="entr_comp_score", hue="broad_context").set(title='Shannon Entropy Ratio vs Score')
# Put the legend out of the figure
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
import re, math, collections
 
def tokenize(_str):
    """Count the lower-cased words of `_str`, skipping short words and stopwords.

    Words are runs of word characters. Words shorter than two characters and
    the stopwords listed below are ignored. Returns a defaultdict mapping each
    remaining word to its count as a float (missing words read as 0.0).
    """
    ignored = frozenset(['and', 'for', 'if', 'the', 'then', 'be', 'is', 'are', 'will', 'in', 'it', 'to', 'that'])
    counts = collections.defaultdict(lambda: 0.)
    for match in re.finditer(r"(\w+)", _str, re.UNICODE):
        word = match.group(1).lower()
        if len(word) >= 2 and word not in ignored:
            counts[word] += 1

    return counts
#end of tokenize
 

def kldiv(_s, _t):
    """Return a smoothed KL-style divergence between two term-count mappings.

    Both arguments map terms to counts (e.g. the output of `tokenize`). Counts
    are normalised to probabilities; for every term of `_s` the contribution
    is (p_s - p_t) * log(p_s / p_t).

    A term of `_s` that is missing from `_t` gets the small probability
    `epsilon` on the `_t` side, and the probabilities of the shared terms are
    scaled by `gamma` to compensate for the mass handed out that way.

    Returns 1e33 when either mapping is empty.
    """
    if not _s or not _t:
        return 1e33

    ssum = 0. + sum(_s.values())
    tsum = 0. + sum(_t.values())

    # Number of terms that occur in _s but not in _t.
    lenvocabdiff = len(set(_s.keys()).difference(set(_t.keys())))

    # Back-off probability: a thousandth of the smallest observed probability.
    epsilon = min(min(_s.values())/ssum, min(_t.values())/tsum) * 0.001

    # Scaling factor for the probabilities of terms present in _t.
    gamma = 1 - lenvocabdiff * epsilon

    div = 0.
    for term, count in _s.items():
        pts = count / ssum
        ptt = gamma * (_t[term] / tsum) if term in _t else epsilon
        div += (pts - ptt) * math.log(pts / ptt)

    return div
#end of kldiv

d1= "mixing core materials"
d2 = "core materials mixed"
print(kldiv(tokenize(d1), tokenize(d2)))

In [None]:
train["tok_target"] = train["target"].apply(tokenize)
train["tok_anchor"] = train["anchor"].apply(tokenize)
train['kl_div'] = train.apply(lambda x: kldiv(x.tok_target, x.tok_anchor), axis=1)
train.drop(['tok_target', 'tok_anchor'], axis=1, inplace = True, errors='ignore')

In [None]:
train.head(2)

In [None]:
corr_df=train[['score',
 'length_diff',
 'DT_diff',
 'IN_diff',
 'JJ_diff',
 'NN_diff',
 'PRP_diff',
 'RB_diff',
 'RP_diff',
 'TO_diff',
 'VB_diff',
 'WP_diff',
 'jacc_sim_score',
 'ttr_comp_score',
 'entr_comp_score',
 'kl_div']].corr()
sns.heatmap(corr_df)

We don't see a strong correlation between the score and these structural features: all of their correlations with the score are in the low-to-mid range. The difference in the number of nouns and the length difference between target and anchor correlate well with each other. This might indicate that, in most cases where the two columns differ in length, the difference is caused by the absence or presence of certain nouns. Since nouns tend to be the words most relevant to the context, they might actually contain relevant information.


We also see a high correlation with text similarity score, which makes sense. However, there is no correlation with TTR ratio of the two text columns. A similar trend is seen for KL divergence between the two strings.

# 1. Import & Def & Set & Load

In [None]:
import os
import gc
import random

import numpy as np
import pandas as pd

import seaborn as sns

from sklearn.preprocessing import MinMaxScaler

from dataclasses import dataclass

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel

import warnings 
warnings.filterwarnings('ignore')

In [None]:
def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs and configure cuDNN flags.

    Also exports PYTHONHASHSEED, sets cudnn.deterministic to True and
    cudnn.benchmark to False.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for every random number generator used in the notebook.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    
def inference_fn(test_loader, model, device, is_sigmoid=True):
    """Run `model` over every batch of `test_loader` and collect the outputs.

    Parameters
    ----------
    test_loader : iterable of dict batches; every value is moved to `device`
        (the batch dict is updated in place).
    model : module called as `model(inputs)` with the whole batch dict.
    device : device the model and the batches are moved to.
    is_sigmoid : when truthy, a sigmoid is applied to the model output.

    Returns
    -------
    numpy array: the per-batch outputs concatenated along the first axis.
    """
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))

    # Gradients are never needed at inference time.
    with torch.no_grad():
        for inputs in tk0:
            for k, v in inputs.items():
                inputs[k] = v.to(device)

            output = model(inputs)
            if is_sigmoid:
                output = output.sigmoid()

            preds.append(output.to('cpu').numpy())

    return np.concatenate(preds)
    

def upd_outputs(data, is_trim=False, is_minmax=False, is_reshape=False):
    """Post-process an array of predictions.

    The enabled steps are applied in this order:

    * is_trim    - clip the values to the range [0, 1];
    * is_minmax  - rescale with MinMaxScaler().fit_transform(data);
    * is_reshape - flatten to one dimension with reshape(-1).

    With all flags off, `data` is returned unchanged. The input array itself
    is never modified.
    """
    if is_trim:
        data = np.clip(np.asarray(data), 0, 1)

    if is_minmax:
        # The scaler is only created when it is actually needed.
        data = MinMaxScaler().fit_transform(data)

    if is_reshape:
        data = data.reshape(-1)

    return data


In [None]:
pd.set_option('display.precision', 4)
cm = sns.light_palette('green', as_cmap=True)
props_param = "color:white; font-weight:bold; background-color:green;"

CUSTOM_SEED = 426
CUSTOM_BATCH = 24
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
competition_dir = "../input/us-patent-phrase-to-phrase-matching/"

submission = pd.read_csv(competition_dir+'sample_submission.csv')
test_origin = pd.read_csv(competition_dir+'test.csv')
test_origin.head()

# 2. Extract predictions

## 2.1 Deberta v3 large - 1

In [None]:
def prepare_input(cfg, text):
    """Tokenize `text` with `cfg.tokenizer` and convert every field to a tensor.

    The tokenizer is asked to pad to `cfg.max_len` and to truncate. Each value
    of the returned encoding is replaced in place by a torch.long tensor.
    """
    encoded = cfg.tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=cfg.max_len,
    )

    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)

    return encoded

class TestDataset(Dataset):
    """Dataset over the 'text' column of a frame; items are tokenized on access."""

    def __init__(self, cfg, df):
        # Keep the raw strings; tokenization happens lazily in __getitem__.
        self.text = df['text'].values
        self.cfg = cfg

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        return prepare_input(self.cfg, self.text[item])
   
    
class CustomModel(nn.Module):
    """Sequence-classification model whose forward pass returns the first
    element of the base model's output.

    The base network is created from the configuration only (`from_config`),
    i.e. without loading pretrained weights here.
    """
    def __init__(self, model_path):
        super().__init__()
        
        config = AutoConfig.from_pretrained(model_path)
        # A single output value per input.
        config.num_labels = 1
        self.base = AutoModelForSequenceClassification.from_config(config=config)
        dim = config.hidden_size
        # NOTE(review): `dropout` and `cls` are not used in forward();
        # presumably they are kept so the module's parameter names match the
        # saved checkpoints - confirm before removing.
        self.dropout = nn.Dropout(p=0)
        self.cls = nn.Linear(dim,1)
        
    def forward(self, inputs):
        # `inputs` is a mapping that is unpacked into keyword arguments.
        output = self.base(**inputs)

        # Only the first element of the base model's output is returned.
        return output[0]

In [None]:
seed_everything(CUSTOM_SEED)

In [None]:
class CFG:
    model_path='../input/deberta-v3-large/deberta-v3-large'
    batch_size=CUSTOM_BATCH
    num_workers=2
    max_len=130
    trn_fold=[0, 1, 2, 3]

CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

context_mapping = torch.load("../input/folds-dump-the-two-paths-fix/cpc_texts.pth")

In [None]:
test = test_origin.copy()
titles = pd.read_csv('../input/cpc-codes/titles.csv')

# Keep the original row position in an 'index' column so the order can be
# restored after the merge.
test.reset_index(inplace=True)
# Left join: every test row must survive even if its context code is missing
# from `titles`. With the default inner join such rows would be dropped
# silently and the predictions would no longer line up with the submission ids.
test = test.merge(titles, how='left', left_on='context', right_on='code')
test.sort_values(by='index', inplace=True)
test.drop(columns='index', inplace=True)

# Model input: anchor, target and the context description joined by [SEP], lower-cased.
test['context_text'] = test['context'].map(context_mapping)
test['text'] = test['anchor'] + '[SEP]' + test['target'] + '[SEP]'  + test['context_text']
test['text'] = test['text'].apply(str.lower)

test.head()

In [None]:
deberta_predicts_1 = []

test_dataset = TestDataset(CFG, test)
test_dataloader = DataLoader(test_dataset,
                             batch_size=CFG.batch_size, shuffle=False,
                             num_workers=CFG.num_workers,
                             pin_memory=True, drop_last=False)

deberta_simple_path = "../input/us-patent-deberta-simple/microsoft_deberta-v3-large"

for fold in CFG.trn_fold:
    fold_path = f"{deberta_simple_path}_best{fold}.pth"
    
    model = CustomModel(CFG.model_path)    
    state = torch.load(fold_path, map_location=torch.device('cpu'))  # DEVICE
    model.load_state_dict(state['model'])
    
    prediction = inference_fn(test_dataloader, model, DEVICE, is_sigmoid=False)
    
    deberta_predicts_1.append(prediction)
    
    del model, state, prediction
    torch.cuda.empty_cache()
    gc.collect()

In [None]:
# -------------- inference_fn([...], is_sigmoid=False)
deberta_predicts_1 = [upd_outputs(x, is_minmax=True, is_reshape=True) for x in deberta_predicts_1]
deberta_predicts_1 = pd.DataFrame(deberta_predicts_1).T

deberta_predicts_1.head(10).style.background_gradient(cmap=cm, axis=1)

In [None]:
del test, test_dataset
gc.collect()

## 2.2 Deberta v3 large - 2

In [None]:
def prepare_input(cfg, text):
    """Tokenize `text` with `cfg.tokenizer` and convert every field to a tensor.

    The text is padded to `cfg.max_len` and, like in the other
    `prepare_input` variants of this notebook, truncated to that length.
    Without truncation a text longer than `cfg.max_len` tokens produces a
    longer tensor than its neighbours, which cannot be stacked into a batch.

    Returns the tokenizer's encoding with each value replaced in place by a
    torch.long tensor.
    """
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           truncation=True,
                           return_offsets_mapping=False)

    for k in list(inputs.keys()):
        inputs[k] = torch.tensor(inputs[k], dtype=torch.long)

    return inputs


class TestDataset(Dataset):
    """Dataset over the 'text' column of a frame; items are tokenized on access."""

    def __init__(self, cfg, df):
        # Raw strings only; the tokenizer runs per item in __getitem__.
        self.texts = df['text'].values
        self.cfg = cfg

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        return prepare_input(self.cfg, self.texts[item])

    
class CustomModel(nn.Module):
    """Transformer backbone followed by attention pooling and a linear head.

    forward() pools the backbone's first output with learned attention
    weights, applies dropout and maps the pooled feature to
    `cfg.target_size` values.
    """
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        # Backbone configuration: built from the model name, or loaded from a
        # previously saved file.
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            # NOTE(review): torch.load unpickles the file, so `config_path`
            # must point to a trusted file.
            self.config = torch.load(config_path)
        # pretrained=True loads weights for cfg.model; otherwise the backbone
        # is only instantiated from the configuration.
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
            
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        # Produces one score per position; Softmax(dim=1) normalises the
        # scores along dimension 1.
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        # NOTE(review): `self.attention` is an nn.Sequential, which matches
        # none of the isinstance checks in _init_weights, so this call leaves
        # the attention layers at their default initialisation.
        self._init_weights(self.attention)
        
    def _init_weights(self, module):
        # Initialise a single Linear / Embedding / LayerNorm module; any other
        # module type is left untouched.
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        # `inputs` is a mapping unpacked into keyword arguments of the backbone.
        outputs = self.model(**inputs)
        # assumes outputs[0] is (batch, seq_len, hidden) - TODO confirm
        last_hidden_states = outputs[0]
        weights = self.attention(last_hidden_states)
        # Attention-weighted sum over dimension 1.
        feature = torch.sum(weights * last_hidden_states, dim=1)
        
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        # Dropout on the pooled feature, then the linear head.
        output = self.fc(self.fc_dropout(feature))
        
        return output

In [None]:
seed_everything(CUSTOM_SEED)

In [None]:
class CFG:
    num_workers=2
    path="../input/pppm-deberta-v3-large-baseline-w-w-b-train/"
    config_path=path+'config.pth'
    model="microsoft/deberta-v3-large"
    batch_size=CUSTOM_BATCH
    fc_dropout=0.2
    target_size=1
    max_len=133
    trn_fold=[0, 1, 2, 3]
    
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path+'tokenizer/')

context_mapping = torch.load(CFG.path+"cpc_texts.pth")

In [None]:
test = test_origin.copy()

test['context_text'] = test['context'].map(context_mapping)
test['text'] = test['anchor'] + '[SEP]' + test['target'] + '[SEP]'  + test['context_text']

test.head()

In [None]:
deberta_predicts_2 = []

test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers,
                         pin_memory=True, drop_last=False)

folds_path = CFG.path + f"{CFG.model.replace('/', '-')}"

for fold in CFG.trn_fold:
    fold_path = f"{folds_path}_fold{fold}_best.pth"
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    state = torch.load(fold_path, map_location=torch.device('cpu'))  # DEVICE
    model.load_state_dict(state['model'])
    
    prediction = inference_fn(test_loader, model, DEVICE)
    deberta_predicts_2.append(prediction)
    
    del model, state, prediction
    torch.cuda.empty_cache()
    gc.collect()

In [None]:
deberta_predicts_2 = [upd_outputs(x, is_reshape=True) for x in deberta_predicts_2]
deberta_predicts_2 = pd.DataFrame(deberta_predicts_2).T

deberta_predicts_2.head(10).style.background_gradient(cmap=cm, axis=1)

In [None]:
del test, test_dataset
gc.collect()

## 2.3. Roberta-large

In [None]:
def prepare_input(cfg, text, target):
    """Tokenize the pair (`text`, `target`) and convert every field to a tensor.

    `cfg.tokenizer` receives both strings as a pair and is asked to pad to
    `cfg.max_len` and to truncate. Each value of the returned encoding is
    replaced in place by a torch.long tensor.
    """
    tokenizer_options = {
        "padding": "max_length",
        "max_length": cfg.max_len,
        "truncation": True,
    }
    encoded = cfg.tokenizer(text, target, **tokenizer_options)

    for name, values in list(encoded.items()):
        encoded[name] = torch.tensor(values, dtype=torch.long)

    return encoded


class TestDataset(Dataset):
    """Dataset over the 'text' / 'target' columns; pairs are tokenized on access."""

    def __init__(self, cfg, df):
        # Raw strings only; the tokenizer runs per item in __getitem__.
        self.target = df['target'].values
        self.texts = df['text'].values
        self.cfg = cfg

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        return prepare_input(self.cfg, self.texts[item], self.target[item])

    
class CustomModel(nn.Module):
    """Transformer with mean pooling and a shared linear head evaluated
    through five dropout layers whose outputs are averaged.

    The constructor takes no arguments; paths and sizes are read from the
    module-level `CFG`.
    """
    def __init__(self):
        super(CustomModel, self).__init__()
        hidden_dropout_prob: float = 0.1
        layer_norm_eps: float = 1e-7

        config = AutoConfig.from_pretrained(CFG.config_path)

        # Override selected configuration entries before building the model.
        config.update({"output_hidden_states": True,
                       "hidden_dropout_prob": hidden_dropout_prob,
                       "layer_norm_eps": layer_norm_eps,
                       "add_pooling_layer": False})
        
        self.transformer = AutoModel.from_pretrained(CFG.config_path, config=config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Five dropout layers with increasing rates, all feeding the same
        # output layer (see forward()).
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.output = nn.Linear(config.hidden_size, CFG.num_targets)
        
    def forward(self, inputs):
        # `inputs` is a mapping unpacked into keyword arguments of the transformer.
        transformer_out = self.transformer(**inputs)
        # assumes transformer_out[0] is (batch, seq_len, hidden) - TODO confirm
        last_hidden_states = transformer_out[0]
        # Plain mean over dimension 1; the attention mask is not used here.
        last_hidden_states = self.dropout(torch.mean(last_hidden_states, 1))
        # The same linear layer is applied after each dropout layer and the
        # five results are averaged.
        # NOTE(review): under model.eval() the dropout layers are expected to
        # be inactive, which would make the five logits equal - confirm.
        logits1 = self.output(self.dropout1(last_hidden_states))
        logits2 = self.output(self.dropout2(last_hidden_states))
        logits3 = self.output(self.dropout3(last_hidden_states))
        logits4 = self.output(self.dropout4(last_hidden_states))
        logits5 = self.output(self.dropout5(last_hidden_states))
        logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5
        
        return logits

In [None]:
seed_everything(CUSTOM_SEED)

In [None]:
@dataclass(frozen=True)
class CFG:
    num_workers=2
    config_path='../input/robertalarge'
    model_path='../input/phrase-matching-roberta-training-pytorch-wandb'
    model_name='roberta-large'
    batch_size=CUSTOM_BATCH
    max_len=128
    num_targets=1
    trn_fold=[0, 1, 2, 3, 4]
    tokenizer=AutoTokenizer.from_pretrained('../input/robertalarge')

context_mapping = {
        "A": "Human Necessities",
        "B": "Operations and Transport",
        "C": "Chemistry and Metallurgy",
        "D": "Textiles",
        "E": "Fixed Constructions",
        "F": "Mechanical Engineering",
        "G": "Physics",
        "H": "Electricity",
        "Y": "Emerging Cross-Sectional Technologies",
}

In [None]:
test = test_origin.copy()

test['context_text'] = test['context'].str.slice(stop=1).map(context_mapping)
test['text'] = test['context_text'] + ' ' + test['anchor']

In [None]:
test.head()

In [None]:
roberta_predicts = []

test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers,
                         pin_memory=True, drop_last=False)

folds_path = CFG.model_path + f"/{CFG.model_name.replace('-','_')}"

for fold in CFG.trn_fold:
    fold_path = f"{folds_path}_patent_model_{fold}.pth"
    
    model = CustomModel()
    state = torch.load(fold_path, map_location=torch.device('cpu'))  # DEVICE
    model.load_state_dict(state)

    prediction = inference_fn(test_loader, model, DEVICE)
    roberta_predicts.append(prediction)
    
    del model, state, prediction
    torch.cuda.empty_cache()    
    gc.collect()

In [None]:
roberta_predicts = [upd_outputs(x, is_reshape=True) for x in roberta_predicts]
roberta_predicts = pd.DataFrame(roberta_predicts).T

roberta_predicts.head(10).style.background_gradient(cmap=cm, axis=1)

In [None]:
del test, test_dataset
gc.collect()

# 3. Comparison / Ensemble

In [None]:
all_predictions = pd.concat(
    [deberta_predicts_1, deberta_predicts_2, roberta_predicts],
    keys=['deberta 1', 'deberta 2', 'roberta'],
    axis=1
)

all_predictions.head(10) \
    .assign(mean=lambda x: x.mean(axis=1)) \
        .style.background_gradient(cmap=cm, axis=1)

In [None]:
all_mean = pd.DataFrame({
    'deberta 1': deberta_predicts_1.mean(axis=1),
    'deberta 2': deberta_predicts_2.mean(axis=1),
    'roberta': roberta_predicts.mean(axis=1)
})

all_mean.head(10) \
    .assign(mean=lambda x: x.mean(axis=1)) \
        .style.highlight_max(axis=1, props=props_param)

In [None]:
# === N1 ===
# Weighted blend of the per-model mean predictions. The weights are applied to
# the columns of `all_mean` by position.
weights_ = [0.85, 0.1, 0.05]
final_predictions1 = all_mean.mul(weights_).sum(axis=1)

# === N2 ===
# Plain (unweighted) average of the per-model mean predictions.
final_predictions2 = all_mean.mean(axis=1)

# Final score: element-wise average of N1 and N2. With three columns this is
# equivalent to weights of about 0.592 / 0.217 / 0.192, i.e. (w + 1/3) / 2 for
# each weight w above (assuming no missing values).
final_predictions = np.mean([final_predictions1, final_predictions2], axis=0)
final_predictions

In [None]:
print("Shape of preds 1:",len(final_predictions1))
print("Shape of preds 2:",len(final_predictions2))
print("Shape of final preds:",len(final_predictions))

# 4. Submission

In [None]:
submission = pd.DataFrame({
    'id': test_origin['id'],
    'score': final_predictions,
})

submission.head(14)

In [None]:
submission.to_csv('submission.csv', index=False)