<ul>
<li>The idea is simple: train deberta-v3-large with k-fold cross-validation and generate predictions with the model trained on each fold, so we train <b>6 models</b> in total, one per fold.</li>
<li>Then the predictions were calibrated using thresholds.</li>
</ul>

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install datasets

# 1. Import & Set & Def & Load

In [None]:
import os
import pandas as pd
import numpy as np

from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from transformers import TrainingArguments,Trainer

from sklearn.preprocessing import MinMaxScaler

In [None]:
"""
To load and save pretrained model and tokenizer
"""

# Tokenizer and model come from the same checkpoint name so they match.
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')

# num_labels=1 gives the classification head a single output value.
model = AutoModelForSequenceClassification.from_pretrained(
    'microsoft/deberta-v3-large',
    num_labels=1,
)

In [None]:
class TrainDataset(Dataset):
    """Map-style dataset that yields tokenized text together with a float label.

    Args:
        text: Indexable collection of input strings.
        label: Indexable collection of numeric targets aligned with ``text``.
        tokenizer: Callable tokenizer returning a mapping with ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        max_length: Length every sample is padded/truncated to. Defaults to
            35, the value that used to be hard-coded in ``__getitem__``.
    """

    def __init__(self, text, label, tokenizer, max_length=35):
        self.sentence = text
        self.label = label
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentence)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors (three long, one float)."""
        # Calling the tokenizer directly replaces the deprecated
        # ``encode_plus`` and matches how TestDataset tokenizes its rows.
        encoded = self.tokenizer(
            self.sentence[idx],
            padding="max_length",
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
        )

        return {
            "input_ids": torch.tensor(encoded["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(encoded["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(encoded["token_type_ids"], dtype=torch.long),
            "label": torch.tensor(self.label[idx], dtype=torch.float),
        }

## Preparing Data for training

In [None]:
# Lookup used below with Series.map to turn a context code into its text.
# NOTE(review): torch.load unpickles the file, so only load files from a trusted source.
cpc_texts = torch.load("../input/folddump/cpc_texts.pth")
# Table joined onto the data through its 'code' column.
titles = pd.read_csv('../input/upppm/titles.csv')

In [None]:
df = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/train.csv')

# Store the original row position in an 'index' column before merging, so the
# original order can be restored afterwards; the helper column is then dropped.
# `titles` is merged exactly once: merging it a second time only duplicated
# its columns under _x/_y suffixes, and the saved 'index' then described the
# already-merged frame instead of the file order.
df.reset_index(inplace=True)
df = df.merge(titles, left_on='context', right_on='code')
df.sort_values(by='index', inplace=True)
df.drop(columns='index', inplace=True)

# Look up the text for every context code.
df['context_text'] = df['context'].map(cpc_texts)


We'll need to combine the context, anchor, and target together somehow. There's not much research as to the best way to do this, so we may need to iterate a bit. To start with, we'll just combine them all into a single string. The model will need to know where each section starts, so we can use the special separator **[SEP]** token to tell it:



In [None]:
# Model input: context text, anchor and target joined by the literal '[SEP]'
# marker, then lower-cased as a whole.
# NOTE(review): lower-casing also turns '[SEP]' into '[sep]', which is
# presumably no longer matched as the tokenizer's separator token - confirm.
# NOTE(review): the test cell builds its inputs as anchor, target, context -
# a different field order than here; confirm which order the saved folds used.
joined = df['context_text'] + '[SEP]' + df['anchor'] + '[SEP]' + df['target']
df['inputs'] = joined.apply(str.lower)

# The target column is exposed as 'label'; rows are renumbered from 0.
df.rename(columns={'score': 'label'}, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Pearson scores appended by corr() on every call.
pscores=[]

In [None]:
def corr(eval_pred):
    """Return the Pearson correlation between predictions and labels.

    Passed to the Trainer as ``compute_metrics``. As a side effect the score
    is also appended to the module-level ``pscores`` list.

    Args:
        eval_pred: ``(logits, labels)`` pair; the logits are flattened to 1-D
            before the correlation is computed.

    Returns:
        dict: ``{'pearson': <correlation coefficient>}``.
    """
    logits, labels = eval_pred
    # Compute the coefficient once and reuse it for both the log and the result.
    pearson = np.corrcoef(logits.reshape(-1), labels)[0][1]
    pscores.append(pearson)
    return {'pearson': pearson}

In [None]:
from sklearn.model_selection import KFold
# 6 shuffled folds; the fixed random_state makes the split repeatable.
kf = KFold(n_splits=6, random_state=0, shuffle=True)

HuggingFace Transformers tends to be rather enthusiastic about spitting out lots of warnings, so let's quieten it down for our sanity:

In [None]:
import logging
import warnings

# Ignore Python warnings and drop every log record at WARNING level or below.
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

# Training on separate files

In [None]:
# Training hyper-parameters shared by the training cells below.
lr = 2e-5     # learning rate
bs = 64       # batch size
wd = 0.01     # weight decay
epochs = 4    # number of training epochs

We can now create a tokenizer for this model. Note that pretrained models assume that text is tokenized in a particular way. In order to ensure that your tokenizer matches your model, use the AutoTokenizer, passing in your model name.



In [None]:
# tokenizer = AutoTokenizer.from_pretrained('/content/drive/MyDrive/My docs/patents_bert')

In [None]:
# Fold number; it selects which pre-generated train/validation CSV pair is
# loaded and is reused in the name of the saved model.
i=1

train_df = pd.read_csv(f'../input/kfold-data/kfold/df_train{i}.csv')
val_df = pd.read_csv(f'../input/kfold-data/kfold/df_val{i}.csv')

In [None]:
# The fold CSVs are generated first; folds are then trained one by one by
# changing `i` and re-running this cell.
train_dataset = TrainDataset(train_df['inputs'].values, train_df['label'].values, tokenizer)
val_dataset = TrainDataset(val_df['inputs'].values, val_df['label'].values, tokenizer)

# Start every fold from the pretrained weights so that re-running this cell
# for another fold does not continue from the previous fold's model.
model = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-large', num_labels=1)

# Trainer batches and shuffles a Dataset itself, so the datasets are handed
# over directly. Previously a new Trainer was built for every zipped
# (train batch, validation batch) pair: each one trained on a single batch
# with a freshly restarted schedule, and the loop ended as soon as the shorter
# validation loader ran out.
args = TrainingArguments(
    'outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=2 * bs,
    num_train_epochs=epochs,
    weight_decay=wd,
    report_to='none',
)

# Evaluating with corr() fills `pscores`, which the next cell averages.
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=corr,
)
trainer.train()
trainer.save_model(f'out_fold{i}')

In [None]:
# Mean of the Pearson scores recorded in pscores so far, ignoring NaNs.
np.nanmean(pscores)

# To train on every fold at once

In [None]:
# Generate the folds, then train and save one model per fold in a single run.
# (Original note: this process doesn't fit in the memory.)
import gc

tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')

# The training configuration is the same for every fold, so it is built once.
args = TrainingArguments(
    'outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=2 * 8,
    num_train_epochs=epochs,
    weight_decay=wd,
    report_to='none',
)

for i, (train_index, val_index) in enumerate(kf.split(df[['inputs', 'label']]), start=1):
    train_dataset = TrainDataset(df['inputs'].iloc[train_index].values, df['label'].iloc[train_index].values, tokenizer)
    val_dataset = TrainDataset(df['inputs'].iloc[val_index].values, df['label'].iloc[val_index].values, tokenizer)

    # Every fold starts from the pretrained weights.
    model = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-large', num_labels=1)

    # Trainer batches and shuffles a Dataset itself, so the datasets are
    # handed over directly. Previously a new Trainer was built for every
    # zipped (train batch, validation batch) pair, so each one trained and
    # evaluated on a single batch and the loop ended with the shorter loader.
    trainer = Trainer(
        model,
        args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=corr,
    )
    trainer.train()
    trainer.save_model(f'out_fold{i}')

    # Drop this fold's model and trainer before the next fold is loaded, so
    # two folds are not held in memory at the same time.
    del trainer, model
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Mean of the Pearson scores recorded in pscores so far, ignoring NaNs.
np.nanmean(pscores)

# Test

In [None]:
def valid_fn(valid_loader, model, device):
    """Run inference over a loader and return all predictions as one array.

    Args:
        valid_loader: Iterable of ``(input_ids, token_type_ids, attention_mask)``
            tensor batches.
        model: Called as ``model(input_ids, attention_mask, token_type_ids)``.
            It may return a tensor, or an output object/mapping that holds the
            predictions under ``'logits'``.
        device: Device the batch tensors are moved to before the forward pass.
            The model itself is not moved here.

    Returns:
        numpy.ndarray: The per-batch outputs concatenated along axis 0
        (an empty 1-D array when the loader yields nothing).
    """
    model.eval()
    preds = []

    for batch in valid_loader:
        input_ids, token_type_ids, attention_mask = [t.to(device) for t in batch]

        with torch.no_grad():
            y_preds = model(input_ids, attention_mask, token_type_ids)

        # An output object has no ``.to``; take its logits tensor first.
        if not torch.is_tensor(y_preds):
            y_preds = y_preds['logits']

        preds.append(y_preds.to('cpu').numpy())

    if not preds:
        # np.concatenate raises on an empty list.
        return np.array([], dtype=np.float32)

    return np.concatenate(preds)


# Shared scaler used by upd_outputs, which re-fits it on every call (fit_transform).
min_max_scaler = MinMaxScaler()

def upd_outputs(data, is_trim=True, is_minmax=True, is_reshape=True):
    """Post-process raw model outputs.

    The enabled steps run in this order:

    1. ``is_trim``: clip every value into ``[0, 1]``.
    2. ``is_minmax``: rescale with the module-level ``min_max_scaler``, which
       is re-fitted on ``data`` (callers in this notebook pass a column of
       shape ``(-1, 1)``).
    3. ``is_reshape``: flatten the result to 1-D.

    Args:
        data: Array of predictions.
        is_trim: Clip values below 0 to 0 and above 1 to 1.
        is_minmax: Apply min-max scaling.
        is_reshape: Flatten the output.

    Returns:
        numpy.ndarray: The processed predictions.
    """
    if is_trim:
        data = np.clip(data, 0, 1)

    if is_minmax:
        data = min_max_scaler.fit_transform(data)

    if is_reshape:
        data = data.reshape(-1)

    return data


In [None]:
# The closing quote and parenthesis were missing on this line (syntax error).
test_df = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/test.csv')
titles = pd.read_csv('../input/upppm/titles.csv')

# Store the original row position in an 'index' column before merging, so the
# original order can be restored afterwards; the helper column is then dropped.
test_df.reset_index(inplace=True)
test_df = test_df.merge(titles, left_on='context', right_on='code')
test_df.sort_values(by='index', inplace=True)
test_df.drop(columns='index', inplace=True)

# NOTE(review): torch.load unpickles the file, so only load files from a trusted source.
cpc_texts = torch.load("../input/folddump/cpc_texts.pth")

test_df['context_text'] = test_df['context'].map(cpc_texts)
# NOTE(review): the training cell builds its inputs as context, anchor, target,
# a different field order than here - confirm which order the saved folds used.
test_df['inputs'] = test_df['anchor'] + '[SEP]' + test_df['target'] + '[SEP]'  + test_df['context_text']
test_df['inputs'] = test_df['inputs'].apply(str.lower)

test_df.head()

# 2. Extract & Update Predictions

In [None]:
class TestDataset(Dataset):
    """Inference dataset over the 'inputs' column of a DataFrame.

    Each item is a tuple of three long tensors in the order
    (input_ids, token_type_ids, attention_mask).
    """

    # Order in which the tokenizer fields are returned by __getitem__.
    _FIELDS = ('input_ids', 'token_type_ids', 'attention_mask')

    def __init__(self, df, tokenizer, max_input_length):
        self.text = df['inputs'].values.astype(str)
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # Pad/truncate every row to the same fixed length.
        encoded = self.tokenizer(
            self.text[item],
            max_length=self.max_input_length,
            padding='max_length',
            truncation=True,
        )
        return tuple(
            torch.as_tensor(encoded[field], dtype=torch.long)
            for field in self._FIELDS
        )

In [None]:
def valid_fn(valid_loader, model, device=None):
    """Predict with a sequence-classification model over a loader.

    Args:
        valid_loader: Iterable of ``(input_ids, token_type_ids, attention_mask)``
            tensor batches.
        model: Model whose output exposes the predictions under ``'logits'``.
        device: Optional device. When given, the model is moved there; when
            omitted, the device of the model's first parameter is used (CPU if
            the model has no parameters). Batches are moved to the same device.
            The parameter is optional because the caller in this notebook
            passes only the loader and the model.

    Returns:
        numpy.ndarray: 1-D array with the flattened logits of all batches
        (empty when the loader yields nothing).
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')
    else:
        model.to(device)

    model.eval()
    preds = []

    with torch.no_grad():
        for input_ids, token_type_ids, attention_mask in valid_loader:
            outputs = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                token_type_ids=token_type_ids.to(device),
            )
            preds.append(outputs['logits'].flatten().to('cpu').numpy())

    if not preds:
        # np.concatenate raises on an empty list.
        return np.array([], dtype=np.float32)

    return np.concatenate(preds)

In [None]:
# One prediction array per fold model.
predictions = []

tokenizer = AutoTokenizer.from_pretrained('../input/debertav3kfold/deberta_v3_large-20220602T055706Z-001/deberta_v3_large')

te_dataset = TestDataset(test_df, tokenizer, 35)

# shuffle=False and drop_last=False keep the predictions aligned with test_df.
te_dataloader = DataLoader(te_dataset,
                          batch_size=32, shuffle=False,
                          num_workers = 2,
                          pin_memory=True, drop_last=False)


for fold in tqdm(range(1, 7)):

    fold_path = f"../input/debertav3kfold/deberta_V3_kfold/deberta_V3_kfold/out_fold{fold}"

    model = AutoModelForSequenceClassification.from_pretrained(fold_path, num_labels=1)

    # valid_fn is declared as (valid_loader, model, device); the device
    # argument was missing here, which raised a TypeError. The model is left
    # on the device it was loaded on, and that device is passed along.
    device = next(model.parameters()).device
    prediction = valid_fn(te_dataloader, model, device)

    predictions.append(prediction)

In [None]:
# Sanity check: number of fold outputs, rows per fold, and one sample value.
print("folds:", len(predictions))
print("rows: ", len(predictions[0]))
print("score:", predictions[0][0])

In [None]:
# Number of leading rows shown by the preview cells.
n_predictions = 14

In [None]:
# first fold: raw predictions for the first n_predictions rows
predictions[0][:n_predictions]

In [None]:
# print(*upd_outputs(predictions[0].reshape(-1,1), is_trim=False)[:n_predictions])
# print(*upd_outputs(predictions[0].reshape(-1,1), is_minmax=False)[:n_predictions])

In [None]:
# Per fold: min_max.fit_transform(x) >> x.reshape(-1).
# The trimming step is skipped here because is_trim=False.
upd_predictions = [upd_outputs(x.reshape(-1,1), is_trim=False) for x in predictions]

In [None]:
# First n_predictions rescaled values of the first fold.
print(*upd_predictions[0][:n_predictions])

# 3. Additional & Final Predictions

In [None]:
# Shallow copy of the per-fold list, taken before the median/mean rows are appended.
origin_predictions = upd_predictions.copy()  # 5. Visualization

In [None]:
# === add np.median ===
# zip(*...) walks the prediction sets row by row; the per-row median over all
# sets collected so far is appended as one more prediction set.
add_preds = [np.median(row, axis=0) for row in zip(*upd_predictions)]

upd_predictions.append(add_preds)

In [None]:
# === add np.mean ===
# zip(*...) walks the prediction sets row by row; the per-row mean over all
# sets currently in the list is appended as one more prediction set.
add_preds = [np.mean(row, axis=0) for row in zip(*upd_predictions)]

upd_predictions.append(add_preds)

In [None]:
# Average over every entry of upd_predictions (axis 0), giving one value per row.
final_predictions = np.mean(upd_predictions, axis=0)

In [None]:
# Preview the first n_predictions final scores.
print(*final_predictions[:n_predictions])

In [None]:
# Same preview of the first n_predictions final scores, printed again.
print(*final_predictions[:n_predictions])

# 4. Create & Calibrate Submissions

In [None]:
# Pair every test id with its final score.
submission = pd.DataFrame(dict(id=test_df['id'], score=final_predictions))

submission.head(14)

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)