In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The tokenizer and the classification model are both read from the fold_0 directory.
tokenizer = AutoTokenizer.from_pretrained("../input/patentbert-simple/fold_0")
model = AutoModelForSequenceClassification.from_pretrained("../input/patentbert-simple/fold_0/")
model = model.to(device)

In [None]:
# Load the test split; later cells read its id, context, anchor and target columns.
test_df = pd.read_csv(
    "../input/us-patent-phrase-to-phrase-matching/test.csv",
)
# Preview the first five rows as the cell output.
test_df.head(5)

In [None]:
class TextDataset(Dataset):
    """Map-style dataset holding one pre-tokenized prompt per row of ``df``.

    Each row is rendered as
    ``"Category: <context> Text 1: <anchor> Text 2: <target>"`` and passed to
    ``tokenizer`` once, at construction time; the results are kept in
    ``self.data`` in row order.
    """

    def __init__(self, df, tokenizer):
        """Build the prompts from ``df`` and tokenize them eagerly.

        Args:
            df: object exposing ``context``, ``anchor`` and ``target`` columns
                that support element-wise string concatenation.
            tokenizer: callable applied to each prompt string individually.
        """
        category_part = "Category: " + df.context
        anchor_part = " Text 1: " + df.anchor
        target_part = " Text 2: " + df.target
        prompts = (category_part + anchor_part + target_part).tolist()
        self.data = list(map(tokenizer, prompts))

    def __len__(self):
        """Return the number of tokenized prompts."""
        return len(self.data)

    def __getitem__(self, i):
        """Return the tokenizer output stored at position ``i``."""
        return self.data[i]

In [None]:
# Tokenize every test row up front, then batch with dynamic padding.
dataset = TextDataset(df=test_df, tokenizer=tokenizer)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
test_dl = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=False,  # the default; spelled out because predictions are matched to test_df rows by position
    collate_fn=data_collator,
)

In [None]:
# Score every batch with the model in eval mode and without tracking gradients.
model.eval()
with torch.no_grad():
    logits = torch.cat([model(**batch.to(device)).logits for batch in test_dl])

# .cpu() must come before .numpy(): the logits live on `device`, and
# Tensor.numpy() raises a TypeError for tensors that are not in host memory.
# squeeze(-1) drops only the trailing dimension (assumes the model emits one
# logit per row - TODO confirm), so a single-row test set still yields a 1-D
# array rather than a 0-d scalar. Scores are then clipped into [0, 1].
preds = logits.squeeze(-1).cpu().numpy().clip(0, 1)

In [None]:
# Pair each test id with its predicted score and write the submission file.
submission = pd.DataFrame(
    data={
        "id": test_df["id"],
        "score": preds,
    }
)
submission.to_csv(path_or_buf="submission.csv", index=False)