In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import matplotlib.pyplot as plt
import shutil
import torch

from datasets import Dataset, DatasetDict
from sklearn.metrics import (
    accuracy_score, classification_report, 
    ConfusionMatrixDisplay, confusion_matrix)
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import DataLoader, SubsetRandomSampler
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    get_scheduler)

%matplotlib inline

In [None]:
# Kaggle input directory holding the competition CSV files.
DATA_DIR = "../input/us-patent-phrase-to-phrase-matching/"

TRAIN_FILE = os.path.join(DATA_DIR, "train.csv")
TEST_FILE = os.path.join(DATA_DIR, "test.csv")
# Predictions for TEST_FILE are written here as "id,score" rows.
SUBMISSION_FILE = "./submission.csv"

# Pretrained checkpoint to fine-tune.
MODEL_ID = "microsoft/deberta-v3-xsmall"
# Output directory for the per-epoch checkpoints and the training history file.
MODEL_DIR = "deberta-patent-matching-02"

# Batch size shared by the train, validation and test DataLoaders.
BATCH_SIZE = 32

# AdamW hyperparameters and the number of passes over the training split.
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 1e-2
NUM_EPOCHS = 5

## Dataset

Quick look at the dataset provided.

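We can't use the provided test data for validation or evaluation, since it has no scores attached. Instead, we split the training data in two stages: a 70/30 split, followed by a 70/30 split of the remainder. This gives roughly 70% for training, 21% for validation and 9% for a held-out test set.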

We will use our trained model against `test.csv` to generate `submission.csv` consisting of (ID, score) pairs.

In [None]:
# Training data, including the `score` column used to build the labels below.
train_df = pd.read_csv(TRAIN_FILE)
train_df.head()

In [None]:
# Provided test file, loaded here only for a quick look.
# NOTE: the name `test_df` is reused further down for the held-out split of the
# training data, and this file is read again before generating the submission.
test_df = pd.read_csv(TEST_FILE)
test_df.head()

In [None]:
# Row counts of the provided train and test files.
len(train_df), len(test_df)

## Labels

Given that the scores appear to take only a small set of discrete values on a 0–1 scale, it might make sense to treat this as a classification problem, with one class per distinct score.

In [None]:
# Give every distinct score value an integer class id, in ascending score order.
distinct_scores = set(train_df["score"].values.tolist())
scores = sorted(distinct_scores)
score2label = dict(zip(scores, range(len(scores))))
score2label

## Raw Dataset

In [None]:
# Attach the integer class id of each row's score. `Series.map` performs the
# dictionary lookup column-wise instead of invoking a Python lambda per row.
train_df["labels"] = train_df["score"].map(score2label)
train_df.head()

In [None]:
# Two-stage split: 70% train, then the remaining 30% is split 70/30 into
# validation and held-out test. A fixed seed makes the splits (and everything
# derived from them) the same on every Restart & Run All.
train_df, testval_df = train_test_split(train_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(testval_df, test_size=0.3, random_state=42)
len(train_df), len(val_df), len(test_df)

In [None]:
# Wrap each split in a Dataset and group the three under named keys.
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

raw_ds = DatasetDict({
    "train": train_ds,
    "validation": val_ds,
    "test": test_ds,
})

raw_ds

In [None]:
# Tokenizer for the pretrained checkpoint. Its separator token is kept in `sep`
# and used to join the fields of each example into a single input string.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
sep = tokenizer.sep_token

In [None]:
# Treat context keys as special tokens.
# The codes are collected from all three splits, so a code that happens to be
# absent from the training split still gets its own token. They are sorted so
# the token ids do not depend on row order (only the model is checkpointed,
# so the ids must be reproducible to reuse a checkpoint).
context_codes = pd.concat([train_df, val_df, test_df])["context"].unique().tolist()
special_tokens = sorted(context_codes)
tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})

In [None]:
def compose_input(example):
    """Add a "text" field joining anchor, target and context.

    The three fields are concatenated in that order with the notebook-level
    separator string ``sep`` between them. The example is modified in place
    and also returned.
    """
    fields = (example["anchor"], example["target"], example["context"])
    example["text"] = sep.join(fields)
    return example



# Build the "text" field for every split and drop the columns listed below.
unused_columns = ["id", "anchor", "target", "context", "score", "__index_level_0__"]
raw_ds = raw_ds.map(compose_input, remove_columns=unused_columns)
raw_ds

In [None]:
# Spot-check the first composed training example.
raw_ds["train"][0]

## Encoded Dataset

In [None]:
def tokenize_text(example):
    """Tokenize the example's "text" field with the notebook-level tokenizer."""
    text = example["text"]
    return tokenizer(text)


# Tokenize each split, dropping the raw text column once it has been encoded.
train_ds, val_ds, test_ds = (
    raw_ds[split].map(tokenize_text, remove_columns=["text"])
    for split in ("train", "validation", "test")
)

train_ds

## DataLoader

In [None]:
# Collator configured with padding="longest" and PyTorch tensor output.
collate_fn = DataCollatorWithPadding(tokenizer, padding="longest", return_tensors="pt")

# Options common to all three loaders.
loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn)

# Only the training loader shuffles; validation and test keep dataset order.
train_dl = DataLoader(train_ds, shuffle=True, **loader_kwargs)
valid_dl = DataLoader(val_ds, shuffle=False, **loader_kwargs)
test_dl = DataLoader(test_ds, shuffle=False, **loader_kwargs)

## Model

In [None]:
# Use CUDA when available, otherwise the CPU; the model and every batch are moved here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
# Sequence classifier with one output per distinct score value.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, 
                                                           num_labels=len(score2label))
# Context codes were added to the tokenizer as extra tokens, so the embedding
# table is resized to the tokenizer's current length.
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

In [None]:
# AdamW over all model parameters, configured from the constants defined above.
optimizer = AdamW(model.parameters(),
                  lr=LEARNING_RATE,
                  weight_decay=WEIGHT_DECAY)

# The scheduler is stepped once per batch in do_train, so the total number of
# steps is the number of epochs times the number of training batches.
num_training_steps = NUM_EPOCHS * len(train_dl)
# "linear" schedule with no warmup steps.
lr_scheduler = get_scheduler("linear",
                             optimizer=optimizer,
                             num_warmup_steps=0,
                             num_training_steps=num_training_steps)

In [None]:
def do_train(model, train_dl):
    """Run one training epoch over ``train_dl``.

    Relies on the notebook-level ``device``, ``optimizer`` and
    ``lr_scheduler``. Every batch is moved to ``device``; the optimizer and
    the scheduler are stepped once per batch and the gradients are cleared
    afterwards.

    Returns:
        The sum (not the mean) of the per-batch losses seen in the epoch.
    """
    model.train()
    running_loss = 0
    for batch in train_dl:
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}
        batch_loss = model(**inputs).loss
        batch_loss.backward()
        running_loss = running_loss + batch_loss.detach().cpu().numpy()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    return running_loss


def compute_accuracy(labels, logits):
    """Accuracy of the arg-max predictions in ``logits`` against ``labels``.

    Both tensors are moved to the CPU and converted to NumPy arrays before
    being scored with ``accuracy_score``.
    """
    predicted = torch.argmax(logits, dim=-1)
    return accuracy_score(labels.cpu().numpy(), predicted.cpu().numpy())


def do_eval(model, eval_dl):
    """Evaluate ``model`` on every batch of ``eval_dl`` without tracking gradients.

    Batches are moved to the notebook-level ``device`` and must contain a
    ``labels`` entry, which is compared against the arg-max of the logits.

    Returns:
        A ``(eval_loss, eval_score)`` tuple. ``eval_loss`` is the sum of the
        per-batch losses. ``eval_score`` is the accuracy over all examples,
        or NaN when ``eval_dl`` yields no examples.
    """
    model.eval()
    eval_loss, num_correct, num_examples = 0, 0, 0
    for batch in eval_dl:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        eval_loss += outputs.loss.item()
        # Count hits per example instead of averaging per-batch accuracies:
        # a shorter final batch would otherwise weigh as much as a full one.
        preds = torch.argmax(outputs.logits, dim=-1)
        num_correct += (preds == batch["labels"]).sum().item()
        num_examples += batch["labels"].numel()

    eval_score = num_correct / num_examples if num_examples else float("nan")
    return eval_loss, eval_score


def save_checkpoint(model, model_dir, epoch):
    """Save ``model`` under ``<model_dir>/ckpt-<epoch>``.

    Args:
        model: object exposing ``save_pretrained(path)``.
        model_dir: directory that holds the checkpoints.
        epoch: integer epoch number used in the checkpoint folder name.
    """
    # Use the directory passed by the caller rather than the global constant.
    model.save_pretrained(os.path.join(model_dir, "ckpt-{:d}".format(epoch)))
    

def save_training_history(history, model_dir, epoch):
    """Write the training history to ``<model_dir>/history.tsv``.

    The file is rewritten from scratch on every call, one tab-separated line
    per history entry.

    Args:
        history: iterable of ``(epoch, train_loss, eval_loss, eval_score)``
            tuples.
        model_dir: existing directory in which ``history.tsv`` is written.
        epoch: unused; kept so existing callers keep working.
    """
    history_path = os.path.join(model_dir, "history.tsv")
    # The context manager closes the file even if a write fails.
    with open(history_path, "w") as fhist:
        for epoch_num, train_loss, eval_loss, eval_score in history:
            fhist.write("{:d}\t{:.5f}\t{:.5f}\t{:.5f}\n".format(
                epoch_num, train_loss, eval_loss, eval_score))

## Training / Finetuning

In [None]:
# Start from an empty output directory. The directory is created whether or not
# an old one had to be removed first, so a first run also gets one.
if os.path.exists(MODEL_DIR):
    shutil.rmtree(MODEL_DIR)
os.makedirs(MODEL_DIR, exist_ok=True)

history = []
for epoch in range(1, NUM_EPOCHS + 1):
    # do_train / do_eval return losses summed over their batches. The two
    # loaders have different numbers of batches, so record the mean per-batch
    # loss to keep the train and validation numbers comparable.
    train_loss = do_train(model, train_dl) / len(train_dl)
    eval_loss, eval_score = do_eval(model, valid_dl)
    eval_loss = eval_loss / len(valid_dl)

    history.append((epoch, train_loss, eval_loss, eval_score))
    print("EPOCH {:3d} | train loss: {:.3f} | val loss: {:.3f} | val acc: {:.5f}".format(
        epoch, train_loss, eval_loss, eval_score))
    save_checkpoint(model, MODEL_DIR, epoch)
    save_training_history(history, MODEL_DIR, epoch)

In [None]:
# Training curves, one point per epoch: losses on top, validation accuracy below.
epochs = [entry[0] for entry in history]
fig, (ax_loss, ax_acc) = plt.subplots(2, 1)

ax_loss.plot(epochs, [entry[1] for entry in history], label="train")
ax_loss.plot(epochs, [entry[2] for entry in history], label="validation")
ax_loss.set_xlabel("epochs")
ax_loss.set_ylabel("loss")
ax_loss.legend(loc="best")

# The score stored in `history` is the validation accuracy, not an F1 score.
ax_acc.plot(epochs, [entry[3] for entry in history], label="validation")
ax_acc.set_xlabel("epochs")
ax_acc.set_ylabel("accuracy")
ax_acc.legend(loc="best")

fig.tight_layout()
plt.show()

## Evaluation against the held-out test set

In [None]:
# Score the trained model on the held-out split; only the accuracy is reported.
test_acc = do_eval(model, test_dl)[1]
print("test accuracy: {:.3f}".format(test_acc))

In [None]:
def evaluate(model, eval_dl):
    """Collect the true labels and arg-max predictions for every example in ``eval_dl``.

    Batches are moved to the notebook-level ``device`` and the model is run
    in eval mode without tracking gradients.

    Returns:
        A ``(labels, preds)`` pair of flat lists of ints. ``labels`` only
        contains entries for batches that carry a ``"labels"`` key, so it is
        empty for unlabelled data.
    """
    model.eval()
    labels, preds = [], []
    for batch in eval_dl:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        preds.extend(torch.argmax(outputs.logits, dim=-1).cpu().tolist())
        if "labels" in batch:
            labels.extend(batch["labels"].cpu().tolist())

    return labels, preds


labels, preds = evaluate(model, test_dl)

# Invert score -> label; the scores are stored as strings so they can be used
# directly as class names in the report.
label2score = dict((label, str(score)) for score, label in score2label.items())
target_names = [label2score[label_id] for label_id in range(len(label2score))]

print(classification_report(labels, preds, target_names=target_names))

In [None]:
def plot_confusion_matrix(ytrue, ypreds, labels):
    """Plot the confusion matrix of ``ypreds`` against ``ytrue``.

    The matrix is computed with ``normalize="true"`` and drawn on a 12x12
    inch figure, with ``labels`` as the tick labels and cell values shown to
    two decimals.
    """
    normalized_cm = confusion_matrix(ytrue, ypreds, normalize="true")
    _, axis = plt.subplots(figsize=(12, 12))
    ConfusionMatrixDisplay(
        confusion_matrix=normalized_cm, display_labels=labels
    ).plot(cmap="Blues", values_format="0.2f", ax=axis, colorbar=False)
    plt.title("Normalized Confusion Matrix")
    plt.show()


plot_confusion_matrix(labels, preds, target_names)

## Predict scores against provided test set

In [None]:
# Re-read the provided test file: `test_df` was reassigned to the held-out
# split of the training data earlier in the notebook.
test_df = pd.read_csv(TEST_FILE)
test_df.head()

In [None]:
# Ids of the rows to score; the count is displayed as a quick check.
ids = test_df["id"].tolist()
len(ids)

In [None]:
def predict(anchor, target, context, model, tokenizer, label2score):
    """Predict the score for a single (anchor, target, context) triple.

    The three fields are joined with the tokenizer's separator token, the
    same layout ``compose_input`` produces for training, so the model sees
    inputs in the format it was fine-tuned on.

    Args:
        anchor, target, context: strings describing the pair to score.
        model: sequence classifier whose output exposes ``logits``.
        tokenizer: tokenizer providing ``sep_token`` and returning PyTorch
            tensors when called with ``return_tensors="pt"``.
        label2score: mapping from integer class id to the score to return.

    Returns:
        ``label2score`` entry for the arg-max class.
    """
    text = tokenizer.sep_token.join([anchor, target, context])
    inputs = tokenizer(text, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Inference only: make sure the model is in eval mode and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    pred_label = torch.argmax(outputs.logits, dim=-1)[0].item()
    return label2score[pred_label]


# Score every row of the provided test file and write the submission CSV.
# The context manager guarantees the file is flushed and closed even if a
# prediction fails part-way through.
with open(SUBMISSION_FILE, "w") as fout:
    fout.write("id,score\n")
    for row in test_df.itertuples(index=False):
        pred_score = predict(row.anchor, row.target, row.context,
                             model, tokenizer, label2score)
        fout.write("{:s},{:s}\n".format(row.id, pred_score))

# Show the first few predictions instead of printing one line per row.
pd.read_csv(SUBMISSION_FILE).head()