## Set up

In [None]:
# Install or upgrade (-U) the Hugging Face `datasets` library and the Weights & Biases
# client, with pip output silenced (-qq).
# NOTE(review): the upgrade is unpinned, while the imports below rely on
# `datasets.load_metric`. I believe that helper is no longer shipped in datasets >= 3.0;
# confirm, and pin a compatible version if so.
!pip install -Uqq datasets wandb

In [None]:
import os
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    Trainer, 
    TrainingArguments, 
    DataCollatorWithPadding
)
from datasets import Dataset, DatasetDict, load_metric
import wandb

from pathlib import Path
# Directory holding the competition files (a Kaggle input mount path).
input_dir = Path("/kaggle/input/us-patent-phrase-to-phrase-matching/")

# Process environment variables, set through the IPython `%env` magic.
# TOKENIZERS_PARALLELISM: presumably read by the Hugging Face tokenizers backend; confirm.
%env TOKENIZERS_PARALLELISM=true
# WANDB_*: presumably picked up by wandb and the Trainer's wandb reporting
# (account, project, model watching); confirm against the W&B docs.
%env WANDB_ENTITY=arampacha
%env WANDB_PROJECT=patent_phrase_matching
%env WANDB_WATCH=false

# Fetch the W&B API key from the Kaggle secrets store and log in with it, so the key
# itself never appears in the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wandb_api_key = user_secrets.get_secret("wandb_api_key")
wandb.login(key=wandb_api_key)

In [None]:
df = pd.read_csv(input_dir/"train.csv")
df.head()

In [None]:
category = df.context.map(lambda x: x[0]).tolist()

In [None]:
counts = Counter(category)

In [None]:
x = sorted(list(counts.keys()))

In [None]:
test_df = pd.read_csv(input_dir/"test.csv")

test_counts = Counter(test_df.context.map(lambda x: x[0]).tolist())
test_counts

In [None]:
# Two stacked bar charts: rows per leading context character, train above test.
fig, axs = plt.subplots(nrows=2, ncols=1)

for ax, (title, tally) in zip(axs, (("train", counts), ("test", test_counts))):
    # Both charts share the x categories taken from the training data.
    sns.barplot(x=x, y=[tally[k] for k in x], ax=ax)
    ax.set_title(title)

plt.show()

In [None]:
anchors = set(df.anchor.tolist())
test_anchors = set(test_df.anchor.tolist())
len(anchors), len(test_anchors)

In [None]:
anchors.intersection(test_anchors)

In [None]:
targets = set(df.target.tolist())
test_targets = set(test_df.target.tolist())
len(targets), len(test_targets)

In [None]:
len(anchors.intersection(targets))

In [None]:
len(test_anchors.intersection(test_targets))

In [None]:
len(anchors.intersection(targets))

## Baseline training

In [None]:
# Checkpoint to fine-tune. Alternative checkpoint kept for reference:
# model_id = "google/electra-small-discriminator"
model_id = "microsoft/deberta-v3-xsmall"

# Tokenizer and model are loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# num_labels=1 requests a single-output head.
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1)

In [None]:
# Map each distinct training score to its position (0, 1, ...) in ascending order.
cls_map = {score: idx for idx, score in enumerate(sorted(df.score.unique()))}

def preprocess(example):
    """Build the model input text for one row, plus a class index when a score exists.

    Args:
        example: mapping with "context", "anchor" and "target" entries and,
            for labelled rows, a "score" entry.

    Returns:
        A dict with key "text" (the formatted prompt). When "score" is present,
        the dict also has "cls", the score looked up in the module-level `cls_map`.
    """
    prompt = f"Category: {example['context']} Text 1: {example['anchor']} Text 2: {example['target']}"
    # Unlabelled rows carry no score, so only the text is produced for them.
    if "score" not in example:
        return {"text": prompt}
    return {"text": prompt, "cls": cls_map[example["score"]]}

In [None]:
# Wrap both dataframes as Hugging Face datasets, then run `preprocess` on every row
# of each split.
splits = {
    "train": Dataset.from_pandas(df),
    "test": Dataset.from_pandas(test_df),
}
dataset = DatasetDict(splits).map(preprocess)
# Show one processed training row as a sanity check.
dataset["train"][0]

In [None]:
def tokenize(batch):
    """Tokenize a batch of rows and attach the scores as labels when available.

    Args:
        batch: mapping of column name to a list of values; must contain "text"
            and may contain "score".

    Returns:
        The output of the module-level `tokenizer` for `batch["text"]`, with a
        "labels" entry set to `batch["score"]` when the batch has scores.
    """
    encoded = tokenizer(batch["text"])
    # Batches without a score column get no labels.
    if "score" not in batch:
        return encoded
    encoded["labels"] = batch["score"]
    return encoded

# Tokenize each split in batches and drop all of its original columns, keeping only
# what `tokenize` returns.
train_dataset, test_dataset = (
    dataset[split].map(
        tokenize,
        batched=True,
        remove_columns=dataset[split].column_names,
    )
    for split in ("train", "test")
)

In [None]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(shuffle=True, random_state=124)

In [None]:
for fold_id, (train_idx, valid_idx) in enumerate(skf.split(dataset["train"], dataset["train"]["cls"])):
    break

In [None]:
# Learning rate handed to the Trainer; the schedule settings below (cosine, 10% warmup)
# apply to it.
lr = 8e-5

training_args = TrainingArguments(
    # Checkpoints are written here during training.
    output_dir="tmp/",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    # Evaluate and checkpoint once per epoch; the two strategies are kept identical
    # because load_best_model_at_end=True is set below.
    # NOTE(review): newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy`. Confirm against the installed version.
    evaluation_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,
    learning_rate=lr,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    num_train_epochs=5,
    max_steps=-1,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    logging_strategy="steps",
    logging_steps=50,
    save_strategy="epoch",
    seed=12,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    dataloader_drop_last=False,
    # os.cpu_count() returns None when the CPU count cannot be determined, and
    # min(4, None) raises TypeError; fall back to one worker in that case.
    dataloader_num_workers=min(4, os.cpu_count() or 1),
    # Keep the checkpoint with the highest "pearsonr" value from compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="pearsonr",
    greater_is_better=True,
    report_to="wandb",
)

# Collator built from the tokenizer; pads each batch when it is assembled.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Pearson correlation metric, loaded once and reused on every evaluation pass.
pearson_metric = load_metric("pearsonr")


def compute_metrics(prediction_output):
    """Compute the Pearson correlation between model outputs and reference scores.

    Args:
        prediction_output: object exposing `predictions` and `label_ids`
            array-likes (the argument the Trainer passes to `compute_metrics`).

    Returns:
        Whatever `pearson_metric.compute` returns.
        NOTE(review): expected to be a dict keyed "pearsonr", matching
        `metric_for_best_model`. Confirm.
    """
    # With a single-output head the predictions presumably arrive as (n, 1). Flatten
    # both arrays to 1-D so the metric receives one scalar per example instead of
    # relying on implicit length-1-array-to-float coercion.
    predictions = np.asarray(prediction_output.predictions).reshape(-1)
    labels = np.asarray(prediction_output.label_ids).reshape(-1)
    return pearson_metric.compute(predictions=predictions, references=labels)

In [None]:
# Split the tokenized training data by the selected fold's indices: `train_idx` rows
# for fitting, `valid_idx` rows for per-epoch evaluation.
fold_splits = {
    "train_dataset": train_dataset.select(train_idx),
    "eval_dataset": train_dataset.select(valid_idx),
}

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **fold_splits,
)

In [None]:
# Name the run after the checkpoint (text after the last "/" in model_id) and the fold;
# the group name omits the fold so every fold of one model shares a group.
model_name = model_id.split("/")[-1]
wandb.init(name=f"simple-{model_name}-fold{fold_id}", group=f"simple-{model_name}")

In [None]:
# Fine-tune on the selected fold.
trainer.train()

In [None]:
# Delete the ./tmp directory (the training arguments use output_dir="tmp/").
!rm -rf ./tmp

In [None]:
# Save the trainer's model under a directory named after the fold.
trainer.save_model(f"fold_{fold_id}")

In [None]:
# Run inference on the tokenized test split.
prediction_outputs = trainer.predict(test_dataset)

In [None]:
# Drop singleton axes from the raw predictions (presumably (n, 1), given the
# single-output head), then clamp the values to the [0, 1] interval.
raw_scores = prediction_outputs.predictions.squeeze()
test_scores = np.clip(raw_scores, 0.0, 1.0)

# Pair every test id with its score and write the submission file without the index.
submission = pd.DataFrame({"id": test_df["id"], "score": test_scores})
submission.to_csv("submission.csv", index=False)