In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below /kaggle/input, then print
# them one per line in the order os.walk visits them.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from pathlib import Path
import warnings
import logging

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
warnings.filterwarnings("ignore")
logging.disable(logging.WARNING)

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from datasets import load_dataset, Dataset, DatasetDict

In [None]:
def prepare_df(df, tokenizer):
    """Return a copy of ``df`` with the model input text columns added.

    The ``score`` column (when present) is renamed to ``label``. Three
    columns are appended, in this order:

    * ``section`` - first character of the whitespace-stripped ``context``
    * ``sec_tok`` - the section wrapped in square brackets, e.g. ``"[A]"``
    * ``inputs``  - ``sec_tok``, ``context``, lower-cased ``anchor`` and
      lower-cased ``target`` joined by the tokenizer's separator token
      surrounded by single spaces

    Args:
        df: frame with string columns ``context``, ``anchor`` and ``target``.
        tokenizer: any object exposing a string ``sep_token`` attribute.

    Returns:
        A new DataFrame; the frame passed in is left untouched.
    """
    def leading_char(code):
        return code.strip()[0]

    joiner = " ".join(["", tokenizer.sep_token, ""])

    out = df.rename(columns={"score": "label"})
    out["section"] = out["context"].map(leading_char)
    out["sec_tok"] = "[" + out["section"] + "]"

    # Fold the remaining pieces onto the section token, left to right.
    text = out["sec_tok"]
    for piece in (out["context"], out["anchor"].str.lower(), out["target"].str.lower()):
        text = text + joiner + piece
    out["inputs"] = text
    return out

In [None]:
def get_ds(df, tok_func):
    """Build a tokenized ``Dataset`` from a prepared DataFrame.

    Args:
        df: frame containing at least the columns ``id``, ``anchor``,
            ``target``, ``context`` and ``section`` (they are removed after
            mapping) plus whatever ``tok_func`` reads.
        tok_func: callable applied to batches of rows (``batched=True``).

    Returns:
        The mapped ``Dataset`` with the columns listed above removed.
    """
    drop_after_mapping = ["id", "anchor", "target", "context", "section"]
    return Dataset.from_pandas(df).map(
        tok_func,
        batched=True,
        remove_columns=drop_after_mapping,
    )

def get_model(model_name):
    """Load a sequence-classification model configured with ``num_labels=1``.

    Args:
        model_name: checkpoint identifier or path accepted by
            ``AutoModelForSequenceClassification.from_pretrained``.

    Returns:
        The loaded model.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=1,
    )

def get_tokenizer(model_name):
    """Load the tokenizer for ``model_name``.

    Args:
        model_name: checkpoint identifier or path accepted by
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The loaded tokenizer.
    """
    return AutoTokenizer.from_pretrained(model_name)

def tok_func(x):
    """Tokenize the ``"inputs"`` field of a batch with the global ``tokenizer``.

    The previous body called ``tokz``, a name that is never defined in this
    notebook, so any call raised ``NameError``. The notebook's tokenizer
    global is named ``tokenizer``.

    Args:
        x: mapping with an ``"inputs"`` entry (for example a batch handed
            over by ``Dataset.map``).

    Returns:
        Whatever ``tokenizer(x["inputs"])`` returns.
    """
    return tokenizer(x["inputs"])


def corr(eval_pred):
    """Compute the Pearson correlation between predictions and labels.

    Args:
        eval_pred: iterable whose first two items are the predictions and
            the labels (for example the object ``Trainer`` passes to
            ``compute_metrics``). Any further items are ignored.

    Returns:
        ``{'pearson': r}`` where ``r`` is the off-diagonal entry of the 2x2
        correlation matrix.
    """
    preds, labels = tuple(eval_pred)[:2]
    # A single-output regression head may yield predictions shaped (N, 1)
    # while labels are (N,). Flatten both so np.corrcoef always sees two
    # 1-D variables of equal length instead of mismatched 2-D inputs.
    preds = np.asarray(preds, dtype=float).reshape(-1)
    labels = np.asarray(labels, dtype=float).reshape(-1)
    return {'pearson': np.corrcoef(preds, labels)[0][1]}

In [None]:
# Checkpoint shared by the model and tokenizer loaders.
model_name = "microsoft/deberta-v3-small"

model, tokenizer = get_model(model_name), get_tokenizer(model_name)

In [None]:
# Competition data directory, relative to the notebook's working directory.
DATA_DIR = Path("../input/us-patent-phrase-to-phrase-matching")

train_df = pd.read_csv(DATA_DIR / "train.csv")
# Fixed: the test frame used to be read from train.csv by mistake.
test_df = pd.read_csv(DATA_DIR / "test.csv")

# Add the section / sec_tok / inputs columns used for tokenization.
train_df = prepare_df(train_df, tokenizer)
test_df = prepare_df(test_df, tokenizer)

# Register every bracketed section tag seen in training (e.g. "[A]") as a
# special token, then resize the embedding matrix to the new vocabulary size.
tokenizer.add_special_tokens({"additional_special_tokens": list(train_df.sec_tok.unique())})
model.resize_token_embeddings(len(tokenizer))
print(tokenizer.special_tokens_map)

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the prepared training frame.
train_df.info()

In [None]:
# Split by anchor so that no anchor phrase appears in both splits: shuffle the
# unique anchors with a fixed seed and hold out the first 25% for validation.
np.random.seed(42)
anchors = np.random.permutation(train_df["anchor"].unique())
valid_anchor_count = int(len(anchors) * 0.25)
valid_anchors = anchors[:valid_anchor_count]

is_valid = train_df["anchor"].isin(valid_anchors)
valid_indxs = train_df[is_valid].index
train_indxs = train_df[~is_valid].index

# These are index *labels*, so select with .loc. The earlier .iloc call was
# positional and only gave the right rows because the frame has a default
# RangeIndex.
train_ds = get_ds(train_df.loc[train_indxs], tok_func=lambda x: tokenizer(x["inputs"]))
valid_ds = get_ds(train_df.loc[valid_indxs], tok_func=lambda x: tokenizer(x["inputs"]))

In [None]:
def get_trainer(model, train_ds, eval_ds, tokenizer, **training_args):
    """Assemble a ``Trainer`` that reports Pearson correlation via ``corr``.

    Args:
        model: model to train.
        train_ds: training dataset.
        eval_ds: evaluation dataset.
        tokenizer: tokenizer handed to the ``Trainer``.
        **training_args: forwarded to ``TrainingArguments``; the output
            directory is fixed to ``"outputs"``.

    Returns:
        The configured ``Trainer``.
    """
    trainer_kwargs = dict(
        model=model,
        args=TrainingArguments("outputs", **training_args),
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
        compute_metrics=corr,
    )
    return Trainer(**trainer_kwargs)

In [None]:
# Hyperparameters forwarded to TrainingArguments through get_trainer.
training_args = {
    "learning_rate": 8e-5,
    "warmup_ratio": 0.1,
    "weight_decay": 0.01,
    "fp16": True,
    "lr_scheduler_type": "cosine",
    "evaluation_strategy": "epoch",
    "per_device_train_batch_size": 128,
    "per_device_eval_batch_size": 256,
    "num_train_epochs": 4,
    "report_to": "none",
}
trainer = get_trainer(
    model,
    train_ds,
    valid_ds,
    tokenizer,
    **training_args,
)

In [None]:
# Display the tokenized training split as the cell output.
train_ds

In [None]:
# Fine-tune the model with the trainer configured above.
trainer.train()

In [None]:
# Persist the model and tokenizer under deberta_small_v1/ (relative to the
# current working directory).
model.save_pretrained("deberta_small_v1/model")
tokenizer.save_pretrained("deberta_small_v1/tokenizer")

In [None]:
!ls "/kaggle/working/model"

In [None]:
# Reload the fine-tuned artifacts from the directories they were saved to.
# The earlier "/kaggle/working/model" and "/kaggle/working/tokenizer" paths
# are never written by this notebook, so loading from them would fail.
model = AutoModelForSequenceClassification.from_pretrained("deberta_small_v1/model")
tokenizer = AutoTokenizer.from_pretrained("deberta_small_v1/tokenizer")
# Inference-only trainer: default arguments, no datasets attached.
trainer = Trainer(model, tokenizer=tokenizer)

In [None]:
# Inference inputs: read the test set, build its text inputs and tokenize
# them with the reloaded tokenizer.
test_df = prepare_df(pd.read_csv(DATA_DIR / "test.csv"), tokenizer)
test_ds = get_ds(test_df, tok_func=lambda batch: tokenizer(batch["inputs"]))

In [None]:
import shutil

# Bundle the saved model/tokenizer directory into deberta_small_v1.zip in the
# current working directory.
shutil.make_archive(
    base_name="deberta_small_v1",
    format="zip",
    root_dir="/kaggle/working/deberta_small_v1",
)

In [None]:
# Predict on the test set, clamp the scores into [0, 1] and flatten them to
# one value per row.
raw_scores = trainer.predict(test_ds).predictions
preds = raw_scores.astype(float).clip(0, 1).ravel()

# Two-column submission file: id, score.
sub_df = test_df[["id"]].assign(score=preds)
sub_df.to_csv("submission.csv", index=False)