In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
from datasets import Dataset

In [None]:
def prepare_df(df, tokenizer):
    """Return a copy of ``df`` extended with the text fed to the model.

    The ``score`` column (if present) is renamed to ``label``; the caller's
    frame is left untouched. Three columns are appended, in this order:

    * ``section`` -- first character of each ``context`` value after
      surrounding whitespace is stripped.
    * ``sec_tok`` -- that character wrapped in square brackets.
    * ``inputs`` -- ``sec_tok``, the unmodified ``context``, the lower-cased
      ``anchor`` and the lower-cased ``target``, joined by the tokenizer's
      separator token padded with one space on each side.

    Args:
        df: DataFrame with ``context``, ``anchor`` and ``target`` columns.
        tokenizer: Object exposing a string ``sep_token`` attribute.

    Returns:
        The new DataFrame with the extra columns.
    """
    def _first_char(code):
        # Whitespace around the value is ignored when picking the character.
        return code.strip()[0]

    out = df.rename(columns={"score": "label"})
    joiner = " " + tokenizer.sep_token + " "

    out["section"] = out["context"].map(_first_char)
    out["sec_tok"] = "[" + out["section"] + "]"

    pieces = (
        out["sec_tok"],
        out["context"],
        out["anchor"].str.lower(),
        out["target"].str.lower(),
    )
    # Fold the pieces left to right, inserting the separator between each pair.
    text = pieces[0]
    for piece in pieces[1:]:
        text = text + joiner + piece
    out["inputs"] = text

    return out

def get_ds(df, tok_func):
    """Convert a prepared DataFrame into a tokenized ``Dataset``.

    Args:
        df: DataFrame containing at least the ``id``, ``anchor``, ``target``,
            ``context`` and ``section`` columns, plus whatever ``tok_func``
            reads.
        tok_func: Callable applied to batches of rows via ``Dataset.map``
            (``batched=True``).

    Returns:
        The mapped dataset with the five columns listed above removed.
    """
    # Raw text columns are dropped once the tokenizing function has run.
    dropped = ["id", "anchor", "target", "context", "section"]
    return Dataset.from_pandas(df).map(
        tok_func,
        batched=True,
        remove_columns=dropped,
    )

In [None]:
# Locations of the saved model weights and tokenizer files.
MODEL_DIR = "../input/patents-comp/model"
TOKENIZER_DIR = "../input/patents-comp/tokenizer"

# Load both artefacts from disk and wrap them in a Trainer; no training
# arguments are supplied here.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
trainer = Trainer(model=model, tokenizer=tokenizer)

In [None]:
# Test-set preparation: read the CSV, build the model input text, tokenize.
def _tokenize_batch(batch):
    """Tokenize the ``inputs`` field of a batch with the loaded tokenizer."""
    return tokenizer(batch["inputs"])


test_df = prepare_df(
    pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv"),
    tokenizer,
)
test_ds = get_ds(test_df, tok_func=_tokenize_batch)

In [None]:
# Run the model over the tokenized test set and take the raw predictions.
prediction_output = trainer.predict(test_ds)
raw_scores = prediction_output.predictions.astype(float)

# Clamp every value into [0, 1] and flatten to one dimension.
preds = np.clip(raw_scores, 0, 1).reshape(-1)

# Pair each test id with its score and write the file without the index.
sub_df = test_df[["id"]].assign(score=preds)
sub_df.to_csv("submission.csv", index=False)