In [None]:
import os
import datasets, transformers

from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import AutoModelForMaskedLM

import numpy as np

# Set in the first cell, before any Trainer exists. Intended to keep the
# Hugging Face Trainer from reporting runs to Weights & Biases.
os.environ["WANDB_DISABLED"] = "true"

<font size=17> CONFIG </font>

In [None]:
class CFG:
    """Notebook-wide settings: input locations and training hyperparameters."""

    # --- Locations (relative input directories) ---
    input_path = '../input/us-patent-phrase-to-phrase-matching/'
    # Alternative checkpoint previously tried here:
    #   model_path = '../input/roberta-base', model = 'roberta-base'
    model_path = '../input/bert-for-patents/bert-for-patents'
    model = 'bert-for-patents'

    # --- Optimiser ---
    learning_rate = 2e-5
    weight_decay = 0.01

    # --- Schedule ---
    epochs = 5
    batch_size = 32

<font size=17> Preprocess context </font>

In [None]:
# Section letter -> section title. `process` looks up the first character of
# each row's `context` value in this mapping to build a text prefix.
table = """
A: Human Necessities
B: Operations and Transport
C: Chemistry and Metallurgy
D: Textiles
E: Fixed Constructions
F: Mechanical Engineering
G: Physics
H: Electricity
Y: Emerging Cross-Sectional Technologies
"""
# Drop the empty lines produced by the leading/trailing newline of the literal.
splits = [line for line in table.splitlines() if line]
# Each remaining line is "<letter>: <title>"; the two halves become key and value.
table = dict(line.split(': ')[:2] for line in splits)
table

<font size=17> Load model and tokenizer </font>

In [None]:
# Tokenizer and model are both read from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

# num_labels=1 asks for a single output per example; downstream code
# (`compute_metrics`) treats that output as a continuous score.
model = AutoModelForSequenceClassification.from_pretrained(
    CFG.model_path,
    num_labels=1,
)

<font size=17> Load dataset </font>

In [None]:
# Training rows live in train.csv directly under the configured input directory.
train_csv = f"{CFG.input_path}train.csv"
train = datasets.Dataset.from_csv(train_csv)
train

<font size=17> Tokenize </font>

In [None]:
def process(unit, eval = False):
    """Tokenize one training row into model inputs plus its regression label.

    The first character of ``unit['context']`` is looked up in the module-level
    ``table`` to obtain a section title. That title is prepended to the anchor
    phrase, and the result is passed together with ``unit['target']`` to the
    module-level ``tokenizer`` as a two-segment input.

    Args:
        unit: Mapping with 'context', 'anchor', 'target' and 'score' entries.
        eval: Unused; kept so existing callers that pass it keep working.

    Returns:
        dict: Every field produced by the tokenizer, plus ``'label'`` set to
        ``unit['score']``.

    Raises:
        KeyError: If the first character of ``unit['context']`` is not a key
            of ``table``.
    """
    section_title = table[unit['context'][0]]

    # Join title and anchor with a space. Plain concatenation fuses the last
    # word of the title with the first word of the anchor (for example
    # "Human Necessitiesabatement"), which corrupts both words for the tokenizer.
    first_segment = f"{section_title} {unit['anchor']}"

    return {
        **tokenizer(first_segment, unit['target']),
        'label': unit['score'],
    }

# Tokenize every row and drop the raw columns, leaving only the tokenizer
# fields and 'label' returned by `process`.
raw_columns = ['id', 'anchor', 'target', 'context', 'score']
encoded_ds = train.map(process, remove_columns=raw_columns)

<font size=17> Split into train & validation sets </font>

In [None]:
# Hold out 10% of the rows for validation. The seed is fixed so that the split
# is the same after every kernel restart; without it the validation rows change
# from run to run and per-epoch metrics are not comparable.
encoded_ds = encoded_ds.train_test_split(test_size=0.1, seed=42)
encoded_ds

<font size=17> Training setup</font>

In [None]:

def compute_metrics(eval_pred):
    """Compute the Pearson correlation between model outputs and labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` must hold
            exactly one value per example so it can be flattened to 1-D.

    Returns:
        dict: ``{'pearson': r}`` where ``r`` is the off-diagonal entry of the
        2x2 correlation matrix of predictions and labels.
    """
    raw_outputs, targets = eval_pred

    # Flatten to one value per example; reshape raises if there is more than one.
    n_examples = len(raw_outputs)
    flat_outputs = raw_outputs.reshape(n_examples)

    correlation_matrix = np.corrcoef(flat_outputs, targets)
    return {'pearson': correlation_matrix[0][1]}


args = TrainingArguments(
    output_dir="uspppm",
    # Optimisation settings come from CFG.
    learning_rate=CFG.learning_rate,
    weight_decay=CFG.weight_decay,
    num_train_epochs=CFG.epochs,
    per_device_train_batch_size=CFG.batch_size,
    per_device_eval_batch_size=CFG.batch_size,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the best 'pearson' value reported by compute_metrics.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="pearson",
)

# Wire the model, arguments, both dataset splits and the metric into one Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_ds["train"],
    eval_dataset=encoded_ds["test"],  # the held-out split serves as validation data
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

<font size=17> Test if everything's OK</font>

In [None]:
# Smoke test: run one evaluation pass over the held-out split before training,
# which exercises the encoded data and compute_metrics end to end.
trainer.evaluate()

<font size=17> Train it! </font>

In [None]:
# Fine-tune using the settings in `args` (CFG.epochs epochs, with evaluation
# and a checkpoint after each one).
trainer.train()

<font size=17> Make predictions </font>

In [None]:
def test_process(unit, eval = False):
    
    sig = unit['context'][0]
    prefix = table[sig]
    text = unit['anchor']
    
    return {
        **tokenizer( prefix + text, unit['target']),
        'label':-1
    }



# Load the test rows from the same input directory as the training data.
test = datasets.Dataset.from_csv(f"{CFG.input_path}test.csv")

# Tokenize with test_process and drop the raw columns (test rows have no 'score').
test_raw_columns = ['id', 'anchor', 'target', 'context']
encoded_test = test.map(test_process, remove_columns=test_raw_columns)

# Flatten the model outputs to a 1-D array of scores.
outputs = trainer.predict(encoded_test)
predictions = outputs.predictions.reshape(-1)


<font size=17> Make submission file </font>

In [None]:

# Pair each test id with its predicted score, in the original row order.
submission_columns = {
    'id': test['id'],
    'score': predictions,
}
submission = datasets.Dataset.from_dict(submission_columns)

submission.to_csv('submission.csv', index=False)