In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Installation and imports

In [None]:

import datasets,transformers

from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Set before any Trainer is created; per the transformers documentation this
# environment variable turns off the Weights & Biases logging integration.
os.environ["WANDB_DISABLED"] = "true"

# Config class

In [None]:
class config:
    """Central place for the paths and hyperparameters used by this notebook."""
    
    # Directory holding the competition files; 'train.csv' and 'test.csv' are
    # appended to this path when the datasets are loaded.
    input_path = '/kaggle/input/us-patent-phrase-to-phrase-matching/'
    # Local directory passed to from_pretrained() for both the model and the tokenizer.
    model_path = '/kaggle/input/roberta-base'
    # Model name; NOTE(review): not referenced anywhere else in the visible notebook.
    model = 'roberta-base'
    
    # Optimizer settings forwarded to TrainingArguments.
    learning_rate = 2e-5
    weight_decay = 0.01
    
    # Number of training epochs, and the per-device batch size used for both
    # training and evaluation.
    epochs = 5
    batch_size = 32

# Dictionary for sections

In [None]:
# Maps the first character of a row's `context` code to a human-readable
# section title; the title is prepended to the anchor text before tokenization.
sections = {
    "A": "Human Necessities",
    "B": "Operations and Transport",
    "C": "Chemistry and Metallurgy",
    "D": "Textiles",
    "E": "Fixed Constructions",
    "F": "Mechanical Engineering",
    "G": "Physics",
    "H": "Electricity",
    "Y": "Emerging Cross-Sectional Technologies",
}

In [None]:
# Load the sequence-classification model from the local checkpoint directory.
# num_labels = 1 requests a single output value per example, which is what the
# float `score` label and the later 1-D reshape of the predictions rely on.
model = AutoModelForSequenceClassification.from_pretrained(config.model_path, num_labels = 1)

# The tokenizer is loaded from the same directory so it matches the model.
tokenizer = AutoTokenizer.from_pretrained(config.model_path)

In [None]:
# Read the competition training file into a `datasets.Dataset`.
train_csv_path = config.input_path + 'train.csv'
df_train = datasets.Dataset.from_csv(train_csv_path)

# Bare expression so the notebook displays the dataset summary.
df_train

In [None]:
def process(unit, eval=False):
    """Tokenize one training row and attach its score as the label.

    Args:
        unit: Mapping for a single row; the keys 'context', 'anchor',
            'target' and 'score' are read.
        eval: Unused; kept so existing callers that pass it keep working.

    Returns:
        A dict with every field produced by the tokenizer plus a 'label'
        entry holding ``unit['score']``.

    Raises:
        KeyError: If the first character of ``unit['context']`` is not a key
            of ``sections``.
    """
    # The first character of the context code selects the section title.
    section_title = sections[unit['context'][0]]

    # NOTE(review): the section title and the anchor are joined with no
    # separator (e.g. 'Human Necessities' + 'abatement' gives
    # 'Human Necessitiesabatement'). If a separator is ever added, the same
    # change must be made for test_process so that training and inference
    # inputs keep the same format.
    first_segment = section_title + unit['anchor']

    # The tokenizer is called with two segments: section title + anchor, then target.
    features = dict(tokenizer(first_segment, unit['target']))
    features['label'] = unit['score']
    return features

# Tokenize every training row; the raw columns are dropped so that only the
# tokenizer fields and 'label' remain in the encoded dataset.
raw_train_columns = ['id', 'anchor', 'target', 'context', 'score']
encoded_ds = df_train.map(process, remove_columns = raw_train_columns)

In [None]:
# Sanity check: display one encoded row (index 100) to inspect the tokenizer
# output and the attached label.
encoded_ds[100]

In [None]:
# Hold out 20% of the encoded rows for evaluation. A fixed seed makes the
# split identical on every "Restart & Run All", so validation metrics and the
# checkpoint picked by load_best_model_at_end are comparable between runs.
# The result has a 'train' and a 'test' part, both used when building the Trainer.
encoded_ds = encoded_ds.train_test_split(test_size = 0.2, seed = 42)

# Training Setup

In [None]:
def compute_metrics(eval_pred):
    """Compute the Pearson correlation between predictions and labels.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is
            reshaped to a 1-D array with one entry per row, so it must hold
            exactly one value per example.

    Returns:
        A dict with a single key 'pearson' holding the off-diagonal entry of
        the 2x2 correlation matrix returned by ``np.corrcoef``.
    """
    raw_predictions, gold_labels = eval_pred

    # Collapse the trailing dimension so both inputs are 1-D for corrcoef.
    flat_predictions = raw_predictions.reshape(len(raw_predictions))

    correlation_matrix = np.corrcoef(flat_predictions, gold_labels)
    return {'pearson': correlation_matrix[0][1]}

# Training configuration. Hyperparameters come from `config`; evaluation and
# checkpointing both happen once per epoch, and the checkpoint with the best
# "pearson" value is reloaded when training finishes.
args = TrainingArguments(
    output_dir = "uspppm",
    num_train_epochs = config.epochs,
    learning_rate = config.learning_rate,
    weight_decay = config.weight_decay,
    per_device_train_batch_size = config.batch_size,
    per_device_eval_batch_size = config.batch_size,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
    metric_for_best_model = "pearson",
)


# Wire the model, the training arguments, both data splits, the tokenizer and
# the Pearson metric into a single Trainer instance.
trainer = Trainer(
    model = model,
    args = args,
    train_dataset = encoded_ds['train'],
    eval_dataset = encoded_ds['test'],
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,
)

# Baseline evaluation (before training)

In [None]:
# Evaluate on the held-out split before trainer.train() has been called, so
# the displayed metrics are a baseline for the freshly loaded model.
trainer.evaluate()

# Training

In [None]:
# Run training with the settings in `args`: config.epochs epochs, with an
# evaluation and a checkpoint at the end of each epoch.
trainer.train()

# Prediction

In [None]:
def test_process(unit, eval = False):
    sig = unit['context'][0]
    prefix = sections[sig]
    text = unit['anchor']
    
    return {
        **tokenizer(prefix+text, unit['target']),
        'label':-1
    }

# Load the competition test file and encode it with test_process, dropping
# the raw columns once they have been tokenized.
test_csv_path = config.input_path + 'test.csv'
test = datasets.Dataset.from_csv(test_csv_path)
raw_test_columns = ['id', 'anchor', 'target', 'context']
encoded_test = test.map(test_process, remove_columns = raw_test_columns)

# Run inference and flatten the model output to a 1-D array
# (presumably one value per test row, given num_labels = 1 — confirm).
outputs = trainer.predict(encoded_test)
predictions = outputs.predictions.reshape(-1)

# Submission

In [None]:
# Pair each test id with its predicted score and write the result to
# 'submission.csv' without an index column.
submission_columns = {
    'id' : test['id'],
    'score' : predictions,
}
submission = datasets.Dataset.from_dict(submission_columns)

submission.to_csv('submission.csv', index = False)