# Loading Data

In [None]:

import os
from pathlib import Path
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

if iskaggle:
    path = Path('../input/us-patent-phrase-to-phrase-matching')
    ! pip install -q datasets

In [None]:
path = Path('us-patent-phrase-to-phrase-matching')


if iskaggle:
    path = Path('../input/us-patent-phrase-to-phrase-matching')
    
print(path)  
!ls {path}

1. Load CSV Data with Pandas

In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

# Read the training CSV from the dataset directory selected earlier.
df = pd.read_csv(path / 'train.csv')
# Summarize both string (object) and numeric columns in one table.
df.describe(include=['object', np.number])

In [None]:
def append_input(df):
    """Return a copy of `df` with an `input` column built from three text fields.

    Each value has the form
    'TEXT1: <context>; TEXT2: <target>; ANC1: <anchor>'.
    The frame passed in is left untouched, so earlier cells that displayed it
    stay accurate and re-running the calling cell is harmless.

    Args:
        df: DataFrame providing string columns `context`, `target` and `anchor`.

    Returns:
        A new DataFrame with the `input` column added (or replaced if present).
    """
    return df.assign(
        input='TEXT1: ' + df.context + '; TEXT2: ' + df.target + '; ANC1: ' + df.anchor
    )

# Add the combined `input` text column to the training frame, then preview it.
df = append_input(df)
df.head()

2. Tokenize and Numericalize

In [None]:
from transformers import AutoTokenizer
# Checkpoint name; the same variable is reused when the model is loaded, so
# tokenizer and model come from one checkpoint.
model_nm = 'microsoft/deberta-v3-small'
tokenizer = AutoTokenizer.from_pretrained(model_nm)

3. Convert Training Data into a Hugging Face Dataset

In [None]:
import datasets
# Tokenize the `input` column in batches and expose `score` under the name
# `labels`.
ds = (
    datasets.Dataset.from_pandas(df)
    .map(lambda batch: tokenizer(batch['input']), batched=True)
    .rename_columns({'score': 'labels'})
)
# Hold out 25% of the rows for evaluation; fixed seed keeps the split repeatable.
datasets_dict = ds.train_test_split(test_size=0.25, seed=42)

In [None]:
# Display the train/test split created by train_test_split.
datasets_dict

# Train Model with the Transformers Trainer API

1. Initialize TrainingArguments

In [None]:
from transformers import TrainingArguments

# Tunable hyperparameters: batch size, peak learning rate, number of epochs.
bs, lr, epochs = 128, 8e-5, 4

# Training configuration: evaluate once per epoch, cosine learning-rate
# schedule with 10% warmup, mixed precision on, no external experiment logging.
args = TrainingArguments(
    output_dir='outputs',
    evaluation_strategy='epoch',
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=2 * bs,
    learning_rate=lr,
    weight_decay=0.01,
    num_train_epochs=epochs,
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    fp16=True,
    report_to='none',
)

2. Initiate a Model

In [None]:
from transformers import AutoModelForSequenceClassification
# Load the checkpoint with a single-output classification head (num_labels=1).
# NOTE(review): presumably transformers treats this as a regression head — confirm.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels = 1)

3. Initiate a Trainer

In [None]:
from transformers import Trainer
def corr(x, y):
    """Return the Pearson correlation coefficient between `x` and `y`.

    Both inputs are flattened first. `np.corrcoef` treats each row of a 2-D
    input as a separate variable, so a column vector of shape (N, 1) must be
    flattened before it can be compared with a flat vector of shape (N,).
    """
    return np.corrcoef(np.ravel(x), np.ravel(y))[0, 1]


def corr_ep(eval_preds):
    """Metric callback: Pearson correlation between predictions and labels.

    Args:
        eval_preds: object exposing array-like `.predictions` and `.label_ids`.

    Returns:
        A dict with the single key 'Pearson'.
    """
    return {'Pearson': corr(eval_preds.predictions, eval_preds.label_ids)}
# Wire model, training configuration, both dataset splits, tokenizer and the
# Pearson metric callback into one Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=datasets_dict['train'],
    eval_dataset=datasets_dict['test'],
    tokenizer=tokenizer,
    compute_metrics=corr_ep,
)

4. Train Model

In [None]:
# Fine-tune using the settings in `args` (which request evaluation every epoch).
trainer.train()

# Test Model

1. Loading Evaluation Data

In [None]:
# Load the test CSV and build the same `input` text column as for training.
eval_df = pd.read_csv(path / 'test.csv')
eval_df = append_input(eval_df)
# Build the dataset the same way as the training one: from_pandas (the source
# is a DataFrame, not a dict) with batched tokenization.
eval_ds = datasets.Dataset.from_pandas(eval_df).map(lambda x: tokenizer(x['input']), batched = True)
# Sanity-check one row: the raw text next to its token ids.
eval_row = eval_ds[0]
eval_row['input'], eval_row['input_ids']

2. Predict with Model

In [None]:
# Run inference on the tokenized test set and cast the raw outputs to float.
preds = trainer.predict(eval_ds).predictions.astype(float)
preds

In [None]:
# Clamp predictions into [0, 1].
# NOTE(review): presumably the valid range of the target score — confirm.
preds = np.clip(preds, 0, 1)

In [None]:
# Inspect the clipped predictions together with their container type.
preds, type(preds)

3. Submit Predictions

In [None]:
# `score` has to be a flat column with one number per id.
# NOTE(review): predict() output may be shaped (N, 1) for the single-output
# head, which would turn each CSV cell into a list; reshape(-1) flattens it and
# is a no-op when the array is already 1-D.
submission = datasets.Dataset.from_dict({
    'id': eval_ds['id'], 'score': preds.reshape(-1)
})
submission.to_csv('submission_v4.csv', index = False)