In [None]:
import os
import pandas as pd
import numpy as np
import shutil
import time
import gc
import random
import math
import torch
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import TrainingArguments, Trainer, DataCollatorForWholeWordMask
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel
from torch import nn
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Labelled training pairs; later cells read the columns id, anchor, target,
# context and score from this frame.
df = pd.read_csv(
    filepath_or_buffer='../input/us-patent-phrase-to-phrase-matching/train.csv',
)

In [None]:
# Lookup table joined onto the train/test frames via its `code` column to
# supply a `title` for each context.
df_context = pd.read_csv(
    filepath_or_buffer='../input/cpc-codes/titles.csv',
)

In [None]:
# Attach the title from titles.csv to every training row by matching the row's
# `context` against the lookup table's `code`.
# validate='many_to_one' makes pandas raise MergeError if a code appears more
# than once in the lookup table, instead of silently duplicating training rows.
df = df.merge(
    df_context,
    how='left',
    left_on='context',
    right_on='code',
    validate='many_to_one',
)
# Keep only the columns used downstream (drops `code` and any other lookup columns).
df = df[['id', 'anchor', 'target', 'context', 'title', 'score']]
df

In [None]:
# Unlabelled test pairs that the final submission is predicted for.
eval_df = pd.read_csv(
    filepath_or_buffer='../input/us-patent-phrase-to-phrase-matching/test.csv',
)

In [None]:
# Attach the title from titles.csv to every test row by matching the row's
# `context` against the lookup table's `code`.
# validate='many_to_one' makes pandas raise MergeError if a code appears more
# than once in the lookup table; without it such a duplicate would add extra
# test rows and the submission would no longer have one row per test id.
eval_df = eval_df.merge(
    df_context,
    how='left',
    left_on='context',
    right_on='code',
    validate='many_to_one',
)
# Keep only the columns used downstream (drops `code` and any other lookup columns).
eval_df = eval_df[['id', 'anchor', 'target', 'context', 'title']]
eval_df

In [None]:
# Path of the pretrained checkpoint; both the tokenizer and the model are
# loaded from this location in later cells.
model_nm = '../input/debertav3small'

In [None]:
# Load the tokenizer stored alongside the checkpoint at `model_nm`.
# NOTE(review): `tonkenizer` is a misspelling of "tokenizer". Other cells
# reference it under this exact name, so a rename has to be applied to all of
# them at once.
from transformers import AutoTokenizer
tonkenizer = AutoTokenizer.from_pretrained(model_nm)

In [None]:
# First text segment given to the tokenizer: "<target><sep token><lower-cased title>".
# The vectorised .str accessor replaces .apply(str.lower), which raises
# TypeError on the NaN that the left merge leaves when a context code has no
# title; such rows fall back to an empty title instead of aborting the run.
df['input'] = df['target'] + tonkenizer.sep_token + df['title'].fillna('').str.lower()

In [None]:
# Inspect the training frame now that the `input` column has been added.
df

In [None]:
# First text segment given to the tokenizer: "<target><sep token><lower-cased title>".
# The vectorised .str accessor replaces .apply(str.lower), which raises
# TypeError on the NaN that the left merge leaves when a context code has no
# title; such rows fall back to an empty title so every test row still gets a
# prediction.
eval_df['input'] = eval_df['target'] + tonkenizer.sep_token + eval_df['title'].fillna('').str.lower()

In [None]:
# Inspect the test frame now that the `input` column has been added.
eval_df

In [None]:
from sklearn.model_selection import train_test_split

# Share of labelled rows held out for validation, and the seed that makes the
# split repeatable across runs.
VALIDATION_FRACTION = 0.2
SPLIT_SEED = 42

# NOTE(review): rows are split at random, so if an anchor occurs in several rows
# it can land in both parts — consider a grouped split if validation scores
# look optimistic.
train_df, val_df = train_test_split(
    df,
    test_size=VALIDATION_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class TrainDataset(Dataset):
    """Map-style dataset yielding tokenized (input, anchor) pairs plus a score label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``input``, ``anchor`` and ``score``.
    tokenizer : callable, optional
        Called as ``tokenizer(input, anchor, max_length=..., padding='max_length',
        truncation=True)`` and expected to return a mapping. When omitted, the
        notebook-level ``tonkenizer`` is looked up each time an item is
        requested, which is how the class behaved before this argument existed.
    max_length : int, default 100
        Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer=None, max_length=100):
        # astype(str) guarantees every element is text (a missing value becomes
        # the literal 'nan').
        self.input = df['input'].values.astype(str)
        self.anchor = df['anchor'].values.astype(str)
        self.label = df['score'].values
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.input)

    def __getitem__(self, item):
        """Tokenize row ``item`` and return the encoding with a float ``label`` tensor."""
        # Resolve the fallback lazily so the global only needs to exist once
        # items are actually requested.
        tokenize = self.tokenizer if self.tokenizer is not None else tonkenizer

        model_inputs = tokenize(self.input[item], self.anchor[item],
                                max_length=self.max_length,
                                padding='max_length',
                                truncation=True)

        return {**model_inputs,
                'label': torch.as_tensor(self.label[item], dtype=torch.float)}
class evalDataset(Dataset):
    """Map-style dataset yielding tokenized (input, anchor) pairs without labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``input`` and ``anchor``.
    tokenizer : callable, optional
        Called as ``tokenizer(input, anchor, max_length=..., padding='max_length',
        truncation=True)`` and expected to return a mapping. When omitted, the
        notebook-level ``tonkenizer`` is looked up each time an item is
        requested, which is how the class behaved before this argument existed.
    max_length : int, default 100
        Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer=None, max_length=100):
        # astype(str) guarantees every element is text (a missing value becomes
        # the literal 'nan').
        self.input = df['input'].values.astype(str)
        self.anchor = df['anchor'].values.astype(str)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.input)

    def __getitem__(self, item):
        """Tokenize row ``item`` and return the encoding as a plain dict."""
        # Resolve the fallback lazily so the global only needs to exist once
        # items are actually requested.
        tokenize = self.tokenizer if self.tokenizer is not None else tonkenizer

        model_inputs = tokenize(self.input[item], self.anchor[item],
                                max_length=self.max_length,
                                padding='max_length',
                                truncation=True)

        return {**model_inputs}

In [None]:
def compute_metrics(eval_pred):
    """Return the Pearson correlation between predictions and labels.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predictions are
    reshaped to one dimension whose length equals their first dimension before
    being correlated with the labels. The result is a dict with the single key
    ``'pearson'``.
    """
    raw_scores, targets = eval_pred
    flat_scores = raw_scores.reshape(len(raw_scores))
    # corrcoef returns a 2x2 matrix; the off-diagonal entry is the correlation
    # between the two inputs.
    correlation_matrix = np.corrcoef(flat_scores, targets)
    return {'pearson': correlation_matrix[0][1]}

In [None]:
from transformers import AutoModelForSequenceClassification

# num_labels=1 requests a single output per example; the labels supplied by
# TrainDataset are float scores, so the head predicts one scalar per pair.
# NOTE(review): confirm in the transformers docs that a one-label head uses a
# regression loss for this architecture.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)

In [None]:
# 'pearson' is the key returned by compute_metrics; it selects the best checkpoint.
metric_name = 'pearson'
batch_size = 128

args = TrainingArguments(
    output_dir='model_test',
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=2 * batch_size,
    # Evaluate and checkpoint once per epoch, keep a single checkpoint on disk,
    # and reload the best-scoring one when training finishes.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [None]:
# Wrap both labelled splits; tokenization happens per item inside the dataset.
train_dataset, val_dataset = TrainDataset(train_df), TrainDataset(val_df)

# The validation split is scored with compute_metrics at every evaluation.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tonkenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; evaluation and checkpointing run once per epoch as
# configured in `args`.
trainer.train()

In [None]:
# Wrap the test frame for inference.
# NOTE(review): despite the `va_` prefix this is built from `eval_df`
# (test.csv), not from the validation split `val_df`.
va_dataset = evalDataset(eval_df)

In [None]:
# Run inference on the test dataset and keep only the raw predictions,
# converted to Python-float precision (float64).
prediction_output = trainer.predict(va_dataset)
outputs = prediction_output.predictions.astype(float)
outputs

In [None]:
# Bound every prediction to the closed interval [0, 1] (presumably the valid
# score range — confirm against the task definition).
outputs = outputs.clip(0, 1)

In [None]:
import datasets

# One row per test id, paired with its flattened (1-D) clipped prediction.
submission_columns = {
    'id': eval_df['id'],
    'score': outputs.flatten(),
}
submission = datasets.Dataset.from_dict(submission_columns)

# Write the submission file into the working directory without an index column.
submission.to_csv('submission.csv', index=False)