# BERT for Patents Baseline

- [kfold strategy](https://www.kaggle.com/code/abhishek/phrase-matching-folds)
- Utilize [Cooperative Patent Classification Codes Meaning](https://www.kaggle.com/datasets/xhlulu/cpc-codes)
- Reference: [phantivia's notebook](https://www.kaggle.com/code/phantivia/uspppm-huggingface-train-inference-baseline)
- [BERT for Patents](https://www.kaggle.com/datasets/ksork6s4/bert-for-patents) from [huggingface page](https://huggingface.co/anferico/bert-for-patents)


### Please refer to [Inference Notebook](https://www.kaggle.com/code/ksork6s4/uspppm-bert-for-patents-baseline-inference/edit/run/91272728) as well.

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
import shutil

import torch
from torch.utils.data import DataLoader, Dataset
import datasets, transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

os.environ["WANDB_DISABLED"] = "true"

# Config

In [None]:
class CFG:
    """Central place for data locations, ensemble members and hyper-parameters."""

    # --- data ---
    input_path = '../input/us-patent-phrase-to-phrase-matching/'

    # --- ensemble members ---
    # The three lists below are parallel: entry i of each one belongs to
    # the same backbone.
    models_paths = [
        '../input/robertalarge',
        '../input/deberta-v3-large/deberta-v3-large',
    ]
    models_weights = [0.7, 0.3]
    batch_sizes = [16, 16]

    # --- cross-validation / schedule ---
    num_fold = 4
    epochs = 4

    # --- optimiser ---
    learning_rate = 2e-5
    weight_decay = 0.01

# Preproc

In [None]:
# CPC code -> title lookup table, joined onto the training pairs so that
# every row carries the title text of its `context` code.
titles = pd.read_csv('../input/cpc-codes/titles.csv')
train_df = pd.read_csv(f"{CFG.input_path}train.csv").merge(
    titles,
    left_on='context',
    right_on='code',
)

# https://www.kaggle.com/code/abhishek/phrase-matching-folds
def create_folds(data, num_splits):
    """Assign a stratified cross-validation fold index to every row.

    The continuous ``score`` column is cut into 5 equal-width bins and the
    bin ids are used as the stratification target, so each fold receives a
    similar score distribution.

    Args:
        data: DataFrame containing a ``score`` column. A ``fold`` column is
            written to this frame in place.
        num_splits: Number of folds to create.

    Returns:
        A new DataFrame with a ``fold`` column holding values in
        ``range(num_splits)`` and without any ``bins`` column.
    """
    # Keep the bin ids in a local Series instead of a temporary column so
    # the caller's frame is not polluted with a leftover "bins" column.
    bins = pd.cut(data["score"], bins=5, labels=False)

    # Fixed seed so the fold assignment is reproducible across runs.
    kf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

    # StratifiedKFold.split yields *positional* indices. Filling a plain
    # array by position (rather than writing through label-based ``.loc``)
    # stays correct even when ``data`` does not have a default RangeIndex.
    folds = np.full(len(data), -1, dtype=np.int64)
    for fold_id, (_, valid_pos) in enumerate(kf.split(X=data, y=bins.values)):
        folds[valid_pos] = fold_id
    data["fold"] = folds

    # Return a fresh frame that never carries a "bins" column.
    return data.drop(columns="bins", errors="ignore")

In [None]:
# Model input text is "<CPC title> <anchor>"; once it is attached, every
# row gets its cross-validation fold.
train_df = create_folds(
    train_df.assign(input=train_df['title'] + ' ' + train_df['anchor']),
    CFG.num_fold,
)

# Tokenizer

In [None]:
# One tokenizer per ensemble member, in the same order as CFG.models_paths.
tokenizers = list(map(AutoTokenizer.from_pretrained, CFG.models_paths))

# Dataset

In [None]:
class TrainDataset(Dataset):
    """Dataset pairing each ``input`` text with its ``target`` phrase.

    Every item is the tokenizer output for the (input, target) pair plus a
    ``label`` entry holding the row's score as ``float32``.
    """

    def __init__(self, df, tokenizer):
        self.tokenizer = tokenizer

        # Text columns are forced to str; scores are kept as they are.
        self.inputs = df['input'].values.astype(str)
        self.targets = df['target'].values.astype(str)
        self.label = df['score'].values

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        # Encode the two texts together as a sentence pair.
        encoded = self.tokenizer(self.inputs[item], self.targets[item])

        sample = dict(encoded)
        sample['label'] = self.label[item].astype(np.float32)
        return sample

# Train

In [None]:
def compute_metrics(eval_pred):
    """Return the Pearson correlation between predictions and labels.

    ``eval_pred`` is a ``(predictions, labels)`` pair; predictions are
    flattened to one value per sample before the correlation is computed.
    The result is a dict with the single key ``'pearson'``.
    """
    raw_preds, labels = eval_pred
    flat_preds = raw_preds.reshape(len(raw_preds))
    corr_matrix = np.corrcoef(flat_preds, labels)
    # The off-diagonal entry is the correlation between the two inputs.
    return {'pearson': corr_matrix[0][1]}

In [None]:
# Empty accumulator; the out-of-fold predictions of every fold are
# concatenated onto it by the training loop.
output_df = pd.DataFrame(data=None)

In [None]:
# Cross-validated training of the weighted ensemble.
# For each fold every backbone in CFG.models_paths is fine-tuned on the
# remaining folds; its predictions on the held-out fold are blended with
# CFG.models_weights and stored in the `preds` column of output_df.
model_output_dir = "/tmp/models"

for fold in range(CFG.num_fold):
    tr_data = train_df[train_df['fold'] != fold].reset_index(drop=True)
    va_data = train_df[train_df['fold'] == fold].reset_index(drop=True)

    # Weighted sum of the ensemble members' predictions for this fold.
    predictions = np.zeros(va_data.shape[0])

    for model_idx, model_path in enumerate(CFG.models_paths):
        tokenizer = tokenizers[model_idx]
        tr_dataset = TrainDataset(tr_data, tokenizer)
        va_dataset = TrainDataset(va_data, tokenizer)

        args = TrainingArguments(
            output_dir=model_output_dir,
            evaluation_strategy="epoch",
            save_strategy="no",
            learning_rate=CFG.learning_rate,
            per_device_train_batch_size=CFG.batch_sizes[model_idx],
            per_device_eval_batch_size=CFG.batch_sizes[model_idx],
            num_train_epochs=CFG.epochs,
            weight_decay=CFG.weight_decay,
        )

        # num_labels=1 configures a single-output (regression) head.
        model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=1)
        trainer = Trainer(
            model,
            args,
            train_dataset=tr_dataset,
            eval_dataset=va_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        # Best-effort cleanup: with save_strategy="no" the directory may
        # never have been created, and a missing directory must not abort
        # the run after training has already finished.
        shutil.rmtree(model_output_dir, ignore_errors=True)

        outputs = trainer.predict(va_dataset)
        predictions += outputs.predictions.reshape(-1) * CFG.models_weights[model_idx]

        # Drop the references to the finished model before the next
        # backbone is loaded so both do not have to be held at once, then
        # hand cached GPU blocks back to the allocator.
        del trainer, model
        torch.cuda.empty_cache()

    va_data['preds'] = predictions
    output_df = pd.concat([output_df, va_data])

In [None]:
# Pearson correlation of the blended out-of-fold predictions against the
# true scores, computed over all folds at once.
predictions, label = (output_df[column].values for column in ('preds', 'score'))
eval_pred = (predictions, label)
compute_metrics(eval_pred)

In [None]:
# Keep only the row id and the blended prediction, exposing the prediction
# under the name `score`. `columns=` is used because passing the axis
# positionally to DataFrame.drop is no longer accepted by pandas 2.x.
almost_ready_df = output_df.drop(
    columns=output_df.columns.difference(['id', 'preds'])
).rename(columns={'preds': 'score'})

# Take a real copy: a plain assignment would only alias almost_ready_df,
# and the raw predictions would be overwritten by the steps below.
almost_ready_truncated_df = almost_ready_df.copy()

# Clamp predictions into the valid score range [0, 1].
almost_ready_truncated_df['score'] = almost_ready_truncated_df['score'].clip(lower=0, upper=1)

# Snap every prediction to the closest value on the score grid.
score_grid = [0.0, 0.25, 0.5, 0.75, 1.0]
almost_ready_truncated_df['score'] = [
    min(score_grid, key=lambda level: abs(level - pred_val))
    for pred_val in almost_ready_truncated_df['score']
]

almost_ready_truncated_df

In [None]:
# Write the post-processed frame to disk without the index column.
# Alternative kept from the original notebook:
#   almost_ready_df.to_csv('submission.csv', index=False)
almost_ready_truncated_df.to_csv(path_or_buf='submission.csv', index=False)