## Iterate like a grandmaster with submission plus miscellaneous

Based on the original [notebook](https://www.kaggle.com/code/jhoward/iterate-like-a-grandmaster/) by Jeremy Howard, we add code to generate submissions and clean up the code a little to make it easier and faster to iterate. We then extend it by making use of the ideas suggested in the original notebook, and others that might come to mind.

Changes, done and in progress:
- [x] Add ability to create submission file
- [x] Update so that pretrained model is loaded from input (first submission scored 0.792)
- [x] Make notebook more concise by removing explanatory text and grouping code into workflow class
- [x] Update submission to use a model trained on the full dataset (submission scored 0.799)
- [x] Round labels to closest multiple of 0.25 (dropped to 0.762) 
- [x] Clip values to [0, 1]  (appears to reduce score slightly)
- [x] Try [BERT for patents](https://huggingface.co/anferico/bert-for-patents) (scored 0.81, 0.82 after tuning LR and epochs)
- [ ] Implement cross validation in workflow
- [ ] Change to use one hot encoding for scores


Ideas suggested in the original notebook:

- Try a model pretrained on legal vocabulary. E.g. how about [BERT for patents](https://huggingface.co/anferico/bert-for-patents)?
- You'd likely get better results by using a sentence similarity model. Did you know that there's a [patent similarity model](https://huggingface.co/AI-Growth-Lab/PatentSBERTa) you could try?
- You could also fine-tune any HuggingFace model using the full patent database (which is provided in BigQuery), before applying it to this dataset
- Replace the patent context field with the description of that context provided by the patent office
- ...and try out your own ideas too!

## Import, set paths and load data

The first thing we do is import all the packages we are going to need and set the paths to point to the input data and any pretrained model files that will be used.

In [None]:
from fastai.imports import *
from torch.utils.data import DataLoader
import warnings,transformers,logging,torch
from transformers import TrainingArguments,Trainer
from transformers import AutoModelForSequenceClassification,AutoTokenizer
import datasets
from datasets import load_dataset, Dataset, DatasetDict

# Truthy (non-empty string) when the KAGGLE_KERNEL_RUN_TYPE environment
# variable is set, '' otherwise; used below to choose between Kaggle and
# local paths.
iskaggle = os.getenv('KAGGLE_KERNEL_RUN_TYPE', '')

# Keep cell output readable: ignore Python warnings and drop every log record
# at WARNING level or below.
logging.disable(logging.WARNING)
warnings.simplefilter('ignore')

In [None]:
# Locate the competition data and the pretrained model files: Kaggle input
# directories when `iskaggle` is truthy, local directories otherwise.
if iskaggle:
    input_path = Path('../input/us-patent-phrase-to-phrase-matching')
    model_path = Path('../input/download-pretrained-models')
else:
    input_path = Path.home()/'data'/'us-patent-phrase-to-phrase-matching'
    model_path = Path('models')

## Create a workflow class

Create a workflow class to tie together the different methods, which can then be extended and modified.

In [None]:
class Workflow:
    """Fine-tuning workflow for the patent phrase-to-phrase matching data.

    Constructing an instance reads ``train.csv`` and ``test.csv`` from the
    module-level ``input_path``, builds the model input strings and creates
    the tokenizer. ``train`` fine-tunes a single-output sequence
    classification model, the ``get_*_score`` methods report the Pearson
    correlation between predictions and labels, and ``prepare_submission``
    writes test-set predictions to a CSV file.
    """

    # Columns removed from a dataset once it has been tokenized.
    _DROP_COLS = ['inputs', 'id', 'section']

    def __init__(self, model_nm, val_prop=0.25, seed=42, dev=False):
        """Load the data and set up the tokenizer.

        Args:
            model_nm: Model name or path handed to ``from_pretrained`` for
                both the tokenizer and the model.
            val_prop: Fraction of the unique anchors held out for validation.
            seed: Seed used when shuffling the anchors for the split.
            dev: If true, keep only the first 500 training rows.
        """
        self.model_nm = model_nm
        self.val_prop = val_prop
        self.seed = seed
        self.dev = dev
        self.sectoks = []  # additional section tokens

        # separator placed between the parts of each input string
        self.sep = " [s] "
        # The data must be loaded first: it populates `self.sectoks`, which
        # the tokenizer setup registers as special tokens.
        self.load_and_preprocess()
        self.init_tokenizer()

    def init_tokenizer(self):
        """Create the tokenizer and register the section tokens with it."""
        # Use the instance's model name, not the notebook-level global, so
        # the constructor argument is honoured.
        self.tokz = AutoTokenizer.from_pretrained(self.model_nm)
        # `add_special_tokens` returns how many tokens were actually added;
        # `get_model` uses the count to initialise exactly those embeddings.
        self.n_added_toks = self.tokz.add_special_tokens(
            {'additional_special_tokens': self.sectoks})

    def load_and_preprocess(self):
        """Read the train and test CSV files and add the derived columns."""
        # load train and test data
        self.train_df = pd.read_csv(input_path/'train.csv')
        # copy so the derived columns are added to an independent frame
        # rather than to a slice of the full one
        if self.dev: self.train_df = self.train_df.iloc[:500].copy()
        self.eval_df = pd.read_csv(input_path/'test.csv')

        self.preprocess_raw_data(self.train_df)
        self.preprocess_raw_data(self.eval_df)

    def preprocess_raw_data(self, df):
        """Add ``section``, ``sectok`` and ``inputs`` columns to ``df`` in place.

        ``section`` is the first character of ``context``, ``sectok`` is that
        character wrapped in square brackets, and ``inputs`` joins the section
        token, context, anchor and target with ``self.sep``. Section tokens
        seen in ``df`` are merged into ``self.sectoks``.
        """
        df['section'] = df['context'].str[0]
        df['sectok'] = '[' + df.section + ']'
        df['inputs'] = df.sectok + self.sep + df.context + self.sep + df.anchor + self.sep + df.target
        # Sorted so the token order (and therefore the ids the tokenizer
        # assigns) is the same on every run; plain set iteration order varies
        # with string hash randomisation.
        self.sectoks = sorted(set(df.sectok.unique()) | set(self.sectoks))

    def get_val_split(self):
        """Split the training rows by anchor.

        A ``val_prop`` share of the unique anchors is chosen after a seeded
        shuffle; every row whose anchor was chosen goes to validation.

        Returns:
            ``(val_idxs, trn_idxs)``: positional row indices into
            ``self.train_df``, validation first.
        """
        anchors = self.train_df.anchor.unique()
        np.random.seed(self.seed)
        np.random.shuffle(anchors)
        val_sz = int(len(anchors)*self.val_prop)
        val_anchors = anchors[:val_sz]
        is_val = np.isin(self.train_df.anchor, val_anchors)
        idxs = np.arange(len(self.train_df))
        return idxs[is_val], idxs[~is_val]

    def _tokenize(self, ds):
        """Tokenize the ``inputs`` column of ``ds`` and drop ``_DROP_COLS``."""
        def tok_func(x): return self.tokz(x["inputs"])
        return ds.map(tok_func, batched=True, remove_columns=self._DROP_COLS)

    def get_dds(self, df, no_val=False):
        """Tokenize ``df`` and split it into training and validation sets.

        The split indices are always computed from ``self.train_df``, so
        ``df`` is expected to be that frame. Sets ``self.val_idxs``,
        ``self.trn_idxs``, ``self.train_ds`` and ``self.val_ds``.

        Args:
            df: Preprocessed frame with ``inputs``, ``id``, ``section`` and
                ``score`` columns.
            no_val: If true, train on every row and use an empty list as the
                validation set.

        Returns:
            A ``DatasetDict`` with the training set under ``"train"`` and the
            validation set under ``"test"``.
        """
        ds = Dataset.from_pandas(df).rename_column('score', 'label')
        tok_ds = self._tokenize(ds)
        if no_val:
            self.val_idxs, self.trn_idxs = np.ones(0), np.arange(len(self.train_df))
            self.train_ds = tok_ds.select(self.trn_idxs)
            self.val_ds = []
        else:
            self.val_idxs, self.trn_idxs = self.get_val_split()
            self.train_ds = tok_ds.select(self.trn_idxs)
            self.val_ds = tok_ds.select(self.val_idxs)

        return DatasetDict({"train": self.train_ds, "test": self.val_ds})

    def train(self, no_val=False, **kwargs):
        """Build the datasets and trainer, then run training.

        Args:
            no_val: Forwarded to ``get_dds``.
            **kwargs: Forwarded to ``get_trainer`` (``epochs``, ``lr``,
                ``bs``, ``wd``).
        """
        dds = self.get_dds(self.train_df, no_val=no_val)
        self.trainer = self.get_trainer(dds, **kwargs)
        self.trainer.train()

    def get_model(self):
        """Load the model with one output and room for the section tokens.

        Returns:
            The model with its input embeddings resized to ``len(self.tokz)``
            and the rows of the newly added tokens set to zero.
        """
        model = AutoModelForSequenceClassification.from_pretrained(self.model_nm, num_labels=1)
        model.resize_token_embeddings(len(self.tokz))
        # Added tokens take the highest ids, so after the resize they are the
        # last `n_added_toks` rows. Indexing with `-len(self.tokz)` would hit
        # row 0, an existing vocabulary entry, and leave the new rows alone.
        n_new = self.n_added_toks
        if n_new > 0:
            with torch.no_grad():
                model.get_input_embeddings().weight[-n_new:, :] = 0.0
        return model

    def get_trainer(self, dds, epochs=4, lr=8e-5, bs=128, wd=0.01):
        """Create a ``Trainer`` for a freshly loaded model.

        Args:
            dds: Mapping with ``"train"`` and ``"test"`` datasets.
            epochs: Number of training epochs.
            lr: Learning rate.
            bs: Per-device training batch size; evaluation uses twice this.
            wd: Weight decay.

        Returns:
            A ``Trainer`` that evaluates every epoch and reports the Pearson
            correlation under the ``pearson`` key. Also sets ``self.model``.
        """
        def corr(eval_pred): return {'pearson': np.corrcoef(eval_pred[0].flatten(), eval_pred[1].flatten())[0][1]}
        self.model = self.get_model()
        args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
            evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
            num_train_epochs=epochs, weight_decay=wd, report_to='none', save_steps=5000)
        return Trainer(self.model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                       tokenizer=self.tokz, compute_metrics=corr)

    def get_test_predictions(self):
        """Return the flattened model predictions for ``self.eval_df``."""
        test_ds = self._tokenize(Dataset.from_pandas(self.eval_df))
        return self.trainer.predict(test_ds)[0].flatten()

    def prepare_submission(self, save_fn='submission.csv', clip=False):
        """Write test-set predictions to ``save_fn`` as ``id``/``score`` columns.

        Args:
            save_fn: Output CSV path.
            clip: If true, clip the predictions to ``[0, 1]`` before writing.
        """
        prediction_results = self.get_test_predictions()
        if clip: prediction_results = np.clip(prediction_results, 0.0, 1.0)
        submission_df = pd.DataFrame({'id': self.eval_df['id'], 'score': prediction_results})
        submission_df.to_csv(save_fn, index=False)

    def _pearson_on(self, ds, clip=False):
        """Predict on ``ds`` and return the Pearson correlation with its labels."""
        prediction_results = self.trainer.predict(ds)
        preds, labels = prediction_results[0].flatten(), prediction_results[1].flatten()
        if clip: preds = np.clip(preds, 0., 1.)
        return np.corrcoef(preds, labels)[0][1]

    def get_validation_score(self, clip=False):
        """Return the Pearson correlation on the validation set.

        Args:
            clip: If true, clip the predictions to ``[0, 1]`` first.

        Raises:
            ValueError: If the validation set is empty, as it is after
                training with ``no_val=True``.
        """
        if len(self.val_ds) == 0:
            raise ValueError("no validation data available; train with no_val=False first")
        return self._pearson_on(self.val_ds, clip=clip)

    def get_train_score(self, clip=False):
        """Return the Pearson correlation on the training set.

        Args:
            clip: If true, clip the predictions to ``[0, 1]`` first.
        """
        return self._pearson_on(self.train_ds, clip=clip)

In [None]:
# Use the copy of the model stored under `model_path` when True, otherwise the
# hub identifier.
cached_model = True
# Alternative model: 'deberta-v3-small' locally, 'microsoft/deberta-v3-small' on the hub.
if cached_model:
    model_nm = model_path/'bert-for-patents'
else:
    model_nm = 'anferico/bert-for-patents'

First, train with a train/validation split to get an idea of performance.

In [None]:
# Train with a held-out validation split, print the unclipped and clipped
# scores for the validation and then the training rows, and write a
# submission file.
wf = Workflow(model_nm, dev=False)
wf.train(no_val=False, epochs=6, bs=64, lr=4e-5)
for score_fn in (wf.get_validation_score, wf.get_train_score):
    print(score_fn(clip=False), score_fn(clip=True))
wf.prepare_submission()

Train on the full dataset to make use of the additional data.

In [None]:
# Start again with a fresh workflow, train on every training row (no
# validation split) and write the submission file.
wf = Workflow(model_nm, dev=False)
wf.train(epochs=6, bs=64, lr=4e-5, no_val=True)
wf.prepare_submission(save_fn='submission.csv')