## patent
> Module for training a patent phrase-matching model

In [None]:
#| default_exp patent_phrase_matching.train

In [None]:
#|export
from pathlib import Path
import os
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
#|eval: false
import kaggle
import zipfile

In [None]:
#|eval: false
# Dataset directory: the competition folder under the DATA_BASE_DIR environment variable.
# NOTE(review): if DATA_BASE_DIR is unset, os.getenv returns None and the path
# becomes "None/us-patent-phrase-to-phrase-matching" — confirm the variable is set.
path = Path(f'{os.getenv("DATA_BASE_DIR")}/us-patent-phrase-to-phrase-matching')
# List the directory to check which files are present.
!ls {path}

In [None]:
#|eval: false
#!kaggle competitions download us-patent-phrase-to-phrase-matching -p {path}
# zipfile.ZipFile(f'{path}/{Path("us-patent-phrase-to-phrase-matching")}.zip').extractall(path)

In [None]:
#|export
import pandas as pd
from datasets import Dataset, DatasetDict

In [None]:
#|eval: false
# Load the training CSV and preview its first rows.
df = pd.read_csv(f"{path}/train.csv")
df.head()

In [None]:
#|eval: false
# Summary (count / unique / top / freq) of the object-typed columns only.
df.describe(include="object")

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,8d135da0b55b8c88,component composite coating,composition,H01
freq,1,152,24,2186


In [None]:
#|eval: false
# How many distinct score values occur in the training data.
df["score"].nunique()

5

In [None]:
#|eval: false
# The distinct score values themselves.
df["score"].unique()

array([0.5 , 0.75, 0.25, 0.  , 1.  ])

The scores take one of five values between 0 and 1 (0, 0.25, 0.5, 0.75 and 1), so the task can be modeled as a regression problem with a single output.

In [None]:
#|export
import os
import torch
import wandb
import argparse
import numpy as np
from functools import partial
from typing import Optional
from dataclasses import asdict, dataclass, field
from transformers import HfArgumentParser
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [None]:
#|eval: false
# Checkpoint whose tokenizer is used for the exploration below.
model_ckpt = "microsoft/deberta-v3-small"
# cache_dir is read from the HF_HUB_CACHE environment variable (None if unset).
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, cache_dir=os.getenv("HF_HUB_CACHE"))



In [None]:
#|eval: false
# Separator token of this tokenizer; used below to join the text fields.
tokenizer.sep_token

'[SEP]'

In [None]:
#|eval: false
# Build the model input: anchor, target and context joined by the separator token.
df["text"] = df["anchor"] + tokenizer.sep_token + df['target'] + tokenizer.sep_token + df['context']
# Rename the target column to "label" (done in place, so `df` itself changes).
df.rename(columns={"score":"label"}, inplace=True)

In [None]:
#|eval: false
# Unique anchor phrases, shuffled in place with a fixed seed so the
# anchor-based split below is the same on every run.
anchors = df.anchor.unique()
np.random.seed(42)
np.random.shuffle(anchors)

In [None]:
#|eval: false
# Take the first 25% of the shuffled anchors as the validation anchors.
val_prop = 0.25
val_sz = int(len(anchors)*val_prop)  # truncates towards zero
val_anchors = anchors[:val_sz]

In [None]:
#|eval: false
# Partition the rows by anchor membership, so every anchor lands in exactly one split.
in_val = df['anchor'].isin(val_anchors)
train_df, val_df = df[~in_val], df[in_val]

In [None]:
#|eval: false
# Row/column counts of the two splits.
train_df.shape, val_df.shape

((27357, 6), (9116, 6))

In [None]:
#|eval: false
# Compare the mean label of the two splits as a quick check that they are similar.
train_df['label'].mean(), val_df['label'].mean()

(0.3623021530138539, 0.3613426941641071)

In [None]:
#|eval: false
# Convert both splits from pandas DataFrames to `Dataset` objects.
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)

In [None]:
#|eval: false
# Load the test CSV and build its "text" column the same way as for the training data.
test_data = pd.read_csv(f"{path}/test.csv")
test_data["text"] = test_data["anchor"] + tokenizer.sep_token + test_data['target'] + tokenizer.sep_token + test_data['context']

In [None]:
#|eval: false
# Convert the test DataFrame to a `Dataset`.
test_ds = Dataset.from_pandas(test_data)

In [None]:
#|eval: false
# Bundle the three splits into one DatasetDict keyed by split name.
patents = DatasetDict({"train": train_ds,
                       "validation":val_ds,
                       "test": test_ds
                      })

In [None]:
#|eval: false
# Inspect the column names and dtypes of the training split.
patents['train'].features

{'id': Value(dtype='string', id=None),
 'anchor': Value(dtype='string', id=None),
 'target': Value(dtype='string', id=None),
 'context': Value(dtype='string', id=None),
 'label': Value(dtype='float64', id=None),
 'text': Value(dtype='string', id=None),
 '__index_level_0__': Value(dtype='int64', id=None)}

In [None]:
#|eval: false
# Log in to Weights & Biases before any training run is started.
wandb.login()

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

  ········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /homes/roshan/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mroshkjr[0m ([33mroshkjr-personal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
#|eval: false
%env WANDB_PROJECT="us-patent-phrase-to-phrase-matching"

env: WANDB_PROJECT="us-patent-phrase-to-phrase-matching"


In [None]:
#|eval: false
# Tokenize the "text" column of every split. `patents_encoded` was referenced here
# without ever being defined, so this cell failed on a fresh kernel.
patents_encoded = patents.map(
    lambda batch: tokenizer(batch["text"], padding=True, truncation=True),
    batched=True,
)
# Return torch tensors so that columns support `.to(device)`.
patents_encoded.set_format("torch")

In [None]:
#|eval: false
def get_output(batch):
    """Run the model on one batch without tracking gradients.

    Only the entries of `batch` whose key is listed in
    `tokenizer.model_input_names` are forwarded; each is moved to `device`
    first.

    NOTE(review): relies on the notebook-level names `tokenizer`, `device`
    and `model` — confirm they are defined before this is called.

    Args:
        batch: Mapping from column name to a value exposing `.to(device)`.

    Returns:
        Whatever `model(**inputs)` returns.
    """
    inputs = {}
    for key, value in batch.items():
        if key in tokenizer.model_input_names:
            inputs[key] = value.to(device)
    with torch.inference_mode():
        return model(**inputs)

In [None]:
#|eval: false
from omegaconf import OmegaConf
import json

In [None]:
#|eval: false
# Load the JSON training config and wrap it in an OmegaConf object.
with open("../configs/patent_phrase_matching/config.json", 'r') as fh:
    conf = OmegaConf.create(json.load(fh))

In [None]:
#|eval: false
# Show the loaded config rendered as YAML.
print(OmegaConf.to_yaml(conf))

per_device_train_batch_size: 8
per_device_eval_batch_size: 8
wandb_project: patent_phrase_matching
wandb_job_type: Seq2Class
lr_scheduler_type: cosine
eval_strategy: epoch
model_name: microsoft/deberta-v3-small
dtype: float32
text_column: text



In [None]:
#| export
def compute_metrics(pred):
    """Compute the Pearson correlation between predictions and labels.

    Args:
        pred: An iterable whose first two items are the predictions and the
            labels (for example a `(predictions, labels)` tuple). Any further
            items are ignored.

    Returns:
        dict: `{'pearson': r}` with `r` the correlation coefficient.
    """
    predictions, labels, *_ = pred
    # Flatten both arrays: np.corrcoef cannot stack (N, 1) predictions
    # against (N,) labels.
    predictions = np.asarray(predictions).reshape(-1)
    labels = np.asarray(labels).reshape(-1)
    return {'pearson': np.corrcoef(predictions, labels)[0][1]}

In [None]:
#| export
@dataclass
class HfModelArguments:
    """
    Model-related settings: the checkpoint to start from, the number of
    labels, and optional dropout overrides.
    """

    # Required: no default is declared for this field.
    model_name: Optional[str] = field(
        metadata=dict(
            help="The model checkpoint for weights initialization. Don't set if you want to train a model from scratch. W&B artifact references are supported in addition to the sources supported by `PreTrainedModel`."
        )
    )
    # Required: no default is declared for this field.
    num_labels: int = field(metadata=dict(help="Number of labels to classify"))
    # The three dropout settings default to None.
    dropout: Optional[float] = field(
        default=None, metadata=dict(help="Dropout rate. Overwrites config.")
    )
    activation_dropout: Optional[float] = field(
        default=None, metadata=dict(help="Activation dropout rate. Overwrites config.")
    )
    attention_dropout: Optional[float] = field(
        default=None, metadata=dict(help="Attention dropout rate. Overwrites config.")
    )


In [None]:
#| export
@dataclass
class HfDataTrainingArguments:
    """
    Data-related settings: which column holds the text, plus optional
    filtering, sample-count limits and preprocessing options.
    """

    # Required: no default is declared for this field.
    text_column: Optional[str] = field(
        metadata=dict(
            help="The name of the column in the datasets containing the full texts (for summarization)."
        )
    )
    # Everything below is optional and defaults to None.
    filter_column: Optional[str] = field(
        default=None,
        metadata=dict(help="Column that containts classes to be filtered."),
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata=dict(
            help="For debugging purposes or quicker training, truncate the number of training examples."
        ),
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata=dict(
            help="For debugging purposes or quicker training, truncate the number of evaluation examples."
        ),
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata=dict(
            help="The number of processes to use for the preprocessing. Not used in streaming mode."
        ),
    )

In [None]:
#| export
@dataclass
class HfTrainingArguments:
    """
    Arguments pertaining to training parameters.

    Every field up to and including `wandb_job_type` is required (no default);
    the remaining fields are optional. `__post_init__` validates the optimizer
    name, fills in dependent defaults and guards against overwriting a
    non-empty output directory.
    """
    output_dir: str = field(
        metadata={
            "help": "The output directory where the model predictions and checkpoints will be written."
        },
    )
    batch_size: int = field(
        metadata={
            "help": "The size of batch"
        },
    )
    epochs: int = field(
        metadata={
            "help": "The number of epochs to run"
        },
    )
    warmup_ratio: float = field(
        metadata={"help":"Warmup ratio to use"}
    )
    optimizer: str = field(
        metadata={
            "help": 'The optimizer to use. Can be "adam" or "adafactor"'
        },
    )
    eval_strategy: str = field(
        metadata={"help": 'The srategy for evaluation'}
    )
    weight_decay: float = field(
        metadata={"help": "Weight decay applied to parameters."}
    )
    num_train_epochs: int = field(
        metadata={"help": "Total number of training epochs to perform."}
    )
    per_device_train_batch_size: int = field(
        metadata={"help": "Batch size per data parallel device for training."},
    )
    # Must be passed explicitly (no default); None means "same as training".
    per_device_eval_batch_size: Optional[int] = field(
        metadata={
            "help": "Batch size per data parallel device for evaluation. Same as training batch size if not set."
        },
    )
    learning_rate: float = field(
        metadata={"help": "The initial learning rate."}
    )
    lr_scheduler_type: str = field(
        metadata={"help":"The learning rate scheduler type"}
    )
    wandb_project: str = field(
        metadata={"help": "The name of the wandb project."},
    )
    wandb_job_type: str = field(
        metadata={"help": "The name of the wandb job type."},
    )
    overwrite_output_dir: bool = field(
        default=False,
        metadata={
            "help": (
                "Overwrite the content of the output directory. "
                "Use this to continue training if output_dir points to a checkpoint directory."
            )
        },
    )
    do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
    do_eval: bool = field(
        default=False, metadata={"help": "Whether to run eval on the validation set."}
    )
    seed_model: int = field(
        default=42,
        metadata={
            "help": "Random seed for the model that will be set at the beginning of training."
        },
    )

    def __post_init__(self):
        """Validate the parsed values and fill in dependent defaults.

        Raises:
            AssertionError: if `optimizer` is not "adam" or "adafactor".
            ValueError: if training is requested into an existing, non-empty
                `output_dir` without `overwrite_output_dir`.
        """
        # The message must read `self.optimizer`: there is no `optim` attribute,
        # so referencing it raised AttributeError instead of this assertion.
        assert self.optimizer in [
            "adam",
            "adafactor",
        ], f"Selected optimizer not supported: {self.optimizer}"
        # For adafactor a weight decay of 0 is replaced by None.
        if self.optimizer == "adafactor" and self.weight_decay == 0:
            self.weight_decay = None
        # Evaluation batch size falls back to the training batch size.
        if self.per_device_eval_batch_size is None:
            self.per_device_eval_batch_size = self.per_device_train_batch_size
        # When not training, the epoch count is forced to 1.
        if not self.do_train:
            self.num_train_epochs = 1
        if (
            os.path.exists(self.output_dir)
            and os.listdir(self.output_dir)
            and self.do_train
            and not self.overwrite_output_dir
        ):
            raise ValueError(
                f"Output directory ({self.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )



In [None]:
#| export
def create_parser():
    """Build the command-line parser for the training script.

    Returns:
        argparse.ArgumentParser: A parser with a single `--config` option
        whose default is the patent phrase matching config file.
    """
    default_config = "../configs/patent_phrase_matching/config.json"
    cli = argparse.ArgumentParser(
        description='train patent_phrase_matching',
        prog="patent_phrase_matching",
    )
    cli.add_argument('--config', default=default_config)
    return cli

In [None]:
#| export
def tokenize(batch, tokenizer, data_args=None, text_column=None):
    """Tokenize the text column of a batch with padding and truncation.

    The column can be named directly through `text_column` or taken from
    `data_args.text_column`; `text_column` wins when both are given.

    Args:
        batch: Mapping from column name to the values of that column.
        tokenizer: Callable applied to the selected column.
        data_args: Optional object exposing a `text_column` attribute.
        text_column: Optional name of the column to tokenize.

    Returns:
        Whatever `tokenizer(...)` returns for the selected column.

    Raises:
        ValueError: if neither `data_args` nor `text_column` is provided.
    """
    if text_column is None:
        if data_args is None:
            raise ValueError("Provide either `data_args` or `text_column`.")
        text_column = data_args.text_column
    return tokenizer(batch[text_column], padding=True, truncation=True)

In [None]:
#| export
def get_dds(df, separator, train_idx, val_idx, tokenizer, text_column):
    """Build the tokenized train/validation `DatasetDict` from a DataFrame.

    A "text" column is composed as anchor + separator + target + separator +
    context, the frame is converted to a `Dataset`, `text_column` is
    tokenized, and the rows are split by position.

    Args:
        df: DataFrame with "anchor", "target" and "context" columns. It is
            not modified; the "text" column is added to a copy.
        separator: String placed between the three text fields.
        train_idx: Row positions that form the "train" split.
        val_idx: Row positions that form the "val" split.
        tokenizer: Tokenizer passed through to `tokenize`.
        text_column: Name of the column to tokenize.

    Returns:
        DatasetDict: with the keys "train" and "val".
    """
    # `assign` returns a new frame, so the caller's DataFrame is left untouched.
    df = df.assign(text=df["anchor"] + separator + df['target'] + separator + df['context'])
    ds = Dataset.from_pandas(df)
    # `tokenize` reads the column name from `data_args.text_column`; passing a
    # bare `text_column=` keyword raised TypeError with its declared signature.
    data_args = HfDataTrainingArguments(text_column=text_column)
    ds_encoded = ds.map(partial(tokenize, tokenizer=tokenizer, data_args=data_args), batched=True)
    return DatasetDict({"train":ds_encoded.select(train_idx), "val": ds_encoded.select(val_idx)})

In [None]:
#| export
def main():
    """Fine-tune a sequence-classification model on the patent phrase data.

    Steps:
      1. Read `--config` from the command line and parse that JSON file into
         the model, data and training argument dataclasses.
      2. Load `train.csv` from `$DATA_BASE_DIR/us-patent-phrase-to-phrase-matching`
         and rename the `score` column to `label`.
      3. Split by anchor: the first 25% of the unique anchors, shuffled with
         seed 42, form the validation set.
      4. Tokenize through `get_dds`, load the model and call `Trainer.train()`.
    """
    hf_parser = HfArgumentParser(
        (HfModelArguments, HfDataTrainingArguments, HfTrainingArguments)
    )
    # Read --config from the real command line (previously a hard-coded argv
    # list made the option ineffective). Unrelated arguments are ignored
    # rather than rejected; with no arguments the parser default is used.
    args, _ = create_parser().parse_known_args()
    model_args, data_args, train_args = hf_parser.parse_json_file(
        json_file=args.config
    )

    path = Path(f'{os.getenv("DATA_BASE_DIR")}/us-patent-phrase-to-phrase-matching')
    df = pd.read_csv(f"{path}/train.csv").rename(columns={"score": "label"})

    # Shuffle the unique anchors reproducibly before carving out validation.
    anchors = df.anchor.unique()
    np.random.seed(42)
    np.random.shuffle(anchors)

    val_prop = 0.25
    val_sz = int(len(anchors) * val_prop)
    val_anchors = anchors[:val_sz]
    # Positional row indices of each split; all rows of an anchor stay together.
    is_val = np.isin(df.anchor, val_anchors)
    idxs = np.arange(len(df))
    val_idxs = idxs[is_val]
    trn_idxs = idxs[~is_val]

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name,
        cache_dir=os.getenv("HF_HUB_CACHE")
    )
    separator = tokenizer.sep_token
    dds = get_dds(df, separator, trn_idxs, val_idxs, tokenizer, data_args.text_column)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = (
        AutoModelForSequenceClassification.from_pretrained(
            model_args.model_name,
            num_labels=model_args.num_labels,
            cache_dir=os.getenv("HF_HUB_CACHE")
        )
    ).to(device)

    # Log roughly once per epoch, based on the batch size the Trainer is
    # actually given; never 0, which is not a valid step interval.
    logging_steps = max(1, len(dds["train"]) // train_args.per_device_train_batch_size)

    # NOTE(review): the epoch count comes from `train_args.epochs`, not from
    # `train_args.num_train_epochs` — confirm which field is intended.
    training_args = TrainingArguments(
        output_dir=train_args.output_dir,
        num_train_epochs=train_args.epochs,
        learning_rate=train_args.learning_rate,
        warmup_ratio=train_args.warmup_ratio,
        lr_scheduler_type=train_args.lr_scheduler_type,
        # Enable mixed precision only with CUDA, matching the CPU fallback above.
        fp16=use_cuda,
        per_device_train_batch_size=train_args.per_device_train_batch_size,
        per_device_eval_batch_size=train_args.per_device_eval_batch_size,
        weight_decay=train_args.weight_decay,
        eval_strategy=train_args.eval_strategy,
        disable_tqdm=False,
        logging_steps=logging_steps,
        push_to_hub=False,
        log_level="error"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dds["train"],
        eval_dataset=dds["val"],
        compute_metrics=compute_metrics,
        processing_class=tokenizer
    )

    trainer.train()
