In [1]:
import os

# Value of KAGGLE_KERNEL_RUN_TYPE, or '' when the variable is unset. Later
# cells use it as a truthy flag for "running on Kaggle".
iskaggle = os.getenv('KAGGLE_KERNEL_RUN_TYPE', '')

In [2]:
!pip install kaggle



In [3]:
# Contents of kaggle.json; the next cell writes this string to
# ~/.kaggle/kaggle.json. Left empty here -- do not commit real credentials.
creds = ''

In [4]:
from pathlib import Path

# Write the Kaggle API token to ~/.kaggle/kaggle.json if it is not there yet.
# Skipped when `creds` is empty so a blank placeholder file is never created:
# a blank file would also stop this cell from writing real credentials later,
# because the write only happens while the file does not exist.
cred_path = Path('~/.kaggle/kaggle.json').expanduser()
if creds and not cred_path.exists():
    cred_path.parent.mkdir(parents=True, exist_ok=True)
    cred_path.write_text(creds)
    cred_path.chmod(0o600)  # owner read/write only

In [5]:
path = Path('us-patent-phrase-to-phrase-matching')

# Outside Kaggle, download and unpack the competition data once; the
# existence check makes re-running this cell a no-op.
if not iskaggle and not path.exists():
    import zipfile,kaggle
    kaggle.api.competition_download_cli(str(path))
    # The context manager closes the archive handle once extraction finishes.
    with zipfile.ZipFile(f'{path}.zip') as archive:
        archive.extractall(path)

In [6]:
# On Kaggle, point `path` at the competition's input directory and install
# the `datasets` package that later cells import.
if iskaggle:
    path = Path('../input/us-patent-phrase-to-phrase-matching')
    ! pip install -q datasets

In [7]:
# List the files in the competition data directory.
!ls {path}

sample_submission.csv  test.csv  train.csv


In [8]:
import pandas as pd
# Load the labelled training pairs; the bare `df` displays a preview of the frame.
df = pd.read_csv(path/'train.csv')
df

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00
...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.00
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.50
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.50
36471,756ec035e694722b,wood article,wooden material,B44,0.75


## Dataset Description
In this dataset, you are presented pairs of phrases (an anchor and a target phrase) and asked to rate how similar they are on a scale from 0 (not at all similar) to 1 (identical in meaning). This challenge differs from a standard semantic similarity task in that similarity has been scored here within a patent's context, specifically its CPC classification (version 2021.05), which indicates the subject to which the patent relates. For example, while the phrases "bird" and "Cape Cod" may have low semantic similarity in normal language, the likeness of their meaning is much closer if considered in the context of "house".

## Score meanings
The scores are in the 0-1 range with increments of 0.25 with the following meanings:

* 1.0 - Very close match. This is typically an exact match except possibly for differences in conjugation, quantity (e.g. singular vs. plural), and addition or removal of stopwords (e.g. “the”, “and”, “or”).
* 0.75 - Close synonym, e.g. “mobile phone” vs. “cellphone”. This also includes abbreviations, e.g. "TCP" -> "transmission control protocol".
* 0.5 - Synonyms which don’t have the same meaning (same function, same properties). This includes broad-narrow (hyponym) and narrow-broad (hypernym) matches.
* 0.25 - Somewhat related, e.g. the two phrases are in the same high level domain but are not synonyms. This also includes antonyms.
* 0.0 - Unrelated.

## Files
* train.csv - the training set, containing phrases, contexts, and their similarity scores
* test.csv - the test set, identical in structure to the training set but without the score
* sample_submission.csv - a sample submission file in the correct format

## Columns
* id - a unique identifier for a pair of phrases
* anchor - the first phrase
* target - the second phrase
* context - the CPC classification (version 2021.05), which indicates the subject within which the similarity is to be scored
* score - the similarity. This is sourced from a combination of one or more manual expert ratings.

**"Google Patent Phrase Similarity Dataset" by Google is licensed under a Creative Commons Attribution 4.0 International License (CC BY 4.0)**

In [9]:
# Summarise the string columns: count, number of unique values, most frequent value and its frequency.
df.describe(include='object')

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,8d135da0b55b8c88,component composite coating,composition,H01
freq,1,152,24,2186


In [10]:
# Build one model input string per row by joining context, target and anchor
# with fixed text markers, then preview the first few.
df['input'] = 'TEXT1: ' + df['context'] + '; TEXT2: ' + df['target'] + '; ANC1: ' + df['anchor']
df['input'].head()

Unnamed: 0,input
0,TEXT1: A47; TEXT2: abatement of pollution; ANC...
1,TEXT1: A47; TEXT2: act of abating; ANC1: abate...
2,TEXT1: A47; TEXT2: active catalyst; ANC1: abat...
3,TEXT1: A47; TEXT2: eliminating process; ANC1: ...
4,TEXT1: A47; TEXT2: forest region; ANC1: abatement


In [11]:
from datasets import Dataset,DatasetDict

# Wrap the DataFrame in a Hugging Face Dataset so it can be tokenized with `.map`.
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score', 'input'],
    num_rows: 36473
})

In [12]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer
# Checkpoint name shared by the tokenizer loaded here and the model loaded later.
model_nm = 'microsoft/deberta-v3-small'
tokz = AutoTokenizer.from_pretrained(model_nm)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
# Example: split an everyday sentence into tokens ('▁' marks a token that starts a new word).
tokz.tokenize("G'day folks, I'm Jeremy from fast.ai!")

['▁G',
 "'",
 'day',
 '▁folks',
 ',',
 '▁I',
 "'",
 'm',
 '▁Jeremy',
 '▁from',
 '▁fast',
 '.',
 'ai',
 '!']

In [14]:
# Example: uncommon words are broken into several subword pieces.
tokz.tokenize("A platypus is an ornithorhynchus anatinus.")

['▁A',
 '▁platypus',
 '▁is',
 '▁an',
 '▁or',
 'ni',
 'tho',
 'rhynch',
 'us',
 '▁an',
 'at',
 'inus',
 '.']

In [15]:
def tok_func(x):
    """Tokenize the ``input`` field of ``x`` with the notebook-level tokenizer ``tokz``."""
    return tokz(x["input"])

In [16]:
# Tokenize the whole dataset in batches; the tokenizer's outputs become new columns.
tok_ds = ds.map(tok_func, batched=True)

Map:   0%|          | 0/36473 [00:00<?, ? examples/s]

In [17]:
# Inspect the first row: the raw input string next to its token ids.
row = tok_ds[0]
row['input'], row['input_ids']

('TEXT1: A47; TEXT2: abatement of pollution; ANC1: abatement',
 [1,
  54453,
  435,
  294,
  336,
  5753,
  346,
  54453,
  445,
  294,
  47284,
  265,
  6435,
  346,
  23702,
  435,
  294,
  47284,
  2])

In [18]:
# Look up the vocabulary id of the token '▁of'.
tokz.vocab['▁of']

265

In [19]:
# Tokens for the first row's input string, for comparison with its input_ids.
tokz.tokenize('TEXT1: A47; TEXT2: abatement of pollution; ANC1: abatement')

['▁TEXT',
 '1',
 ':',
 '▁A',
 '47',
 ';',
 '▁TEXT',
 '2',
 ':',
 '▁abatement',
 '▁of',
 '▁pollution',
 ';',
 '▁ANC',
 '1',
 ':',
 '▁abatement']

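The vocabulary id of `'▁of'` (265) is the same number that appears in `input_ids` at the position where `'▁of'` appears in the token list, so `input_ids` are simply the vocabulary indices of the tokens (with the special ids 1 and 2 added at the start and end).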

In [20]:
# Rename the target column to `labels`, the name Transformers models read the
# target from. Guarded so that re-running this cell after the rename is a
# no-op instead of an error about a missing `score` column.
if 'score' in tok_ds.column_names:
    tok_ds = tok_ds.rename_columns({'score':'labels'})

In [21]:
# Hold out 25% of the rows as a validation split; the fixed seed makes the split repeatable.
dds = tok_ds.train_test_split(0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27354
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9119
    })
})

In [22]:
# Load the competition test set (same columns as train, but without `score`) and summarise it.
eval_df = pd.read_csv(path/'test.csv')
eval_df.describe()

Unnamed: 0,id,anchor,target,context
count,36,36,36,36
unique,36,34,36,29
top,4112d61851461f60,hybrid bearing,inorganic photoconductor drum,G02
freq,1,2,1,3


In [23]:
# Same input layout as the training frame, so test rows are presented to the
# model in the identical format; then tokenize with the same function.
eval_df['input'] = 'TEXT1: ' + eval_df['context'] + '; TEXT2: ' + eval_df['target'] + '; ANC1: ' + eval_df['anchor']
eval_ds = Dataset.from_pandas(eval_df)
eval_ds = eval_ds.map(tok_func, batched=True)

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

In [24]:
from transformers import TrainingArguments,Trainer
bs = 128  # per-device training batch size (evaluation uses bs*2)
epochs = 4  # number of training epochs
lr = 8e-5  # learning rate passed to TrainingArguments

In [25]:
import numpy as np


def corr(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``."""
    coef_matrix = np.corrcoef(x, y)
    # Off-diagonal entry of the 2x2 correlation matrix.
    return coef_matrix[0, 1]


def corr_d(eval_pred):
    """Metric callback: unpack ``eval_pred`` into ``corr`` and return ``{'pearson': value}``."""
    pearson = corr(*eval_pred)
    return {'pearson': pearson}

| **Param**                     | **Param Explanation**                | **Value from your Code** | **Explanation of Value**                       |
| ----------------------------- | ------------------------------------ | ------------------------ | ---------------------------------------------- |
| `output_dir`                  | Folder to save checkpoints and logs. | `'outputs'`              | Saves all outputs in the *outputs* folder.     |
| `learning_rate`               | Step size for weight updates.        | `lr`                     | Uses user-defined learning rate.               |
| `warmup_ratio`                | % of steps for LR warmup.            | `0.1`                    | Gradually increases LR for first 10% of steps. |
| `lr_scheduler_type`           | LR adjustment method.                | `'cosine'`               | Uses cosine decay for smooth LR reduction.     |
| `fp16`                        | Enables mixed precision.             | `True`                   | Speeds up and saves memory on GPU.             |
| `eval_strategy`         | When to evaluate.                    | `'epoch'`                | Evaluates after each epoch.                    |
| `per_device_train_batch_size` | Training batch size per device.      | `bs`                     | Uses `bs` as batch size.                       |
| `per_device_eval_batch_size`  | Eval batch size per device.          | `bs * 2`                 | Double the train batch for faster eval.        |
| `num_train_epochs`            | Number of full dataset passes.       | `epochs`                 | Runs for given number of epochs.               |
| `weight_decay`                | Regularization factor.               | `0.01`                   | Adds slight L2 penalty to avoid overfitting.   |
| `report_to`                   | Logging destination.                 | `'none'`                 | Disables external log reporting.               |


In [26]:
# Training configuration; each argument is explained in the table above.
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    eval_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none')

In [27]:
# num_labels=1 gives the model a single output, used here as the predicted score.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
# `processing_class` is the replacement for Trainer's deprecated `tokenizer`
# argument; passing it avoids the deprecation warning on construction.
trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  processing_class=tokz, compute_metrics=corr_d)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],


In [28]:
# Fine-tune the model; the trailing ';' suppresses display of the returned value.
trainer.train();

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.033491,0.793822
2,No log,0.023393,0.817356
3,0.046900,0.023265,0.826892
4,0.046900,0.023747,0.828388


In [29]:
# Predict scores for the test set and convert the predictions to Python-float (float64) precision.
preds = trainer.predict(eval_ds).predictions.astype(float)
preds

array([[ 5.88378906e-01],
       [ 6.67968750e-01],
       [ 5.94726562e-01],
       [ 3.40332031e-01],
       [-2.81982422e-02],
       [ 5.12207031e-01],
       [ 5.37109375e-01],
       [-2.39257812e-02],
       [ 2.41455078e-01],
       [ 1.10351562e+00],
       [ 2.64404297e-01],
       [ 2.89062500e-01],
       [ 7.87109375e-01],
       [ 7.78320312e-01],
       [ 7.48046875e-01],
       [ 5.06835938e-01],
       [ 2.32055664e-01],
       [-8.09669495e-04],
       [ 6.20605469e-01],
       [ 3.08105469e-01],
       [ 4.91943359e-01],
       [ 2.61962891e-01],
       [ 1.80969238e-02],
       [ 2.45239258e-01],
       [ 5.83007812e-01],
       [-1.98211670e-02],
       [-1.93176270e-02],
       [-5.84411621e-03],
       [-2.51770020e-02],
       [ 6.53320312e-01],
       [ 3.18115234e-01],
       [-1.99584961e-02],
       [ 7.29003906e-01],
       [ 5.26855469e-01],
       [ 4.42871094e-01],
       [ 2.13134766e-01]])

In [30]:
# Scores are defined on the 0-1 range, so clamp predictions that fall outside it.
preds = np.clip(preds, 0, 1)
preds

array([[0.58837891],
       [0.66796875],
       [0.59472656],
       [0.34033203],
       [0.        ],
       [0.51220703],
       [0.53710938],
       [0.        ],
       [0.24145508],
       [1.        ],
       [0.2644043 ],
       [0.2890625 ],
       [0.78710938],
       [0.77832031],
       [0.74804688],
       [0.50683594],
       [0.23205566],
       [0.        ],
       [0.62060547],
       [0.30810547],
       [0.49194336],
       [0.26196289],
       [0.01809692],
       [0.24523926],
       [0.58300781],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.65332031],
       [0.31811523],
       [0.        ],
       [0.72900391],
       [0.52685547],
       [0.44287109],
       [0.21313477]])

In [31]:
import datasets

# `preds` has shape (n, 1); flatten it so `score` holds one scalar per id
# rather than a one-element list per row.
submission = datasets.Dataset.from_dict({
    'id': eval_ds['id'],
    'score': preds.reshape(-1)
})

submission

Dataset({
    features: ['id', 'score'],
    num_rows: 36
})