In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict
import numpy as np

In [2]:
# Load the labelled training split of the US patent phrase-matching data.
# Each row pairs an `anchor` phrase with a `target` phrase inside a patent
# `context` code and carries a similarity `score`.
df = pd.read_csv("us-patent-phrase-matching/train.csv")
df

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00
...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.00
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.50
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.50
36471,756ec035e694722b,wood article,wooden material,B44,0.75


In [3]:
# Summarise the string (object-dtype) columns: row count, number of distinct
# values, most frequent value and how often it occurs.
df.describe(include='object')

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,8d135da0b55b8c88,component composite coating,composition,H01
freq,1,152,24,2186


In [4]:
# Summarise the numeric columns (only `score` here): mean, spread and quartiles.
df.describe()

Unnamed: 0,score
count,36473.0
mean,0.362062
std,0.258335
min,0.0
25%,0.25
50%,0.25
75%,0.5
max,1.0


In [5]:
# Fold context, target and anchor into one labelled text field per row; this
# single string is what gets tokenized and fed to the model.
df['input'] = (
    'TEXT1: ' + df['context']
    + '; TEXT2: ' + df['target']
    + '; ANC1: ' + df['anchor']
)

In [6]:
# Display the frame again to check the newly added `input` column.
df

Unnamed: 0,id,anchor,target,context,score,input
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50,TEXT1: A47; TEXT2: abatement of pollution; ANC...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,TEXT1: A47; TEXT2: act of abating; ANC1: abate...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,TEXT1: A47; TEXT2: active catalyst; ANC1: abat...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50,TEXT1: A47; TEXT2: eliminating process; ANC1: ...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00,TEXT1: A47; TEXT2: forest region; ANC1: abatement
...,...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.00,TEXT1: B44; TEXT2: wooden article; ANC1: wood ...
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.50,TEXT1: B44; TEXT2: wooden box; ANC1: wood article
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.50,TEXT1: B44; TEXT2: wooden handle; ANC1: wood a...
36471,756ec035e694722b,wood article,wooden material,B44,0.75,TEXT1: B44; TEXT2: wooden material; ANC1: wood...


In [7]:
# Load the unlabelled split to predict on: same columns as the training
# file except that there is no `score`.
eval_df = pd.read_csv("us-patent-phrase-matching/test.csv")
eval_df

Unnamed: 0,id,anchor,target,context
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02
1,09e418c93a776564,adjust gas flow,altering gas flow,F23
2,36baf228038e314b,lower trunnion,lower locating,B60
3,1f37ead645e7f0c8,cap component,upper portion,D06
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04
5,474c874d0c07bd21,dry corn,dry corn starch,C12
6,442c114ed5c4e3c9,tunneling capacitor,capacitor housing,G11
7,b8ae62ea5e1d8bdb,angular contact bearing,contact therapy radiation,B23
8,faaddaf8fcba8a3f,produce liquid hydrocarbons,produce a treated stream,C10
9,ae0262c02566d2ce,diesel fuel tank,diesel fuel tanks,F02


In [8]:
# Build the model input text for the unlabelled rows with exactly the same
# template that is used for the training rows.
eval_df['input'] = (
    'TEXT1: ' + eval_df['context']
    + '; TEXT2: ' + eval_df['target']
    + '; ANC1: ' + eval_df['anchor']
)

## Tokenization

Tokenization breaks text into smaller units (tokens), often pieces of words rather than whole words, so that the vocabulary stays small. A smaller vocabulary improves training performance.

## Numericalization

Convert each word (or token) into a number.

Tokenization involves many small decisions, so choose the model first and then use the tokenizer that model was trained with. Using a different tokenizer from the model's is problematic, because the token-to-number mapping would no longer match what the model learned.

`AutoTokenizer` will create a tokenizer appropriate for a given model.

In [9]:
# Convert the pandas frame into a `datasets.Dataset` so it can be tokenized
# with `.map` and split for training.
ds = Dataset.from_pandas(df)

In [10]:
# Show the dataset's column names and row count.
ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score', 'input'],
    num_rows: 36473
})

In [11]:
# Checkpoint name; used for both the tokenizer and the model so the two
# always match.
model_nm = 'microsoft/deberta-v3-small'

In [12]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Load the tokenizer that belongs to the chosen checkpoint.
tokz = AutoTokenizer.from_pretrained(model_nm)



In [13]:
# Quick look at how an ordinary sentence is split into tokens.
tokz.tokenize("G'day folks, I'm Pranit Bauva learning fast.ai!")

['▁G',
 "'",
 'day',
 '▁folks',
 ',',
 '▁I',
 "'",
 'm',
 '▁Pran',
 'it',
 '▁Bau',
 'va',
 '▁learning',
 '▁fast',
 '.',
 'ai',
 '!']

In [14]:
# Uncommon words are broken into several sub-word pieces.
tokz.tokenize("A playypus is an ornithorhynchus anatinus.")

['▁A',
 '▁play',
 'y',
 'pus',
 '▁is',
 '▁an',
 '▁or',
 'ni',
 'tho',
 'rhynch',
 'us',
 '▁an',
 'at',
 'inus',
 '.']

In [15]:
def tok_func(x):
    """Run the notebook's tokenizer `tokz` over the ``"input"`` field of `x`.

    `x` is whatever `Dataset.map` hands over (a row, or a batch when
    ``batched=True``); the tokenizer's output is returned unchanged.
    """
    text = x["input"]
    return tokz(text)

In [16]:
# Tokenize every training row; batched=True passes groups of rows to
# tok_func instead of one row at a time.
tok_ds = ds.map(tok_func, batched=True)

Map:   0%|          | 0/36473 [00:00<?, ? examples/s]

In [17]:
# Rename the target column from 'score' to 'labels'.
# NOTE(review): presumably 'labels' is the column name the Trainer/model reads
# targets from - confirm against the transformers documentation.
tok_ds = tok_ds.rename_columns({'score': 'labels'})

In [18]:
# Same conversion and tokenization for the unlabelled rows to predict on.
eval_ds = Dataset.from_pandas(eval_df).map(tok_func, batched=True)

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

In [19]:
# Inspect one tokenized row: the original columns plus the fields the
# tokenizer added (input_ids, token_type_ids, attention_mask).
row = tok_ds[0]
row

{'id': '37d61fd2272659b1',
 'anchor': 'abatement',
 'target': 'abatement of pollution',
 'context': 'A47',
 'labels': 0.5,
 'input': 'TEXT1: A47; TEXT2: abatement of pollution; ANC1: abatement',
 'input_ids': [1,
  54453,
  435,
  294,
  336,
  5753,
  346,
  54453,
  445,
  294,
  47284,
  265,
  6435,
  346,
  23702,
  435,
  294,
  47284,
  2],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [20]:
# Hold out 25% of the labelled rows, with a fixed seed so the split is
# repeatable. The held-out part is keyed 'test' but serves as the validation
# set when the Trainer is built.
dds = tok_ds.train_test_split(0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27354
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9119
    })
})

In [21]:
def corr(x, y):
    """Return the Pearson correlation coefficient between `x` and `y`.

    Both inputs are flattened to 1-D before the correlation is computed, so a
    column of predictions shaped ``(n, 1)`` can be compared directly with
    labels shaped ``(n,)``. Handing such a column straight to ``np.corrcoef``
    treats every row as a separate variable and fails when it is stacked with
    the 1-D labels.

    Parameters
    ----------
    x, y : array-like
        Collections holding the same number of values.

    Returns
    -------
    numpy.float64
        The off-diagonal entry of the 2x2 correlation matrix of `x` and `y`.
    """
    x_flat = np.ravel(x)
    y_flat = np.ravel(y)
    return np.corrcoef(x_flat, y_flat)[0, 1]

In [22]:
def corr_d(eval_pred):
    """Metric callback for the Trainer: Pearson correlation as a one-key dict.

    `eval_pred` is unpacked positionally into `corr`, and the result is
    reported under the key ``'pearson'``.
    """
    pearson = corr(*eval_pred)
    return {'pearson': pearson}

In [23]:
from transformers import TrainingArguments, Trainer

In [24]:
# Per-device training batch size (evaluation uses twice this) and the number
# of passes over the training split.
bs = 256
epochs = 1

In [25]:
# Learning rate handed to TrainingArguments, where it is combined with a
# warmup phase and a cosine schedule.
lr = 8e-5

In [26]:
# Training configuration: results go to 'outputs', evaluation runs once per
# epoch, and external experiment reporting is switched off.
args = TrainingArguments(
    'outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none',
)



In [27]:
# num_labels=1 requests a single output per row, which is used here as the
# predicted similarity score. The 'test' split of `dds` is the validation set.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer = Trainer(
    model,
    args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    tokenizer=tokz,
    compute_metrics=corr_d,
)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# Fine-tune the model; the trailing ';' suppresses the returned value so only
# the training log is shown.
trainer.train();

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.027378,0.778797


In [29]:
# Predict scores for the unlabelled rows and cast them to Python-float
# (float64) dtype. The result has one column, i.e. one prediction per row.
preds = trainer.predict(eval_ds).predictions.astype(float)
preds

array([[ 0.43260217],
       [ 0.86026436],
       [ 0.40901625],
       [ 0.4332892 ],
       [ 0.20906103],
       [ 0.34490952],
       [ 0.35938129],
       [ 0.08576425],
       [ 0.17972946],
       [ 1.03929448],
       [ 0.2243387 ],
       [ 0.32809058],
       [ 0.67288905],
       [ 0.68392324],
       [ 0.85768217],
       [ 0.42931813],
       [ 0.11856277],
       [ 0.1354315 ],
       [ 0.48871291],
       [ 0.19805408],
       [ 0.25144431],
       [ 0.1753495 ],
       [ 0.18612821],
       [ 0.12598819],
       [ 0.50026351],
       [-0.12598921],
       [-0.00143942],
       [ 0.07288562],
       [ 0.06400194],
       [ 0.71292359],
       [ 0.35844398],
       [-0.01588291],
       [ 0.71205127],
       [ 0.39477494],
       [ 0.41218311],
       [ 0.1674712 ]])

In [30]:
# Training scores lie in [0, 1], but a few raw predictions fall slightly
# outside that range, so clamp them to it.
preds = np.clip(preds, 0, 1)

In [31]:
# Show the clipped predictions.
preds

array([[0.43260217],
       [0.86026436],
       [0.40901625],
       [0.4332892 ],
       [0.20906103],
       [0.34490952],
       [0.35938129],
       [0.08576425],
       [0.17972946],
       [1.        ],
       [0.2243387 ],
       [0.32809058],
       [0.67288905],
       [0.68392324],
       [0.85768217],
       [0.42931813],
       [0.11856277],
       [0.1354315 ],
       [0.48871291],
       [0.19805408],
       [0.25144431],
       [0.1753495 ],
       [0.18612821],
       [0.12598819],
       [0.50026351],
       [0.        ],
       [0.        ],
       [0.07288562],
       [0.06400194],
       [0.71292359],
       [0.35844398],
       [0.        ],
       [0.71205127],
       [0.39477494],
       [0.41218311],
       [0.1674712 ]])