Great Notebooks: <br>
https://www.kaggle.com/code/jhoward/getting-started-with-nlp-for-absolute-beginners/notebook <br>
https://www.kaggle.com/code/jhoward/iterate-like-a-grandmaster


In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import TrainingArguments,Trainer, BertForSequenceClassification, BertTokenizer
import datasets
from datasets import Dataset,DatasetDict

### **CFG**

In [None]:
class CFG:
    """Run configuration: every tunable setting is read from these class attributes."""

    # Location of the pretrained checkpoint passed to `from_pretrained`
    model = '../input/deberta-v3-base/deberta-v3-base'

    # Optimisation settings forwarded to TrainingArguments
    epochs = 4
    batch_size = 128
    learning_rate = 8e-5
    weight_decay = 0.01

**Import data**

In [None]:
# Input files: the competition's train/test phrase pairs and the CPC title table
train_csv = '/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv'
test_csv = '/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv'
titles_csv = '/kaggle/input/cpc-codes/titles.csv'

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)
cpc_codes = pd.read_csv(titles_csv)

**Exploratory Data Analysis**

In [None]:
# Report the shape (rows, columns) of each split
for split_name, frame in (("TRAIN", train), ("TEST", test)):
    print(f"Number of observations in {split_name}: {frame.shape}")

In [None]:
# Count missing entries per column (author's note: there are no missing values)
train.isna().sum()

In [None]:
# Summary statistics restricted to the object-dtype (categorical/text) columns
train.describe(include = 'object')


In [None]:
# Preview the first rows of the raw training frame
train.head()

In [None]:
# The score column takes only 5 distinct values
# TODO: make this plot more appealing later
fig, ax = plt.subplots(figsize=(6, 6))
sns.histplot(data=train, x='score', ax=ax)
plt.show()

**Some feature engineering**


Anchor:


In [None]:
# #number of words
# train['anchor_len'] = train['anchor'].str.split().str.len()

In [None]:
# train.anchor_len.value_counts()
# # The number of words in Anchor column ranges from 1 - 5 . 

Target:

In [None]:
# #number of words
# train['target_len'] = train['target'].str.split().str.len()

In [None]:
# train.target_len.value_counts()
# # The number of words in Target column ranges from 2 - 11 . 

Context : <br> 
<br>
A: Human Necessities <br> 
B: Operations and Transport <br> 
C: Chemistry and Metallurgy <br> 
D: Textiles <br> 
E: Fixed Constructions <br> 
F: Mechanical Engineering <br> 
G: Physics <br> 
H: Electricity <br> 
Y: Emerging Cross-Sectional Technologies

In [None]:
# Split each context code into its first character (the section letter)
# and the remainder (the classes), for both the train and test frames.
for frame in (train, test):
    context_code = frame['context'].astype(str)
    frame['section'] = context_code.str[0]
    frame['classes'] = context_code.str[1:]

In [None]:
# Attach the lower-cased CPC title that belongs to each row's context code.
# The code -> title lookup is built once and reused for both frames.
title_by_code = cpc_codes.set_index('code')['title']
for frame in (train, test):
    frame['title'] = frame['context'].map(title_by_code).str.lower()

In [None]:
# Preview the training frame, which now also carries the 'section', 'classes' and 'title' columns
train.head()

**Tokenization**

The model needs numbers as inputs, so the text has to be converted into token ids first.

In [None]:
#!pip install datasets

In [None]:
#!pip install transformers
#!pip install sentencepiece

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

# AutoTokenizer creates the tokenizer appropriate for the checkpoint named in CFG.model
checkpoint = CFG.model
tokz = AutoTokenizer.from_pretrained(checkpoint)



In [None]:
## Special Tokens

# Wrap every section letter in square brackets to form a marker string
for frame in (train, test):
    frame['sectok'] = '[' + frame['section'] + ']'

# Register the markers that occur in train as additional special tokens
sectoks = train['sectok'].unique().tolist()
tokz.add_special_tokens({'additional_special_tokens': sectoks})



In [None]:
# Build the model input text: section token, context code, anchor and target,
# joined by the tokenizer's separator token.
sep = tokz.sep_token

for frame in (train, test):
    frame['inputs'] = (
        frame['sectok'] + sep
        + frame['context'] + sep
        + frame['anchor'] + sep
        + frame['target']
    )


In [None]:

# Wrap the pandas frames as HuggingFace Datasets.
# Transformers expects the dependent variable to be named "label",
# so the train 'score' column is renamed accordingly.
train_ds = Dataset.from_pandas(train)
train_ds = train_ds.rename_column('score', 'label')
test_ds = Dataset.from_pandas(test)

In [None]:
def tok_func(x):
    """Run the tokenizer on the text stored under the "inputs" key of `x`."""
    text = x["inputs"]
    return tokz(text)

In [None]:
# Sanity check on the first training row: the output shows token 1 (start of text),
# followed by 2s (presumably the separator token id -- TODO confirm)
tok_func(train_ds[0])

In [None]:
# Show the first three training rows, including the assembled 'inputs' text
train.head(3)

In [None]:
# Tokenize in batches and drop the columns that are no longer needed afterwards
train_drop_cols = ("anchor","target","context",'title','inputs','id', 'section', 'classes', 'sectok')
tok_ds = train_ds.map(tok_func, batched=True, remove_columns=train_drop_cols)

# The test set keeps its 'id' column, which the submission cell reads later
test_drop_cols = ('anchor', 'target', 'context', 'title', 'section', 'classes', 'sectok')
test_ds = test_ds.map(tok_func, batched=True, remove_columns=test_drop_cols)



In [None]:
# Inspect the first tokenized training example
tok_ds[0]  # row one

**Split data, create train and validation set**

In [None]:
# Hold out 25% of the tokenized rows as a validation set;
# the resulting DatasetDict stores it under the "test" key.
dds = tok_ds.train_test_split(test_size=0.25, seed=42)
dds

*Submissions are evaluated on the Pearson correlation coefficient between the predicted and actual similarity scores.* The coefficient $r$ ranges between -1 (perfect inverse correlation) and 1 (perfect positive correlation).

In [None]:
def corr(eval_pred):
    """Compute the Pearson correlation between predictions and labels.

    Args:
        eval_pred: a (predictions, labels) pair, as handed to the Trainer's
            `compute_metrics` callback.

    Returns:
        dict with a single key 'pearson' holding the correlation coefficient.
    """
    predictions, labels = eval_pred
    # A single-output regression head may return predictions as a column of
    # shape (n, 1) while labels are (n,); np.corrcoef cannot stack those, so
    # both are flattened to 1-D first.
    predictions = np.asarray(predictions).ravel()
    labels = np.asarray(labels).ravel()
    return {'pearson': np.corrcoef(predictions, labels)[0][1]}

**Training our model**

In [None]:
# !pip install transformers

In [None]:
# Transformers uses the TrainingArguments class to set up arguments.
# Hyperparameters come from CFG; evaluation runs once per epoch and the
# evaluation batch size is twice the training batch size.
args = TrainingArguments('outputs', learning_rate=CFG.learning_rate, warmup_ratio=0.1,
                         evaluation_strategy="epoch", per_device_train_batch_size=CFG.batch_size,
                         per_device_eval_batch_size=CFG.batch_size*2,
                         num_train_epochs=CFG.epochs, weight_decay=CFG.weight_decay, report_to='none', fp16=True)

# Create the model with a single output (num_labels=1), i.e. one score per input
model = AutoModelForSequenceClassification.from_pretrained(CFG.model, num_labels=1)

# Special tokens were added to the tokenizer after it was loaded, so the model's
# token embedding matrix must be resized to match the tokenizer's vocabulary;
# otherwise the new token ids may have no corresponding embedding row.
model.resize_token_embeddings(len(tokz))

# The Trainer combines the data and the model together
trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokz, compute_metrics=corr)



In [None]:
# Train the model on dds['train'], evaluating on dds['test'] once per epoch
# (as configured on the Trainer); the trailing ';' hides the returned value.

trainer.train();

In [None]:
# Run the trained model on the test set and keep its raw predictions as floats
test_output = trainer.predict(test_ds)
preds = test_output.predictions.astype(float)
preds

In [None]:
# Some predictions fall below 0 or above 1; clamp them into the [0, 1] range
preds = preds.clip(0, 1)

In [None]:
# Display the predictions as a flat 1-D array
preds.ravel()

In [None]:
# Pair every test id with its predicted score and write the submission file.
# `datasets` is already imported in the notebook's top import cell, so it is
# not re-imported here.
submission_columns = {
    'id': test_ds['id'],
    'score': preds.flatten(),  # one scalar score per test row
}
submission = datasets.Dataset.from_dict(submission_columns)

submission.to_csv('submission.csv', index=False)