In [None]:
# Install the libraries this notebook relies on: transformers and datasets are
# imported in the next cell; sentencepiece is installed alongside them.
! pip install transformers 
! pip install sentencepiece
! pip install datasets 
# !pip install kaggle_datasets

## Import libraries

In [None]:
import os
import warnings
import numpy as np
%matplotlib inline
import pandas as pd
import seaborn as sns
import datasets, transformers
import matplotlib.pylab as plt
warnings.filterwarnings('ignore')
os.environ["WANDB_DISABLED"] = "true"
from transformers import AutoModelForMaskedLM
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer


## Load data

In [None]:
# Read the train and test CSVs into pandas frames in one pass.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/us-patent-phrase-to-phrase-matching/train.csv",
        "../input/us-patent-phrase-to-phrase-matching/test.csv",
    )
)

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Report the shape tuple of each frame, one line per split.
for caption, frame in (
    ("Shape of Training Data", train_data),
    ("Shape of testing data", test_data),
):
    print(caption, frame.shape)

## Check missing values in training data

In [None]:
# Count NaN entries per column of the training frame and print them under a caption.
train_missing_per_column = train_data.isna().sum()
print("traing data", train_missing_per_column, sep="\n")

## Check missing values in test data

In [None]:
# Count NaN entries per column of the test frame and print them under a caption.
test_missing_per_column = test_data.isna().sum()
print("testing data", test_missing_per_column, sep="\n")

## Count Score values

In [None]:
# Frequency of each distinct value in the training `score` column.
train_data["score"].value_counts()

## Visualize Score data

In [None]:
# Pie chart of how the training rows are distributed across score values.

# The bare "seaborn" style name was deprecated in Matplotlib 3.6 in favour of
# "seaborn-v0_8"; pick whichever name the installed version provides.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)

score_counts = train_data["score"].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    x=score_counts,
    colors=["skyblue", "pink", "green", "lightblue", "yellow"],
    # Take the labels from the counts' own index so each wedge is always paired
    # with the score it represents, whatever order value_counts() returns.
    labels=[f"{score:.2f}" for score in score_counts.index],
    shadow=True,
    autopct="%1.2f%%",
    # Keep the first (most frequent) wedge in place and pull the others out;
    # sized from the data instead of assuming exactly five score values.
    explode=[0.0 if position == 0 else 0.1 for position in range(len(score_counts))],
)
ax.set_title("Share of training rows per score")
plt.show()

In [None]:
# Bar chart of row counts per score value, ordered from rarest to most common.
# The sorted counts are computed once and reused for positions, heights and
# tick labels instead of being recomputed for each argument.
score_counts_sorted = train_data["score"].value_counts().sort_values(ascending=True)

plt.figure(figsize=(10, 8))
plt.rcParams["font.size"] = 18
plt.bar(score_counts_sorted.index,
        score_counts_sorted,
        tick_label=score_counts_sorted.index,
        width=0.2, color="#dda0dd")
plt.xlabel("score")
plt.ylabel("number of rows")
plt.title("Training rows per score")
plt.show()  # render the figure instead of echoing the BarContainer repr

In [None]:
class Config:
    """Paths and hyperparameters read by the later cells of this notebook."""

    # --- data / checkpoint locations ---
    input_path = '../input/us-patent-phrase-to-phrase-matching/train.csv'
    model_path = "anferico/bert-for-patents"  # checkpoint id on the Hugging Face model hub
    model = "bert-for-patents"  # short model name

    # --- optimisation settings ---
    epochs = 5
    batch_size = 32
    learning_rate = 1e-5
    weight_decay = 0.01

In [None]:
# Section letter -> section title. The listing is parsed into a dict that is
# keyed by the first character of each row's `context` value in later cells.
table = """
A: Human Necessities
B: Operations and Transport
C: Chemistry and Metallurgy
D: Textiles
E: Fixed Constructions
F: Mechanical Engineering
G: Physics
H: Electricity
Y: Emerging Cross-Sectional Technologies
"""
splits = table.strip().splitlines()  # one "LETTER: Title" string per section
table = dict(entry.split(': ', 1) for entry in splits)
table

## Load model and tokenizer

In [None]:
# Tokenizer and model are both loaded from the checkpoint named in Config.
# num_labels=1 requests a single output per example; compute_metrics later
# flattens the predictions to one value per row.
tokenizer = AutoTokenizer.from_pretrained(Config.model_path)
model = AutoModelForSequenceClassification.from_pretrained(
    Config.model_path,
    num_labels=1,
)

In [None]:
# Load the training CSV as a `datasets.Dataset` so it can be tokenized with `.map`.
train = datasets.Dataset.from_csv(Config.input_path)
train

In [None]:
def process(unit, eval = False):
    """Tokenize one training row into model inputs plus its label.

    The first character of ``unit['context']`` is looked up in ``table`` to get
    a section title, which is prepended to the anchor phrase. That string and
    ``unit['target']`` are passed to the tokenizer as a text pair.

    Args:
        unit: A row mapping with 'context', 'anchor', 'target' and 'score'.
        eval: Unused; kept so existing callers keep working.

    Returns:
        A dict with the tokenizer outputs plus 'label' set to ``unit['score']``.

    Raises:
        KeyError: If the first character of the context is not a key of ``table``.
    """
    section_title = table[unit['context'][0]]

    # Join with a space: plain concatenation fused the last word of the title
    # with the first word of the anchor (e.g. "Human Necessitiesabatement").
    first_segment = f"{section_title} {unit['anchor']}"

    return {
        **tokenizer(first_segment, unit['target']),
        'label': unit['score'],
    }

# Tokenize every row with `process`; the listed raw columns are dropped so the
# result carries the tokenizer outputs and `label` instead.
encoded_ds = train.map(process, remove_columns= ['id', 'anchor', 'target', 'context', 'score'])

In [None]:
# Hold out 10% of the rows for evaluation. A fixed seed keeps the split, and so
# the reported metric, the same on every Restart & Run All.
encoded_ds = encoded_ds.train_test_split(test_size=0.1, seed=42)
encoded_ds

In [None]:
def compute_metrics(eval_pred):
    """Return the Pearson correlation between predictions and labels.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predictions are
    reshaped to a flat array with one entry per row before being correlated
    with the labels.

    Returns:
        A dict with the single key 'pearson'.
    """
    raw_predictions, targets = eval_pred
    flat_predictions = raw_predictions.reshape(len(raw_predictions))
    correlation_matrix = np.corrcoef(flat_predictions, targets)
    # The off-diagonal entry is the correlation between the two inputs.
    return {'pearson': correlation_matrix[0][1]}


# Training configuration: evaluate and checkpoint once per epoch, and reload the
# checkpoint with the best `pearson` value when training finishes.
args = TrainingArguments(
    output_dir="uspppm",
    num_train_epochs=Config.epochs,
    learning_rate=Config.learning_rate,
    weight_decay=Config.weight_decay,
    per_device_train_batch_size=Config.batch_size,
    per_device_eval_batch_size=Config.batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="pearson",
)

# Bind the model, arguments, both splits and the metric function into one Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_ds["train"],
    eval_dataset=encoded_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate on the held-out split before any training, as a baseline to compare
# the fine-tuned metrics against.
trainer.evaluate()

In [None]:
# Fine-tune the model using the TrainingArguments passed to the Trainer.
trainer.train()

In [None]:
def test_process(unit, eval = False):
    """Tokenize one test row into model inputs with a placeholder label.

    The first character of ``unit['context']`` is looked up in ``table`` to get
    a section title, which is prepended to the anchor phrase. That string and
    ``unit['target']`` are passed to the tokenizer as a text pair.

    Args:
        unit: A row mapping with 'context', 'anchor' and 'target'.
        eval: Unused; kept so existing callers keep working.

    Returns:
        A dict with the tokenizer outputs plus 'label' fixed at -1, since this
        function does not read a score from the row.

    Raises:
        KeyError: If the first character of the context is not a key of ``table``.
    """
    section_title = table[unit['context'][0]]

    # Join with a space: plain concatenation fused the last word of the title
    # with the first word of the anchor (e.g. "Human Necessitiesabatement").
    first_segment = f"{section_title} {unit['anchor']}"

    return {
        **tokenizer(first_segment, unit['target']),
        'label': -1,
    }



# Load the test CSV as a `datasets.Dataset`.
test = datasets.Dataset.from_csv('../input/us-patent-phrase-to-phrase-matching/test.csv')

# Tokenize the test rows, dropping the listed raw columns after encoding.
encoded_test = test.map(
    test_process,
    remove_columns=['id', 'anchor', 'target', 'context'],
)

# Run the trainer over the encoded rows and flatten its predictions to 1-D.
outputs = trainer.predict(encoded_test)
predictions = outputs.predictions.reshape(-1)

In [None]:
# Pair each test id with its predicted score and write the result to CSV.
submission_columns = {
    'id': test['id'],
    'score': predictions,
}
submission = datasets.Dataset.from_dict(submission_columns)

submission.to_csv('submission.csv', index=False)

In [None]:
# Read the written file back and preview it as a check on its contents.
final_data = pd.read_csv('submission.csv')
final_data.head()