In [None]:
!cp -r ../input/sentencetransformer/sentence-transformers /tmp/st
!pip install /tmp/st --quiet

Import the required libraries, define a text-normalization helper, and set the random seed.

In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import string

from tqdm import tqdm
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional

import torch
from transformers import BertTokenizer
from torch.utils.data import DataLoader, Dataset, TensorDataset, SequentialSampler, RandomSampler

# Translation table that deletes every character in string.punctuation.
# Built once at definition time instead of rebuilding a set on every call.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def normalize_text(s):
    """Lower-case ``s``, remove ASCII punctuation and collapse whitespace.

    The three steps are applied in this order: lower-casing, deletion of every
    character in ``string.punctuation``, then splitting on whitespace and
    re-joining with single spaces (which also strips leading/trailing blanks).

    Args:
        s: The text to normalize. ``None`` and float NaN (what pandas puts in
            a column for rows a left merge could not match) are treated as
            missing text.

    Returns:
        The normalized string, or ``""`` when ``s`` is ``None`` or NaN.
    """
    # NaN is the only float that is not equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    return " ".join(s.lower().translate(_PUNCT_TABLE).split())

In [None]:
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
from sentence_transformers.readers import InputExample
from sentence_transformers.cross_encoder import CrossEncoder
import torch
from torch.utils.data import DataLoader
import pandas as pd
import os
import re
import numpy as np
import math
import random

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also switches cuDNN to deterministic mode and disables its auto-tuner so
    that repeated runs pick the same convolution algorithms.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so this only
    # affects subprocesses launched after this point, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN benchmark mode may select different algorithms between runs.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

Load the data and merge in the CPC title text for each context code, as shown in https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching/discussion/314306

In [None]:
# Set DEBUG_mode to False to train on the full training file.
DEBUG_mode = True

INPUT_DIR = '../input/us-patent-phrase-to-phrase-matching/'
OUTPUT_DIR = ''

# In debug mode only the first 1000 training rows are read;
# nrows=None makes read_csv load the whole file.
train_df = pd.read_csv(INPUT_DIR + 'train.csv', nrows=1000 if DEBUG_mode else None)
test_df = pd.read_csv(INPUT_DIR + 'test.csv')
submission = pd.read_csv(INPUT_DIR + 'sample_submission.csv')

# Quick sanity check of what was loaded.
print(f"train.shape: {train_df.shape}")
print(f"test.shape: {test_df.shape}")
print(f"submission.shape: {submission.shape}")



# ====================================================
# CPC Data
# ====================================================
def get_cpc_texts():
    """Load the CPC titles table with its ``code`` column renamed to ``context``.

    Returns:
        The DataFrame read from ``../input/cpccodes/titles.csv``; the rename
        lets it be merged with frames that carry a ``context`` column.
    """
    return pd.read_csv('../input/cpccodes/titles.csv').rename(columns={"code": "context"})


cpc_texts = get_cpc_texts()
# Attach the CPC title of each row's context code.
train_df = pd.merge(train_df, cpc_texts[["context", "title"]], on="context", how="left")
# A left merge leaves NaN for context codes missing from the titles table;
# replace those with an empty string so the text normalization (and the string
# concatenation that follows) never sees a float.
train_df['title'] = train_df['title'].fillna('').apply(normalize_text)

Construct the inputs for the model:

input1 is anchor + context

input2 is target + context

In [None]:
# Both model inputs end with the same "[SEP]context:<title>" suffix; build it once.
context_suffix = '[SEP]' + 'context:' + train_df['title']
train_df['text_a'] = 'anchor:' + train_df['anchor'] + context_suffix
train_df['text_b'] = 'target:' + train_df['target'] + context_suffix

CV: group by anchor, so that all rows sharing an anchor fall into the same fold

In [None]:
# --- Cross-validation configuration -----------------------------------------
N_FOLDS = 5
N_EPOCHS = 20
val_prop = 1 / N_FOLDS  # share of the unique anchors held out in each fold
train_batch_size = 8


def _build_examples(frame):
    """Convert each row of ``frame`` into an ``InputExample``.

    Each example pairs the row's ``text_a`` and ``text_b`` and is labelled
    with the row's ``score``.
    """
    return [
        InputExample(texts=[text_a, text_b], label=float(score))
        for text_a, text_b, score in zip(frame['text_a'], frame['text_b'], frame['score'])
    ]


# Split the shuffled unique anchors into N_FOLDS disjoint groups. Every fold
# but the last holds val_sz anchors; the last one takes whatever remains.
anchors = train_df.anchor.unique()
np.random.shuffle(anchors)
val_sz = int(len(anchors) * val_prop)
val_anchors = [anchors[i * val_sz:(i + 1) * val_sz] for i in range(N_FOLDS - 1)]
val_anchors.append(anchors[(N_FOLDS - 1) * val_sz:])

# Dev-set score of each fold's model, collected after training.
k_fold_score = []

for fold in range(N_FOLDS):
    # Rows whose anchor belongs to this fold's held-out group form the dev set.
    is_val = np.isin(train_df.anchor, val_anchors[fold])
    trn_fold_df = train_df[~is_val]
    val_fold_df = train_df[is_val]
    # Compare mean labels as a rough check that the split is balanced.
    print(trn_fold_df.score.mean(), val_fold_df.score.mean())

    train_samples = _build_examples(trn_fold_df)
    dev_samples = _build_examples(val_fold_df)

    # Start every fold from the same pretrained weights.
    model = SentenceTransformer('../input/patentsberta/PatentSBERTa')

    train_dataset = SentencesDataset(train_samples, model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

    # Train the model, evaluating on the dev set once per epoch
    # (evaluation_steps equals the number of batches in one epoch).
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=N_EPOCHS,
              evaluation_steps=len(train_dataloader),
              output_path=f'fold_{fold}')

    # Record the dev score of the weights held in memory at the end of training.
    # NOTE(review): the type returned by the evaluator depends on the installed
    # sentence-transformers version (presumably a float) — confirm before aggregating.
    k_fold_score.append(evaluator(model))

# NOTE(review): after this loop `model` is the model of the LAST fold only, in its
# end-of-training state; any later cell that uses `model` relies on that.

## Inference

In [None]:
# Drop any 'title' column left by a previous run of this cell; otherwise merging
# again would produce 'title_x'/'title_y' and the lookups below would raise KeyError.
test_df = pd.merge(
    test_df.drop(columns=['title'], errors='ignore'),
    cpc_texts[["context", "title"]],
    on="context",
    how="left",
)
# Context codes missing from the titles table come back as NaN from the left
# merge; use an empty title for them so normalization and concatenation work.
test_df['title'] = test_df['title'].fillna('').apply(normalize_text)
# Same input format as used for training.
test_df['text_a'] = 'anchor:' + test_df['anchor'] + '[SEP]' + 'context:' + test_df['title']
test_df['text_b'] = 'target:' + test_df['target'] + '[SEP]' + 'context:' + test_df['title']
test_df.head()

In [None]:
# One record per test row: its id and the [anchor-side, target-side] text pair.
test_samples = [
    {"id": sample_id, 'texts': [text_a, text_b]}
    for sample_id, text_a, text_b in zip(test_df['id'], test_df['text_a'], test_df['text_b'])
]

In [None]:
# Compute embeddings and the cosine similarity of every (text_a, text_b) pair.
# All anchor-side and target-side texts are encoded in two batched calls instead
# of one encode() call per pair, and each score is stored as a plain Python float.
# NOTE(review): `model` is whatever the training cell left behind — confirm that
# this is the model intended for inference.
sims = []
if test_samples:  # keep `sims` empty rather than encoding an empty list
    sample_ids = [sample['id'] for sample in test_samples]
    texts_a = [sample['texts'][0] for sample in test_samples]
    texts_b = [sample['texts'][1] for sample in test_samples]

    embeddings_a = model.encode(texts_a, convert_to_tensor=True)
    embeddings_b = model.encode(texts_b, convert_to_tensor=True)

    # Row-wise cosine similarity: entry i compares texts_a[i] with texts_b[i].
    cosine_scores = torch.nn.functional.cosine_similarity(embeddings_a, embeddings_b, dim=1)

    sims = [
        [sample_id, score]
        for sample_id, score in zip(sample_ids, cosine_scores.cpu().tolist())
    ]

In [None]:
sample = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/sample_submission.csv')
submit_csv = pd.DataFrame(sims, columns=['id', 'score'])
# The scores may arrive as 0-d NumPy arrays (object dtype); make the column a
# real float column so it is written and summarised as numbers.
submit_csv['score'] = submit_csv['score'].astype(float)
submit_csv.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows instead of rendering the whole submission frame.
submit_csv.head()