In [None]:
!cp -r ../input/sentencetransformer/sentence-transformers /tmp/st
!pip install /tmp/st --quiet

Import the required libraries and set the random seed.

In [None]:
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
from sentence_transformers.readers import InputExample
from sentence_transformers.cross_encoder import CrossEncoder
import torch
from torch.utils.data import DataLoader
import pandas as pd
import os
import re
import numpy as np
import math
import random

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    every CUDA device), and puts cuDNN into deterministic, non-autotuning mode.

    Args:
        seed: Integer seed applied to each generator.
    """
    random.seed(seed)
    # The running interpreter fixed its hash seed at startup; this value is
    # only picked up by processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU, not just the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

Import the data and combine it with the CPC data, as shown in https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching/discussion/314306

In [None]:
# Debug switch: when truthy, only the first 1000 rows of train.csv are read.
DEBUG_mode = True

INPUT_DIR = '../input/us-patent-phrase-to-phrase-matching/'
OUTPUT_DIR = ''

# nrows=None makes read_csv load the whole file.
train = pd.read_csv(INPUT_DIR + 'train.csv', nrows=1000 if DEBUG_mode else None)
test = pd.read_csv(INPUT_DIR + 'test.csv')
submission = pd.read_csv(INPUT_DIR + 'sample_submission.csv')

# One line per frame, in the order train / test / submission.
print(
    f"train.shape: {train.shape}",
    f"test.shape: {test.shape}",
    f"submission.shape: {submission.shape}",
    sep="\n",
)



# ====================================================
# CPC Data
# ====================================================
def _find_cpc_title(code, title_text):
    """Return the text that follows ``code`` and two tabs in ``title_text``.

    Only the first occurrence is used. Raises IndexError when there is none.
    """
    match = re.search(f'{re.escape(code)}\t\t(.+)', title_text)
    if match is None:
        raise IndexError(f'no title found for CPC code {code!r}')
    # Take the captured title directly. str.lstrip(prefix) must not be used to
    # drop the code: it strips a *set of characters*, which also eats leading
    # title letters (e.g. "C\t\tCHEMISTRY" became "HEMISTRY").
    return match.group(1)


def get_cpc_texts(scheme_dir='../input/cpc-data/CPCSchemeXML202105',
                  title_dir='../input/cpc-data/CPCTitleList202202'):
    """Build a lookup from CPC context code to a descriptive title string.

    Context codes (an uppercase letter followed by digits) are collected from
    the file names found in ``scheme_dir``. For each section letter the file
    ``cpc-section-<letter>_20220201.txt`` in ``title_dir`` is read, and every
    code starting with that letter is mapped to
    ``"<section title>. <code title>"``.

    Args:
        scheme_dir: Directory whose file names contain the context codes.
        title_dir: Directory holding the per-section title list files.

    Returns:
        dict mapping each context code to its combined title text.

    Raises:
        IndexError: If a section letter or context code has no title entry.
        OSError: If a directory or title file cannot be read.
    """
    code_pattern = re.compile(r'[A-Z]\d+')
    contexts = sorted({
        code
        for file_name in os.listdir(scheme_dir)
        for code in code_pattern.findall(file_name)
    })
    results = {}
    for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        title_path = os.path.join(title_dir, f'cpc-section-{cpc}_20220201.txt')
        with open(title_path, encoding='utf-8') as f:
            s = f.read()
        section_title = _find_cpc_title(cpc, s)
        for context in contexts:
            if context[0] == cpc:
                results[context] = section_title + ". " + _find_cpc_title(context, s)
    return results


# Build the CPC code -> title lookup, save it with torch.save, and attach the
# title text to every training row via its context code.
cpc_texts = get_cpc_texts()
torch.save(cpc_texts, OUTPUT_DIR+"cpc_texts.pth")
train['context_text'] = train['context'].map(cpc_texts)
# Series.map leaves NaN for codes missing from the lookup, and that NaN would
# carry over into any text built from this column; fail loudly here instead.
unmapped_contexts = train.loc[train['context_text'].isna(), 'context'].unique()
assert len(unmapped_contexts) == 0, f"contexts without a CPC title: {list(unmapped_contexts)}"

Construct the inputs for the model.

input1 is anchor + context

input2 is target + context

In [None]:
# Both model inputs end with the same context segment, so build it once.
context_suffix = '[SEP]' + 'context:' + train['context_text']
train['text_a'] = 'anchor:' + train['anchor'] + context_suffix
train['text_b'] = 'target:' + train['target'] + context_suffix

CV: group by anchor

In [None]:
# Shuffle the unique anchors and cut them into five disjoint groups, so that
# all rows sharing an anchor fall into the same validation fold.
anchors = train.anchor.unique()
np.random.shuffle(anchors)
val_prop = 0.20
val_sz = int(len(anchors)*val_prop)
# Four groups of val_sz anchors each; the fifth takes whatever remains.
val_anchors = np.split(anchors, [val_sz * k for k in range(1, 5)])

k_fold_score = []

train_batch_size = 8


def _to_examples(frame):
    """Turn each row of ``frame`` into InputExample(texts=[text_a, text_b], label=score)."""
    return [
        InputExample(texts=[text_a, text_b], label=score)
        for text_a, text_b, score in zip(frame['text_a'], frame['text_b'], frame['score'])
    ]


for fold, fold_anchors in enumerate(val_anchors):
    # Positional row indices of the validation rows and of the remaining rows.
    is_val = np.isin(train.anchor, fold_anchors)
    idxs = np.arange(len(train))
    val_idxs, trn_idxs = idxs[is_val], idxs[~is_val]
    trn_frame = train.iloc[trn_idxs]
    val_frame = train.iloc[val_idxs]
    # Mean score of the training part, then of the validation part.
    print(trn_frame.score.mean(), val_frame.score.mean())

    train_samples = _to_examples(trn_frame)
    dev_samples = _to_examples(val_frame)

    # A fresh pretrained model is loaded for every fold.
    model = SentenceTransformer('AI-Growth-Lab/PatentSBERTa')

    train_dataset = SentencesDataset(train_samples, model)
    train_dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        batch_size=train_batch_size,
    )
    train_loss = losses.CosineSimilarityLoss(model=model)

    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

    # Train the model; evaluation_steps is set to the number of batches in the
    # dataloader, and output goes to a per-fold directory.
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=evaluator,
        epochs=5,
        evaluation_steps=len(train_dataloader),
        output_path=f'fold_{fold}',
    )