### This is a very simple approach to training a cross-encoder that judges how similar the anchor is to the target. There's plenty more to try out!

Most of this is adapted from [this script here](https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/cross-encoder/training_stsbenchmark.py)

In [None]:
import math
import sys
sys.path.append("../input/sentencetransformers-main")

import pandas as pd
from torch.nn import Sigmoid
from torch.utils.data import DataLoader
from sklearn.model_selection import GroupKFold
from sentence_transformers import InputExample
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator

In [None]:
class CFG:
    """Notebook-wide settings for cross-validated cross-encoder training."""

    # Path handed to CrossEncoder as the starting checkpoint.
    model = '../input/crossencodermsmarcominilml12v2'

    # Number of GroupKFold splits; one model is trained per split.
    kfolds = 5

    # Passes over the training data for each fold.
    num_epochs = 3

    # The inputs are very short phrases, so a large batch size is affordable.
    train_batch_size = 128

In [None]:
# Load the competition training CSV; the code below reads its "anchor",
# "target" and "score" columns.
full_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv")


def make_samples(df, test=False):
    """Turn anchor/target rows into ``InputExample`` objects for a cross-encoder.

    Args:
        df: DataFrame with "anchor" and "target" columns. Unless ``test`` is
            true it must also contain a "score" column, which becomes the
            label of each example.
        test: When true, ``df`` is treated as unlabeled: the "score" column is
            not read and each example keeps ``InputExample``'s default label.

    Returns:
        A list with one ``InputExample`` per row of ``df``, in row order.
    """
    if test:
        # Unlabeled data has no "score" column, so only the text pair is set.
        return [
            InputExample(texts=[anchor, target])
            for anchor, target in df[["anchor", "target"]].values
        ]

    return [
        InputExample(texts=[anchor, target], label=score)
        for anchor, target, score in df[["anchor", "target", "score"]].values
    ]


# Group-aware split: rows are grouped by anchor, so the same anchor is never
# shared between the training and validation parts of a fold.
gkf = GroupKFold(n_splits=CFG.kfolds)

full_df["fold"] = -1  # placeholder until each row receives its validation fold
fold_col = full_df.columns.get_loc("fold")
for fold, (_, val_idx) in enumerate(gkf.split(full_df, groups=full_df["anchor"])):
    # split() yields positional row indices, so assign by position (iloc)
    # rather than by label (loc); this stays correct even if full_df does not
    # carry a default 0..n-1 index.
    full_df.iloc[val_idx, fold_col] = fold

In [None]:
# Quick dataset overview: total rows and number of distinct anchors.
n_samples = len(full_df)
n_anchors = full_df["anchor"].nunique()
print(f"There are {n_samples} training samples.")
print(f"There are {n_anchors} different anchors.")

# Per-anchor non-null counts of every column; the "fold" column is filled for
# every row, so its count is the number of rows (targets) for that anchor.
grouped = full_df.groupby("anchor").count()
mean_targets = round(grouped["fold"].mean(), 2)
print(f"There an average of {mean_targets} targets for each anchor")

In [None]:
# Preview the first ten rows, including the assigned "fold" column.
display(full_df.iloc[:10])

# Number of rows per fold (shown as the cell's output).
full_df["fold"].value_counts()

In [None]:
# Cross-validated fine-tuning: for each fold, train a fresh cross-encoder on
# the other folds and evaluate it on the held-out one.
for fold in range(CFG.kfolds):
    print(f" FOLD {fold} ".center(100, "*"))
    output_path = f"fold{fold}"

    # Rows tagged with the current fold are held out; the rest are trained on.
    held_out = full_df["fold"] == fold
    train_df = full_df[~held_out].copy().reset_index(drop=True)
    val_df = full_df[held_out].copy().reset_index(drop=True)

    train_samples = make_samples(train_df)
    eval_samples = make_samples(val_df)

    train_dataloader = DataLoader(
        train_samples,
        shuffle=True,
        batch_size=CFG.train_batch_size,
    )
    evaluator = CECorrelationEvaluator.from_input_examples(eval_samples)

    # Warm up over 10% of all training steps (batches per epoch x epochs).
    total_steps = len(train_dataloader) * CFG.num_epochs
    warmup_steps = math.ceil(total_steps * 0.1)

    # num_labels=1: the model produces a single score per phrase pair.
    model = CrossEncoder(CFG.model, num_labels=1)
    model.fit(
        train_dataloader=train_dataloader,
        evaluator=evaluator,
        epochs=CFG.num_epochs,
        warmup_steps=warmup_steps,
        output_path=output_path,
    )

    # Show the evaluator's results file from this fold's output directory,
    # without the "steps" column.
    results_path = f"{output_path}/CECorrelationEvaluator_results.csv"
    results = pd.read_csv(results_path)
    results = results.drop(columns=["steps"])
    display(results)

# Test set

In [None]:
# Score the test set with every fold's model and average the predictions.
test_df = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")

all_predictions = None

for fold in range(CFG.kfolds):
    # Load the model stored in this fold's training output directory.
    output_path = f"fold{fold}"
    model = CrossEncoder(output_path)

    # The models were built with num_labels=1, so each pair yields a single
    # logit that Sigmoid maps into (0, 1). apply_softmax is deliberately not
    # passed: a softmax over one logit is always 1.0, so the flag is either
    # ignored by the library or would flatten every score.
    predictions = model.predict(
        sentences=test_df[["anchor", "target"]].values,
        activation_fct=Sigmoid(),
        batch_size=1024,
    )

    # Running mean: each fold contributes an equal 1/kfolds share.
    fold_share = predictions / CFG.kfolds
    if all_predictions is None:
        all_predictions = fold_share
    else:
        all_predictions = all_predictions + fold_share

In [None]:
# Pair each test id with its fold-averaged score and write the submission file.
submission_columns = {
    "id": test_df["id"],
    "score": all_predictions,
}
sub_df = pd.DataFrame(submission_columns)

# index=False keeps the row index out of the CSV.
sub_df.to_csv("submission.csv", index=False)

# Show the submission frame as the cell's output.
sub_df