In [None]:
!pip install -U sentence-transformers

In [19]:
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [29]:
import pandas as pd
import numpy as np
import torch
import json
import jsonlines

from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, InputExample
from torch.utils.data import DataLoader
from sentence_transformers.evaluation import BinaryClassificationEvaluator

In [35]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [43]:
def prepare_dataset(files):
    full_dataset = []
    for f in files:
        with jsonlines.open(f) as reader:
            for entries in reader:
                prefix = entries['prefix']
                for cont in entries['continuations']:
                    label = 0 if f.endswith("toxic") else 1
                    if cont is None or prefix is None:
                        print("None found")
                    full_dataset.append(InputExample(texts=[prefix, cont], label=label))
    
    return full_dataset

In [44]:
all_examples = prepare_dataset([
    "/kaggle/input/synthetic-data-682/toxic_to_benign.jsonl", 
    "/kaggle/input/synthetic-data-682/toxic_to_toxic.jsonl"
])

In [45]:
train_set, test_set = train_test_split(all_examples, shuffle=True, test_size=0.2)

In [46]:
train_dataloader = DataLoader(train_set, shuffle=True, batch_size=16)
train_loss = losses.ContrastiveLoss(model=model)

In [47]:
model.fit([(train_dataloader, train_loss)], show_progress_bar=True)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/100 [00:00<?, ?it/s]

In [50]:
sentences1_eval = [t.texts[0] for t in test_set]
sentences2_eval = [t.texts[1] for t in test_set]
labels_eval = [t.label for t in test_set]

In [51]:
evaluator = BinaryClassificationEvaluator(sentences1=sentences1_eval, sentences2=sentences2_eval, labels=labels_eval, batch_size= 32, show_progress_bar=True)

In [52]:
result = evaluator.compute_metrices(model)

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

In [53]:
result

{'cossim': {'accuracy': 0.9975,
  'accuracy_threshold': 0.23849987983703613,
  'f1': 0.9987484355444306,
  'f1_threshold': 0.23849987983703613,
  'precision': 1.0,
  'recall': 0.9975,
  'ap': 1.0},
 'manhattan': {'accuracy': 0.9975,
  'accuracy_threshold': 19.047836303710938,
  'f1': 0.9987484355444306,
  'f1_threshold': 19.047836303710938,
  'precision': 1.0,
  'recall': 0.9975,
  'ap': 1.0},
 'euclidean': {'accuracy': 0.9975,
  'accuracy_threshold': 1.2334997653961182,
  'f1': 0.9987484355444306,
  'f1_threshold': 1.2334997653961182,
  'precision': 1.0,
  'recall': 0.9975,
  'ap': 1.0},
 'dot': {'accuracy': 0.9975,
  'accuracy_threshold': 0.23849989473819733,
  'f1': 0.9987484355444306,
  'f1_threshold': 0.23849989473819733,
  'precision': 1.0,
  'recall': 0.9975,
  'ap': 1.0}}