In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from setfit import SetFitModel, Trainer, sample_dataset, TrainingArguments
from setfit.data import Dataset
from sentence_transformers import SentenceTransformer, SentencesDataset
from sentence_transformers.losses import CosineSimilarityLoss
import torch
import pickle
# Load data: the three pre-cleaned tables (used below as pandas DataFrames), one per task.
# NOTE(review): unpickling executes code embedded in the file; only load pickles
# produced by this project.
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


connection = _load_pickle('../data/connection_clean.pkl')
subject = _load_pickle('../data/subject_clean.pkl')
objective = _load_pickle('../data/objective_clean.pkl')


### Encode Experiments

Papers:
1. SetFit: https://arxiv.org/pdf/2209.11055v1.pdf
    1. outperforms GPT-3 while being smaller: https://towardsdatascience.com/sentence-transformer-fine-tuning-setfit-outperforms-gpt-3-on-few-shot-text-classification-while-d9a3788f0b4e
2. Sentence-BERT: https://arxiv.org/abs/1908.10084


Tutorials:
- https://hackernoon.com/mastering-few-shot-learning-with-setfit-for-text-classification
- https://huggingface.co/docs/setfit/v1.0.1/en/quickstart#training
- multilabel: https://huggingface.co/docs/setfit/how_to/multilabel 

In [2]:
# (Disabled) baseline: Sentence-BERT sentence embeddings of each table's `text` column
# model_sent = SentenceTransformer('bert-base-nli-mean-tokens')
# connection['embedding_sent'] = connection['text'].apply(lambda x: model_sent.encode(x))
# subject['embedding_sent'] = subject['text'].apply(lambda x: model_sent.encode(x))
# objective['embedding_sent'] = objective['text'].apply(lambda x: model_sent.encode(x))

---

### Connection

In [3]:
# Stratified 70/30 split on the `connection` label column.
X_train, X_test, y_train, y_test = train_test_split(
    connection['text'],
    connection['connection'],
    test_size=0.3,
    random_state=42,
    stratify=connection['connection'],
)

# The split Series carry a shuffled, non-contiguous pandas index. preserve_index=False
# stops Dataset.from_pandas from storing it as an extra `__index_level_0__` column.
dataset = DatasetDict({
    "train": Dataset.from_pandas(
        pd.DataFrame({"text": X_train, "label": y_train}), preserve_index=False
    ),
    "test": Dataset.from_pandas(
        pd.DataFrame({"text": X_test, "label": y_test}), preserve_index=False
    ),
})

test_ds = dataset["test"]

# get sample - as per examples, this is 8 samples per class
train_ds = sample_dataset(dataset["train"], num_samples=8)


In [18]:
from sentence_transformers.losses import CosineSimilarityLoss
# Load the pretrained embedding checkpoint; `head_params` configures the classification head
# (solver / max_iter look like scikit-learn LogisticRegression options — TODO confirm).
model = SetFitModel.from_pretrained(
    "BAAI/bge-small-en-v1.5",
    head_params={
        "solver": "liblinear",
        "max_iter": 100,
    },
)
# Show the classification head that was just created.
model.model_head

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(y_pred, y_test):
    """Score predictions against reference labels.

    Passed to the SetFit `Trainer` as its `metric` callable, hence the
    (predictions, references) argument order.

    Args:
        y_pred: predicted labels.
        y_test: reference (gold) labels, aligned with `y_pred`.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall' and 'f1'; the last
        three are support-weighted averages over the classes.
    """
    # zero_division=0 returns the same 0.0 scikit-learn would use for a class that is
    # never predicted, but without an UndefinedMetricWarning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='weighted', zero_division=0
    )

    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [None]:
# there is a max token length as per: https://github.com/huggingface/setfit/issues/242

In [None]:
# Training configuration used by the trainers in this notebook.
# max_length is set explicitly to avoid OOM errors: https://github.com/huggingface/setfit/issues/242
# NOTE(review): evaluation_strategy="epoch" presumably only has an effect when the Trainer
# is also given an eval_dataset — confirm.
args = TrainingArguments(
    num_epochs=5,
    batch_size=8,
    evaluation_strategy="epoch",
    max_length=512,
)

In [24]:

# Build the trainer from the model, the training arguments and the sampled training set;
# `compute_metrics` is the scoring callable.
trainer = Trainer(
    model=model,
    train_dataset=train_ds,
    args=args,
    metric=compute_metrics,
)

In [25]:
# Train on the sampled training set held by the trainer.
trainer.train()
# Score the trained model on the held-out test split; the result is whatever the
# trainer's `metric` callable (`compute_metrics`) returns.
metrics = trainer.evaluate(test_ds)
print(metrics)

Generating Training Pairs: 100%|██████████| 20/20 [00:00<00:00, 197.12it/s]
***** Running training *****
  Num examples = 2560
  Num epochs = 5
  Total optimization steps = 200
  Total train batch size = 64
Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

---

### Subject

In [None]:
# Subject task: same pipeline as the connection task, trained on its own model.
X_train, X_test, y_train, y_test = train_test_split(
    subject['text'],
    subject['subject'],
    test_size=0.3,
    random_state=42,
    stratify=subject['subject'],
)

# preserve_index=False keeps the shuffled pandas index out of the Dataset
# (it would otherwise be stored as an `__index_level_0__` column).
dataset = DatasetDict({
    "train": Dataset.from_pandas(
        pd.DataFrame({"text": X_train, "label": y_train}), preserve_index=False
    ),
    "test": Dataset.from_pandas(
        pd.DataFrame({"text": X_test, "label": y_test}), preserve_index=False
    ),
})

test_ds = dataset["test"]

# get sample - as per examples, this is 8 samples per class
train_ds = sample_dataset(dataset["train"], num_samples=8)

# Start from fresh pretrained weights. `model` has already been fine-tuned on the
# connection labels, so reusing it would carry that training into this experiment
# and make the two sets of results incomparable.
subject_model = SetFitModel.from_pretrained(
    "BAAI/bge-small-en-v1.5",
    head_params={"solver": "liblinear", "max_iter": 100},
)

trainer = Trainer(
    model=subject_model,
    args=args,
    train_dataset=train_ds,
    metric=compute_metrics,
)

trainer.train()
metrics = trainer.evaluate(test_ds)
print(metrics)



---

### Objective



In [None]:
# Objective task (multi-label): the target is a block of label columns, not a single column.
X = objective['text']
# NOTE(review): assumes columns 1..13 of `objective` are the per-label indicator columns — confirm.
y = objective.iloc[:, 1:14]
# NOTE(review): with a 2-D `y`, stratification presumably treats every distinct label
# combination as its own class and fails when a combination occurs only once — confirm
# on this data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42, stratify=y)

# Each example needs ONE "label" value holding its whole indicator vector. Passing the
# multi-column DataFrame directly as a dict value does not produce that, so convert every
# row to a plain list first.
dataset = DatasetDict({
    "train": Dataset.from_pandas(
        pd.DataFrame({"text": X_train.tolist(), "label": y_train.to_numpy().tolist()}),
        preserve_index=False,
    ),
    "test": Dataset.from_pandas(
        pd.DataFrame({"text": X_test.tolist(), "label": y_test.to_numpy().tolist()}),
        preserve_index=False,
    ),
})

train_ds = dataset["train"]
test_ds = dataset["test"]

# Few-shot sampling is left disabled for this task.
# NOTE(review): sample_dataset presumably groups by a scalar label column, which would
# not work with list-valued labels — confirm before enabling.
# train_ds = sample_dataset(train_ds, num_samples = 1)
