In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from setfit import SetFitModel, SetFitTrainer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Load and clean data
df = pd.read_csv("efterlysninger_1500.csv")

# Clean label column.
# NOTE: anything that cannot be parsed as a number (blank cells included) is
# coerced to NaN and then counted as label 0, so unlabeled rows end up in
# the 0 class rather than being dropped.
df["Signalement?"] = pd.to_numeric(df["Signalement?"], errors="coerce").fillna(0).astype(int)
df["label"] = df["Signalement?"]

# Clean text column: drop rows without text, strip surrounding whitespace,
# then drop rows whose text is empty after stripping.
# The explicit .copy() keeps the column assignment below from writing into a
# possible view of the original frame (avoids SettingWithCopyWarning).
df = df.dropna(subset=["Text"]).copy()
df["text"] = df["Text"].astype(str).str.strip()
df = df[df["text"] != ""].copy()

print(f"Dataset size after cleaning: {len(df)}")

# Validation.
# Explicit raises instead of `assert`: assertions are stripped when Python
# runs with -O, which would silently disable these checks.
if df["text"].isna().any():
    raise ValueError("Still have missing text values")
if df["label"].isna().any():
    raise ValueError("Still have missing label values")

# The metrics in this file are computed with average='binary', so fail now
# (before spending time on training) if any label is not 0 or 1.
unexpected_labels = sorted(set(df["label"].unique().tolist()) - {0, 1})
if unexpected_labels:
    raise ValueError(f"Expected binary 0/1 labels, found: {unexpected_labels}")
print("Data validation passed!")

# Prepare dataset
# preserve_index=False: the cleaning steps filter rows, so `df` no longer
# carries a default RangeIndex. Without this flag the leftover pandas index
# is exported as an extra "__index_level_0__" column next to text/label.
dataset = Dataset.from_pandas(df[["text", "label"]], preserve_index=False)

# 70/30 train/test split; the fixed seed keeps the split reproducible.
dataset = dataset.train_test_split(test_size=0.3, seed=49)

train_dataset = dataset["train"]
test_dataset = dataset["test"]

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Load model: start from the pretrained checkpoint identified below.
pretrained_checkpoint = "JohanHeinsen/Old_News_Segmentation_SBERT_V0.1"
model = SetFitModel.from_pretrained(pretrained_checkpoint)

# Define metrics function
def compute_metrics(preds, labels):
    """Score predictions against reference labels.

    Args:
        preds: Predicted labels.
        labels: Reference (true) labels, aligned with ``preds``.

    Returns:
        A dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``. The last three are computed with ``average='binary'``.
    """
    # Scorers that share the same binary-averaging call signature.
    binary_scorers = {
        "f1": f1_score,
        "precision": precision_score,
        "recall": recall_score,
    }

    results = {"accuracy": accuracy_score(labels, preds)}
    for metric_name, scorer in binary_scorers.items():
        results[metric_name] = scorer(labels, preds, average='binary')
    return results

# Train
# Trainer configuration gathered in one place, then unpacked into the trainer.
trainer_options = {
    "model": model,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
    "metric": compute_metrics,
    "batch_size": 24,
    "num_iterations": 12,  # 44 is noted in the notebook as an alternative value
    "num_epochs": 1,
    "learning_rate": 2e-5,
}
trainer = SetFitTrainer(**trainer_options)

print("Starting training...")
trainer.train()

# Evaluate on the held-out split using compute_metrics and report the scores.
metrics = trainer.evaluate()
print("Metrics:", metrics)

Dataset size after cleaning: 1491
Data validation passed!
Train dataset size: 1043
Test dataset size: 448


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Map: 100%|██████████| 1043/1043 [00:00<00:00, 42948.61 examples/s]


Starting training...


***** Running training *****
  Num unique pairs = 25032
  Batch size = 24
  Num epochs = 1
  0%|          | 1/1043 [00:02<43:40,  2.51s/it]

{'embedding_loss': 0.255, 'grad_norm': 2.649540662765503, 'learning_rate': 0.0, 'epoch': 0.0}


  1%|          | 12/1043 [00:21<27:07,  1.58s/it]