In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from setfit import SetFitModel, SetFitTrainer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Load and clean data
df = pd.read_csv("efterlysninger_1500.csv")

# Clean label column.
# Values that cannot be parsed as numbers (including blanks) are mapped to the
# negative class 0. Report how many rows that affects so that label noise
# introduced by the coercion does not go unnoticed.
numeric_labels = pd.to_numeric(df["Kvinde?"], errors="coerce")
n_coerced = int(numeric_labels.isna().sum())
if n_coerced:
    print(f"Warning: {n_coerced} missing/non-numeric 'Kvinde?' values were set to 0")
df["Kvinde?"] = numeric_labels.fillna(0).astype(int)
df["label"] = df["Kvinde?"]

# Clean text column: drop rows without text, strip surrounding whitespace,
# then drop rows whose text is empty after stripping.
# The explicit .copy() makes the filtered frame independent of the original,
# so the column assignment below cannot trigger SettingWithCopyWarning.
df = df.dropna(subset=['Text']).copy()
df['text'] = df['Text'].astype(str).str.strip()
df = df[df['text'] != ''].copy()

print(f"Dataset size after cleaning: {len(df)}")

# Validation
# Explicit checks instead of `assert`: asserts are stripped under `python -O`,
# and the previous NaN asserts could never fail after dropna()/fillna().
if df.empty:
    raise ValueError("No rows left after cleaning")
unexpected_labels = sorted(set(df['label'].unique().tolist()) - {0, 1})
if unexpected_labels:
    # compute_metrics uses average='binary'; fail now rather than after training.
    raise ValueError(f"Labels must be 0 or 1, found: {unexpected_labels}")
print("Data validation passed!")

# Prepare dataset
# preserve_index=False: if cleaning dropped any rows, the DataFrame index is no
# longer a plain RangeIndex and from_pandas would store it as an extra
# "__index_level_0__" column next to "text" and "label".
dataset = Dataset.from_pandas(df[["text", "label"]], preserve_index=False)

# 70/30 train/test split; the fixed seed keeps the split reproducible.
dataset = dataset.train_test_split(test_size=0.3, seed=49)

train_dataset = dataset["train"]
test_dataset = dataset["test"]

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Load model
# Checkpoint id handed to SetFitModel.from_pretrained; kept in a named
# variable so it is easy to spot and swap.
pretrained_checkpoint = "JohanHeinsen/Old_News_Segmentation_SBERT_V0.1"
model = SetFitModel.from_pretrained(pretrained_checkpoint)

# Define metrics function
def compute_metrics(preds, labels):
    """Score predictions against reference labels.

    Args:
        preds: Predicted labels.
        labels: Reference (true) labels.

    Returns:
        A dict with the keys "accuracy", "f1", "precision" and "recall".
        The last three are computed with ``average='binary'``.
    """
    # sklearn scorers take (y_true, y_pred), i.e. the reverse of this
    # function's own parameter order.
    results = {"accuracy": accuracy_score(labels, preds)}

    binary_scorers = (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    for metric_name, scorer in binary_scorers:
        results[metric_name] = scorer(labels, preds, average='binary')

    return results

# Train
# Collect the trainer configuration in one place before building the trainer.
trainer_options = {
    "model": model,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
    # Callable metric: evaluate() reports whatever compute_metrics returns.
    "metric": compute_metrics,
    "batch_size": 24,
    "num_iterations": 44,
    "num_epochs": 1,
    "learning_rate": 2e-5,
}
trainer = SetFitTrainer(**trainer_options)

print("Starting training...")
trainer.train()

# Evaluate on the held-out split passed as eval_dataset.
metrics = trainer.evaluate()
print("Metrics:", metrics)

Dataset size after cleaning: 1500
Data validation passed!
Train dataset size: 1050
Test dataset size: 450


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Map: 100%|██████████| 1050/1050 [00:00<00:00, 29411.70 examples/s]


Starting training...


***** Running training *****
  Num unique pairs = 92400
  Batch size = 24
  Num epochs = 1
  0%|          | 1/3850 [00:01<1:42:56,  1.60s/it]

{'embedding_loss': 0.2644, 'grad_norm': 2.91963791847229, 'learning_rate': 0.0, 'epoch': 0.0}


  1%|▏         | 50/3850 [01:50<2:04:56,  1.97s/it]

{'embedding_loss': 0.2902, 'grad_norm': 3.143937110900879, 'learning_rate': 2.5454545454545456e-06, 'epoch': 0.01}


  3%|▎         | 100/3850 [03:30<1:49:42,  1.76s/it]

{'embedding_loss': 0.1297, 'grad_norm': 1.4747860431671143, 'learning_rate': 5.142857142857142e-06, 'epoch': 0.03}


  4%|▍         | 150/3850 [04:57<1:52:58,  1.83s/it]

{'embedding_loss': 0.0355, 'grad_norm': 0.30196502804756165, 'learning_rate': 7.74025974025974e-06, 'epoch': 0.04}


  5%|▌         | 200/3850 [06:30<2:06:36,  2.08s/it]

{'embedding_loss': 0.02, 'grad_norm': 0.8009745478630066, 'learning_rate': 1.0337662337662338e-05, 'epoch': 0.05}


  6%|▋         | 250/3850 [08:02<1:47:16,  1.79s/it]

{'embedding_loss': 0.0086, 'grad_norm': 0.4462618827819824, 'learning_rate': 1.2935064935064937e-05, 'epoch': 0.06}


  8%|▊         | 300/3850 [09:41<1:53:06,  1.91s/it]

{'embedding_loss': 0.0039, 'grad_norm': 0.03445606306195259, 'learning_rate': 1.5532467532467534e-05, 'epoch': 0.08}


  9%|▉         | 350/3850 [11:06<1:38:41,  1.69s/it]

{'embedding_loss': 0.0024, 'grad_norm': 0.018141375854611397, 'learning_rate': 1.812987012987013e-05, 'epoch': 0.09}


 10%|█         | 400/3850 [12:43<2:01:41,  2.12s/it]

{'embedding_loss': 0.0019, 'grad_norm': 0.03088948503136635, 'learning_rate': 1.991919191919192e-05, 'epoch': 0.1}


 12%|█▏        | 450/3850 [14:14<1:39:17,  1.75s/it]

{'embedding_loss': 0.0007, 'grad_norm': 0.02756504714488983, 'learning_rate': 1.963059163059163e-05, 'epoch': 0.12}


 12%|█▏        | 461/3850 [14:33<1:38:24,  1.74s/it]