In [1]:
import pandas as pd
import string
import re

data = pd.read_csv('data_with_internal_info.csv', low_memory=False)
print(f"Rows before cleaning: {len(data)}")

columns_to_combine = [
    'Domain (en)', 'Specialisation (en)', 'Career ladder en', 'Level Title',
    'HIERARCHIC MANAGEMENT', 'FUNCTIONAL MANAGEMENT', 'Required experience',
    'Innovation', 'Is he/ she responsible for a certain budget/ figure?',
    'Specify the budget amounts.', 'Diploma', 'Speciality',
    'Row 4 - Column 1', 'Row 5 - Column 1', 'Row 6 - Column 1',
    'Row 7 - Column 1', 'Row 8 - Column 1', 'Row 9 - Column 1',
    'Internal Job Grade', 'Department'
]

def _clean_token(s: str) -> str:
    table = str.maketrans("", "", string.punctuation)
    s = s.lower().translate(table)
    s = re.sub(r"[^a-zA-Z0-9._\s]", "", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

data[columns_to_combine] = data[columns_to_combine].fillna("").astype(str)
data["description"] = data[columns_to_combine].agg(" ".join, axis=1).apply(_clean_token)
data["Reference Job"] = data["Reference Job"].astype(str).apply(_clean_token)

before = len(data)
data = data.drop_duplicates(subset="description", keep="first")
after = len(data)

print(f"Rows before dedup: {before}")
print(f"Rows after dedup: {after}")


Rows before cleaning: 235277
Rows before dedup: 235277
Rows after dedup: 68963


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np, torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support,classification_report
from datasets import Dataset

X = data["description"]
y = data["Reference Job"]

label_encoder = LabelEncoder().fit(y)

X_train, X_test, y_train_str, y_test_str = train_test_split(X, y,test_size=0.40,random_state=42,stratify=y)

y_train = label_encoder.transform(y_train_str)
y_test  = label_encoder.transform(y_test_str)

print("Classes:", label_encoder.classes_)
print("Train shape:", X_train.shape, y_train.shape)
print("Test shape: ", X_test.shape, y_test.shape)

Classes: ['aankoper' 'account manager' 'administratief bediende' 'agile coach'
 'allround technieker' 'analist developer' 'animator' 'apotheker adjunct'
 'apotheker assistent' 'apotheker titularis' 'assistent filiaalleider'
 'assistent laborant' 'assistent verkoopadministratieorderverwerking'
 'audit manager' 'bediende aankoop' 'bediende boekhouding'
 'bediende customer service 1ste lijn'
 'bediende customer service 2de lijn' 'bediende distributie'
 'bediende logistiek' 'bediende marketing' 'bediende personeelsdienst'
 'bediende productieplanning'
 'bediende verkoopadministratieorderverwerking' 'bediende voorraadbeheer'
 'bedrijfsarts' 'bedrijfsverpleegkundige' 'beleidsmedewerker'
 'bidcontracts negotiations manager' 'big data analist' 'big data manager'
 'boekhouder' 'bouwadviseurconsultant' 'brand manager'
 'business analistconsultant' 'business development manager'
 'business intelligence consultant' 'business unit manager' 'calculator'
 'category manager' 'change expert' 'chapter l

In [3]:
# Approach1: fine-tuned Bert model without data augmentation
from transformers import (RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed)
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

def tok(batch):
    return tokenizer(batch["text"],
                     truncation=True,
                     padding="max_length",
                     max_length=256)
train_ds = Dataset.from_pandas(pd.DataFrame({"text": X_train, "label": y_train})).map(tok, batched=True, remove_columns=["text"])

test_ds = Dataset.from_pandas(pd.DataFrame({"text": X_test, "label": y_test})).map(tok, batched=True, remove_columns=["text"])

def evaluate_metrics(labels, preds, target_names=None, print_report=False):
    metrics = {
        "accuracy": accuracy_score(labels, preds),
        "macro_precision": precision_score(labels, preds, average="macro", zero_division=0),
        "macro_recall": recall_score(labels, preds, average="macro", zero_division=0),
        "macro_f1": f1_score(labels, preds, average="macro", zero_division=0),
        "weighted_precision": precision_score(labels, preds, average="weighted", zero_division=0),
        "weighted_recall": recall_score(labels, preds, average="weighted", zero_division=0),
        "weighted_f1": f1_score(labels, preds, average="weighted", zero_division=0),
    }
    if print_report and target_names is not None:
        print(classification_report(labels, preds, target_names=target_names, digits=3))
    return metrics

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return evaluate_metrics(labels, preds)

# 5. Load model
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_encoder.classes_)
)

training_args = TrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    num_train_epochs=2,
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10000,
    save_steps=1000,
    fp16=True,
    seed=42,
)

set_seed(42)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics
)


trainer.train()
eval_metrics = trainer.evaluate()
print("Trainer evaluation metrics:", eval_metrics)

pred_output = trainer.predict(test_ds)
logits = pred_output.predictions
labels = pred_output.label_ids
preds = np.argmax(logits, axis=-1)

print("\nDetailed classification report:")
metrics = evaluate_metrics(labels, preds, target_names=label_encoder.classes_, print_report=True)
print("Summary metrics:", metrics)

2025-11-13 13:30:39.750764: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763040639.769517  454487 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763040639.775369  454487 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1763040639.791242  454487 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763040639.791261  454487 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763040639.791264  454487 computation_placer.cc:177] computation placer alr

Map:   0%|          | 0/41377 [00:00<?, ? examples/s]

Map:   0%|          | 0/27586 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
10000,1.3202


Trainer evaluation metrics: {'eval_loss': 0.2635801136493683, 'eval_accuracy': 0.9640034800261001, 'eval_macro_precision': 0.7882038811394252, 'eval_macro_recall': 0.8128539883938931, 'eval_macro_f1': 0.7943158445078622, 'eval_weighted_precision': 0.94191622471224, 'eval_weighted_recall': 0.9640034800261001, 'eval_weighted_f1': 0.9503891989159339, 'eval_runtime': 102.0155, 'eval_samples_per_second': 270.41, 'eval_steps_per_second': 33.809, 'epoch': 2.0}

Detailed classification report:
                                                precision    recall  f1-score   support

                                      aankoper      1.000     1.000     1.000       191
                               account manager      1.000     1.000     1.000       407
                       administratief bediende      1.000     1.000     1.000       557
                                   agile coach      1.000     1.000     1.000        23
                           allround technieker      0.595     1.000 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [4]:
#finding minor classes to enrich them
threshold = 50

ser = pd.Series(y_train_str)
counts = ser.value_counts()

minority_labels = counts[counts < threshold].index.tolist()

print("Labels with fewer than", threshold, "samples:")
for lbl in minority_labels:
    print(f"  • {lbl}: {counts[lbl]} samples")

Labels with fewer than 50 samples:
  • manager productontwikkeling: 49 samples
  • information security officer: 49 samples
  • projectmanager bouw: 49 samples
  • digital strategist: 49 samples
  • milieucordinator niveau a: 49 samples
  • research assistant: 47 samples
  • directeur logistiek supply chain: 47 samples
  • fiscalist: 47 samples
  • specialist distributie: 44 samples
  • pricing specialist: 44 samples
  • manager voorraadbeheer: 43 samples
  • security manager: 43 samples
  • international exportsales manager: 42 samples
  • projectleider process engineering: 42 samples
  • sustainability manager: 41 samples
  • calculator: 41 samples
  • audit manager: 41 samples
  • sales promotormerchandiser: 40 samples
  • chauffeur: 40 samples
  • expediteur: 40 samples
  • general consultant: 40 samples
  • webmaster: 40 samples
  • data protection officer: 39 samples
  • treasurer: 38 samples
  • senior research scientist: 36 samples
  • verpleegkundige gegradueerd: 36 samples
  

In [8]:
#Approach2: bert+ data augmentation with gemeni with a simple prompt

import google.generativeai as genai
import google.auth
KEY_FILE_PATH = 'cloudrun_secret.json'
try:
    credentials, project_id = google.auth.load_credentials_from_file(KEY_FILE_PATH)
    genai.configure(credentials=credentials)
except FileNotFoundError:
    print(f"Error: Key file '{KEY_FILE_PATH}' not found.")
except Exception as e:
    print(f"Error loading credentials: {e}")


generation_config = {
    "temperature": 0.7,
    "top_p": 0.95,
    "top_k": 40,
    "max_output_tokens": 150,
}

gen_model = genai.GenerativeModel(
    "gemini-2.0-flash",generation_config=generation_config)

label_counts = data["Reference Job"].value_counts()
minority_labels = label_counts[label_counts < threshold].index.tolist()

print(f"Found {len(minority_labels)} minority labels to augment.")

synthetic_rows = []

for lbl in minority_labels:
    current = label_counts[lbl]
    needed = threshold - current

    for _ in range(needed):
        prompt = (
            f"Write an approximately 80–100 word job description for the role titled '{lbl}'. "
            "Include key responsibilities, required qualifications, and typical experience."
        )

        try:
            resp = gen_model.generate_content(prompt, generation_config=generation_config)
            text_content = getattr(resp, "text", "").strip()

            if not text_content:
                retry_config = dict(generation_config)
                retry_config["temperature"] = 0.3
                retry_config["max_output_tokens"] = 120
                resp_retry = gen_model.generate_content(prompt, generation_config=retry_config)
                text_content = getattr(resp_retry, "text", "").strip()

            if text_content:
                synthetic_rows.append({
                    "text": text_content,
                    "Reference Job": lbl
                })
                print(f"Successfully generated data for: {lbl}")
            else:
                print(f"Still no output for '{lbl}' (model refused).")

        except Exception as e:
            print(f"Error for '{lbl}': {e}")



synthetic_df = pd.DataFrame(synthetic_rows)

train_df = pd.DataFrame({
    "text": X_train, 
    "Reference Job": y_train_str })

# Combine original training data with new synthetic data
if not synthetic_df.empty:
    augmented_train_df = pd.concat([train_df, synthetic_df], ignore_index=True)
    print(f"Augmented training data. New size: {len(augmented_train_df)} rows.")
else:
    augmented_train_df = train_df.copy()
    print("No synthetic data was generated. Using original training data.")

augmented_train_df.to_csv("augmented_train_df.csv", index=False)

augmented_train_df["label"] = label_encoder.transform(augmented_train_df["Reference Job"])


def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=256)

train_ds = Dataset.from_pandas(augmented_train_df[["text", "label"]]).map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"])

if 'test_ds' not in locals():
    print("Warning: 'test_ds' is not defined. Evaluation will fail.")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    f1_weighted = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )[2]
    
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision_macro": precision,
        "recall_macro": recall,
        "f1_macro": f1,
        "f1_weighted": f1_weighted,
    }


model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_encoder.classes_) 
)

training_args = TrainingArguments(
    output_dir="./aug_results",
    do_train=True,
    do_eval=True,
    save_strategy="no",
    num_train_epochs=2,
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10000,
    fp16=True,
    seed=42,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics)

trainer.train()

metrics = trainer.evaluate()
print(metrics)

preds_output = trainer.predict(test_ds)
preds = np.argmax(preds_output.predictions, axis=-1)
print(classification_report(preds_output.label_ids,preds,target_names=label_encoder.classes_, zero_division=0))

Found 79 minority labels to augment.
Successfully generated data for: labo manager
Successfully generated data for: hoofdverpleegkundige
Successfully generated data for: manager marktanalysemarktonderzoek
Successfully generated data for: manager marktanalysemarktonderzoek
Successfully generated data for: manager marktanalysemarktonderzoek
Successfully generated data for: manager milieu
Successfully generated data for: manager milieu
Successfully generated data for: manager milieu
Successfully generated data for: manager milieu
Successfully generated data for: experienced general consultant
Successfully generated data for: experienced general consultant
Successfully generated data for: experienced general consultant
Successfully generated data for: experienced general consultant
Successfully generated data for: digital project manager
Successfully generated data for: digital project manager
Successfully generated data for: digital project manager
Successfully generated data for: digital

Map:   0%|          | 0/43318 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10000,1.4867


{'eval_loss': 0.27171507477760315, 'eval_accuracy': 0.9690422678170086, 'eval_precision_macro': 0.8429989009984804, 'eval_recall_macro': 0.8592422111702411, 'eval_f1_macro': 0.8458099634764943, 'eval_f1_weighted': 0.9578440292187652, 'eval_runtime': 101.7068, 'eval_samples_per_second': 271.231, 'eval_steps_per_second': 33.911, 'epoch': 2.0}
                                                precision    recall  f1-score   support

                                      aankoper       1.00      1.00      1.00       191
                               account manager       1.00      1.00      1.00       407
                       administratief bediende       1.00      1.00      1.00       557
                                   agile coach       0.00      0.00      0.00        23
                           allround technieker       0.61      1.00      0.76        22
                             analist developer       1.00      1.00      1.00       268
                                      an

In [None]:
# Approach3: bert+ data augmentation with gemeni with a CoT prompt
few_shot_examples = """
Below are a couple of examples of how to generate job description for a job title job title. Your task is to follow the same logic.
Example 1:
Job Title: 'Senior Software Engineer'

Step-by-step thinking:
- 

Generated Output:
Technology | Backend Development | Expert | Senior | No | Yes, mentoring junior developers | 5 | High – contributes to architecture | No | N/A | Bachelor's Degree | Computer Science | Skill: Java/Spring | Skill: Cloud Platforms (AWS/Azure) | Skill: CI/CD Pipelines | Process: Agile/Scrum | N/A | N/A | 12 | Senior Software Engineer | Engineering

---

Example 2:
Job Title: 'Marketing Manager'

Step-by-step thinking:
- Domain: Sales & Marketing
- Specialisation: Digital Campaigns
- Career level: Manager
- Level title: Manager
- Hierarchical Management: Yes, manages a team of 3
- Functional Management: Yes, leads campaign strategy
- Experience: 4 years
- Innovation: Medium – develops new campaign ideas
- Budget Responsibility: Yes
- Budget Details: 75,000 EUR
- Diploma: Master's Degree in Marketing
- Key Skills: Google Analytics, Salesforce Marketing Cloud, SEO/SEM, Content Marketing
- Internal Job Grade: 11
- Department: Marketing

Generated Output:
Sales & Marketing | Digital Campaign Management | Manager | Manager | Yes, manages a team of 3 | Yes | 4 | Medium – develops new campaign ideas | Yes | 75000 EUR | Master's Degree | Marketing or Business Administration | Tool: Google Analytics | Tool: Salesforce Marketing Cloud | Skill: SEO/SEM Strategy | Skill: Content Marketing | N/A | N/A | 11 | Marketing Manager | Marketing"""


label_counts = data["Reference Job"].value_counts()
minority_labels = label_counts[label_counts < threshold].index.tolist()

# Generate synthetic samples
synthetic_rows = []

for lbl in minority_labels:
    current = label_counts[lbl]
    needed = threshold - current
    print(f"Generating {needed} CoT samples for: {lbl}")

    for _ in range(needed):
        prompt = (
            f"{few_shot_examples}\n\n"
            "Now it's your turn.\n"
            f"Think step-by-step and then generate the structured string "
            f"in the same format for '{lbl}':"
        )

        try:
            # First attempt
            resp = gen_model.generate_content(prompt)
            text_content = getattr(resp, "text", "").strip()

            # Retry if empty
            if not text_content:
                retry_cfg = dict(generation_config)
                retry_cfg["temperature"] = 0.3
                retry_cfg["max_output_tokens"] = 180

                resp_retry = gen_model.generate_content(
                    prompt,
                    generation_config=retry_cfg
                )
                text_content = getattr(resp_retry, "text", "").strip()

            if text_content:
                synthetic_rows.append({
                    "text": text_content,
                    "Reference Job": lbl
                })
                print(f" Successfully generated structured CoT sample for {lbl}")
            else:
                print(f"Failed to generate output for: {lbl}")

        except Exception as e:
            print(f"Error generating for '{lbl}': {e}")


synthetic_df = pd.DataFrame(synthetic_rows)

# Combine with original training data
train_df = pd.DataFrame({
    "text": X_train.values,
    "Reference Job": y_train_str.values
})
augmented_train_df = pd.concat([train_df, synthetic_df], ignore_index=True)
augmented_train_df.to_csv("augmented_train_few_shot.csv", index=False)

augmented_train_df["label"] = label_encoder.transform(augmented_train_df["Reference Job"])


train_ds = Dataset.from_pandas(augmented_train_df[["text", "label"]]).map(tokenize_fn, batched=True, remove_columns=["text"])

training_args = TrainingArguments(
    output_dir="./aug_results",
    do_train=True,
    do_eval=True,
    save_strategy="no",
    num_train_epochs=2,
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10000,
    fp16=True,
    seed=42,)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,)
trainer.train()

metrics = trainer.evaluate()
print(metrics)

preds_output = trainer.predict(test_ds)
preds = np.argmax(preds_output.predictions, axis=-1)
print(classification_report(
    preds_output.label_ids,
    preds,
    target_names=label_encoder.classes_, 
    zero_division=0))

Generating 1 CoT samples for: labo manager
 Successfully generated structured CoT sample for labo manager
Generating 1 CoT samples for: hoofdverpleegkundige
 Successfully generated structured CoT sample for hoofdverpleegkundige
Generating 3 CoT samples for: manager marktanalysemarktonderzoek
 Successfully generated structured CoT sample for manager marktanalysemarktonderzoek
 Successfully generated structured CoT sample for manager marktanalysemarktonderzoek
 Successfully generated structured CoT sample for manager marktanalysemarktonderzoek
Generating 4 CoT samples for: manager milieu
 Successfully generated structured CoT sample for manager milieu
 Successfully generated structured CoT sample for manager milieu
 Successfully generated structured CoT sample for manager milieu
 Successfully generated structured CoT sample for manager milieu
Generating 4 CoT samples for: experienced general consultant
 Successfully generated structured CoT sample for experienced general consultant
 Succ

Map:   0%|          | 0/43318 [00:00<?, ? examples/s]

Step,Training Loss
10000,0.197


{'eval_loss': 0.07631072402000427, 'eval_accuracy': 0.9784310882331617, 'eval_precision_macro': 0.8957464311454745, 'eval_recall_macro': 0.9111434264559692, 'eval_f1_macro': 0.8995379028473348, 'eval_f1_weighted': 0.9716036716523028, 'eval_runtime': 101.9061, 'eval_samples_per_second': 270.7, 'eval_steps_per_second': 33.845, 'epoch': 2.0}
                                                precision    recall  f1-score   support

                                      aankoper       1.00      1.00      1.00       191
                               account manager       1.00      1.00      1.00       407
                       administratief bediende       1.00      1.00      1.00       557
                                   agile coach       1.00      1.00      1.00        23
                           allround technieker       0.61      1.00      0.76        22
                             analist developer       1.00      1.00      1.00       268
                                      anim