In [1]:
import pandas as pd
import string
import re
data=pd.read_csv('data_with_internal_info.csv', low_memory=False)
rows, columns = data.shape
print("Number of rows:", rows)
print("Number of columns:", columns)

Number of rows: 235277
Number of columns: 60


In [2]:
columns_to_combine = ['Domain (en)','Specialisation (en)', 'Career ladder en','Level Title', 'HIERARCHIC MANAGEMENT',
'FUNCTIONAL MANAGEMENT', 'Required experience', 'Innovation','Is he/ she responsible for a certain budget/ figure?','Specify the budget amounts.','Diploma', 'Speciality', 'Row 4 - Column 1', 'Row 5 - Column 1',
    'Row 6 - Column 1', 'Row 7 - Column 1', 'Row 8 - Column 1',
    'Row 9 - Column 1', 'Internal Job Grade','Department'
]
data["description"] = (
    data[columns_to_combine].fillna("").astype(str).agg(" | ".join, axis=1))

print(f"Rows before deduplication: {len(data)}")
data = data.drop_duplicates(subset="description", keep="first")
print(f"Rows after deduplication: {len(data)}")

Rows before deduplication: 235277
Rows after deduplication: 69351


In [3]:


def _clean_token(s: str) -> str:
    table = str.maketrans("", "", string.punctuation)
    s = s.lower().translate(table)          
    s = re.sub(r"[^a-zA-Z0-9._\s]", "", s)  
    s = re.sub(r"\s+", " ", s).strip()
    return s

df = data.copy()                             
df[columns_to_combine] = df[columns_to_combine].fillna("").astype(str)

combined = df[columns_to_combine].agg(" ".join, axis=1)   # series of strings
df["description"] = combined.apply(_clean_token)
df["Reference Job"] = df["Reference Job"].astype(str).apply(_clean_token)
before = len(df)
df = df.drop_duplicates(subset="description", keep="first")
after = len(df)

print(f"Rows before dedup : {before}")
print(f"Rows after  dedup : {after}")

Rows before dedup : 69351
Rows after  dedup : 68963


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np, torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support,classification_report
from datasets import Dataset

X = df["description"]
y = df["Reference Job"]

label_encoder = LabelEncoder().fit(y)

X_train, X_test, y_train_str, y_test_str = train_test_split(X, y,test_size=0.40,random_state=42,stratify=y)

y_train = label_encoder.transform(y_train_str)
y_test  = label_encoder.transform(y_test_str)

print("Classes:", label_encoder.classes_)
print("Train shape:", X_train.shape, y_train.shape)
print("Test shape: ", X_test.shape, y_test.shape)

Classes: ['aankoper' 'account manager' 'administratief bediende' 'agile coach'
 'allround technieker' 'analist developer' 'animator' 'apotheker adjunct'
 'apotheker assistent' 'apotheker titularis' 'assistent filiaalleider'
 'assistent laborant' 'assistent verkoopadministratieorderverwerking'
 'audit manager' 'bediende aankoop' 'bediende boekhouding'
 'bediende customer service 1ste lijn'
 'bediende customer service 2de lijn' 'bediende distributie'
 'bediende logistiek' 'bediende marketing' 'bediende personeelsdienst'
 'bediende productieplanning'
 'bediende verkoopadministratieorderverwerking' 'bediende voorraadbeheer'
 'bedrijfsarts' 'bedrijfsverpleegkundige' 'beleidsmedewerker'
 'bidcontracts negotiations manager' 'big data analist' 'big data manager'
 'boekhouder' 'bouwadviseurconsultant' 'brand manager'
 'business analistconsultant' 'business development manager'
 'business intelligence consultant' 'business unit manager' 'calculator'
 'category manager' 'change expert' 'chapter l

In [None]:
# Approach1: fine-tuned Bert model without data augmentation
from transformers import (RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed)
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)

tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

def tok(batch):
    return tokenizer(batch["text"],
                     truncation=True,
                     padding="max_length",
                     max_length=256)
train_ds = Dataset.from_pandas(pd.DataFrame({"text": X_train, "label": y_train})).map(tok, batched=True, remove_columns=["text"])

test_ds = Dataset.from_pandas(pd.DataFrame({"text": X_test, "label": y_test})).map(tok, batched=True, remove_columns=["text"])

def evaluate_metrics(labels, preds, target_names=None, print_report=False):
    metrics = {
        "accuracy": accuracy_score(labels, preds),
        "macro_precision": precision_score(labels, preds, average="macro", zero_division=0),
        "macro_recall": recall_score(labels, preds, average="macro", zero_division=0),
        "macro_f1": f1_score(labels, preds, average="macro", zero_division=0),
        "weighted_precision": precision_score(labels, preds, average="weighted", zero_division=0),
        "weighted_recall": recall_score(labels, preds, average="weighted", zero_division=0),
        "weighted_f1": f1_score(labels, preds, average="weighted", zero_division=0),
    }
    if print_report and target_names is not None:
        print(classification_report(labels, preds, target_names=target_names, digits=3))
    return metrics

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return evaluate_metrics(labels, preds)

# 5. Load model
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_encoder.classes_)
)

training_args = TrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    num_train_epochs=2,
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10000,
    save_steps=1000,
    fp16=True,
    seed=42,
)

set_seed(42)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics
)


trainer.train()
eval_metrics = trainer.evaluate()
print("Trainer evaluation metrics:", eval_metrics)

pred_output = trainer.predict(test_ds)
logits = pred_output.predictions
labels = pred_output.label_ids
preds = np.argmax(logits, axis=-1)

print("\nDetailed classification report:")
metrics = evaluate_metrics(labels, preds, target_names=label_encoder.classes_, print_report=True)
print("Summary metrics:", metrics)

2025-11-13 04:27:25.526013: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763008045.549256  275173 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763008045.557077  275173 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1763008045.577689  275173 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763008045.577717  275173 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1763008045.577720  275173 computation_placer.cc:177] computation placer alr

Map:   0%|          | 0/41377 [00:00<?, ? examples/s]

Map:   0%|          | 0/27586 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
10000,1.3119


Trainer evaluation metrics: {'eval_loss': 0.26475799083709717, 'eval_accuracy': 0.9671210034075256, 'eval_macro_precision': 0.8144150571127357, 'eval_macro_recall': 0.8313584901481199, 'eval_macro_f1': 0.8168491502255224, 'eval_weighted_precision': 0.9478777513384655, 'eval_weighted_recall': 0.9671210034075256, 'eval_weighted_f1': 0.9550327446690609, 'eval_runtime': 103.0036, 'eval_samples_per_second': 267.816, 'eval_steps_per_second': 33.484, 'epoch': 2.0}

Detailed classification report:
                                                precision    recall  f1-score   support

                                      aankoper      1.000     1.000     1.000       191
                               account manager      1.000     1.000     1.000       407
                       administratief bediende      1.000     1.000     1.000       557
                                   agile coach      1.000     1.000     1.000        23
                           allround technieker      0.564     1.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
#finding minor classes to enrich them
threshold = 30

ser = pd.Series(y_train_str)
counts = ser.value_counts()

minority_labels = counts[counts < threshold].index.tolist()

print("Labels with fewer than", threshold, "samples:")
for lbl in minority_labels:
    print(f"  • {lbl}: {counts[lbl]} samples")

Labels with fewer than 30 samples:
  • labo manager: 29 samples
  • hoofdverpleegkundige: 29 samples
  • manager marktanalysemarktonderzoek: 28 samples
  • manager milieu: 28 samples
  • experienced general consultant: 28 samples
  • experienced consultant accounting: 26 samples
  • kok: 26 samples
  • manager advertising promotion: 26 samples
  • digital project manager: 26 samples
  • events manager: 26 samples
  • technical writer marketing: 25 samples
  • constructie manager: 25 samples
  • consultant accounting: 25 samples
  • directeur kwaliteit: 25 samples
  • directeur aankoop: 25 samples
  • employer branding officer: 24 samples
  • pricing manager: 24 samples
  • experienced consultant legal: 23 samples
  • wellbeing specialist: 23 samples
  • experienced consultant tax: 22 samples
  • projectontwikkelaar bouw: 21 samples
  • zorgkundige: 21 samples
  • bouwadviseurconsultant: 20 samples
  • manager consulting general: 20 samples
  • bedrijfsverpleegkundige: 20 samples
  • do

In [None]:
#Approach2: bert+ data augmentation with gemeni with a simple prompt

import google.generativeai as genai
import google.auth
KEY_FILE_PATH = 'cloudrun_secret.json'
try:
    credentials, project_id = google.auth.load_credentials_from_file(KEY_FILE_PATH)
    genai.configure(credentials=credentials)
except FileNotFoundError:
    print(f"Error: Key file '{KEY_FILE_PATH}' not found.")
except Exception as e:
    print(f"Error loading credentials: {e}")


generation_config = {
    "temperature": 0.7,
    "top_p": 0.95,
    "top_k": 40,
    "max_output_tokens": 150,
}

gen_model = genai.GenerativeModel(
    "gemini-2.0-flash",generation_config=generation_config)

label_counts = df["Reference Job"].value_counts()
minority_labels = label_counts[label_counts < threshold].index.tolist()

print(f"Found {len(minority_labels)} minority labels to augment.")

synthetic_rows = []
for lbl in minority_labels:
  for _ in range(10):
    prompt = (f"Write an approximately 80–100 word job description for the role titled “{lbl}”. "
        "Include key responsibilities, required qualifications, and typical experience.")

    try:
        resp = gen_model.generate_content(prompt, generation_config=generation_config)
        text_content = getattr(resp, "text", "").strip()

        if not text_content:
            retry_config = dict(generation_config)
            retry_config["temperature"] = 0.3
            retry_config["max_output_tokens"] = 120  # shorter safer response
            resp_retry = gen_model.generate_content(prompt, generation_config=retry_config)
            text_content = getattr(resp_retry, "text", "").strip()

        if text_content:
            synthetic_rows.append({
                "text": text_content,
                "Reference Job": lbl
            })
            print(f"Successfully generated data for: {lbl}")
        else:
            print(f" Still no output for '{lbl}' (model refused).")

    except Exception as e:
        print(f" Error for '{lbl}': {e}")


synthetic_df = pd.DataFrame(synthetic_rows)

train_df = pd.DataFrame({
    "text": X_train, 
    "Reference Job": y_train_str })

# Combine original training data with new synthetic data
if not synthetic_df.empty:
    augmented_train_df = pd.concat([train_df, synthetic_df], ignore_index=True)
    print(f"Augmented training data. New size: {len(augmented_train_df)} rows.")
else:
    augmented_train_df = train_df.copy()
    print("No synthetic data was generated. Using original training data.")

augmented_train_df.to_csv("augmented_train_df.csv", index=False)

augmented_train_df["label"] = label_encoder.transform(augmented_train_df["Reference Job"])


def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=256)

train_ds = Dataset.from_pandas(augmented_train_df[["text", "label"]]).map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"])

if 'test_ds' not in locals():
    print("Warning: 'test_ds' is not defined. Evaluation will fail.")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    f1_weighted = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )[2]
    
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision_macro": precision,
        "recall_macro": recall,
        "f1_macro": f1,
        "f1_weighted": f1_weighted,
    }


model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_encoder.classes_) 
)

training_args = TrainingArguments(
    output_dir="./aug_results",
    do_train=True,
    do_eval=True,
    save_strategy="no",
    num_train_epochs=2,
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10000,
    fp16=True,
    seed=42,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics)

trainer.train()

metrics = trainer.evaluate()
print(metrics)

preds_output = trainer.predict(test_ds)
preds = np.argmax(preds_output.predictions, axis=-1)
print(classification_report(preds_output.label_ids,preds,target_names=label_encoder.classes_, zero_division=0))

Found 50 minority labels to augment.
Successfully generated data for: manager consulting strategy transformation
Successfully generated data for: manager consulting strategy transformation
Successfully generated data for: manager consulting strategy transformation
Successfully generated data for: manager consulting strategy transformation
Successfully generated data for: manager consulting strategy transformation
Successfully generated data for: manager consulting strategy transformation
Successfully generated data for: manager consulting strategy transformation
Successfully generated data for: manager consulting strategy transformation
Successfully generated data for: manager consulting strategy transformation
Successfully generated data for: manager consulting strategy transformation
Successfully generated data for: manager consulting tax
Successfully generated data for: manager consulting tax
Successfully generated data for: manager consulting tax
Successfully generated data for: ma

Map:   0%|          | 0/41877 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10000,1.3881


{'eval_loss': 0.2659856379032135, 'eval_accuracy': 0.9687885159138694, 'eval_precision_macro': 0.8378661325736472, 'eval_recall_macro': 0.8485490556298851, 'eval_f1_macro': 0.8370860044336393, 'eval_f1_weighted': 0.9582759527490866, 'eval_runtime': 102.1494, 'eval_samples_per_second': 270.055, 'eval_steps_per_second': 33.764, 'epoch': 2.0}
                                                precision    recall  f1-score   support

                                      aankoper       1.00      1.00      1.00       191
                               account manager       1.00      1.00      1.00       407
                       administratief bediende       1.00      1.00      1.00       557
                                   agile coach       1.00      0.70      0.82        23
                           allround technieker       0.56      1.00      0.72        22
                             analist developer       1.00      1.00      1.00       268
                                      ani

In [None]:
# Approach3: bert+ data augmentation with gemeni with a CoT prompt
few_shot_examples = """
Below are a couple of examples of how to generate job description for a job title job title. Your task is to follow the same logic.
Example 1:
Job Title: 'Senior Software Engineer'

Step-by-step thinking:
- 

Generated Output:
Technology | Backend Development | Expert | Senior | No | Yes, mentoring junior developers | 5 | High – contributes to architecture | No | N/A | Bachelor's Degree | Computer Science | Skill: Java/Spring | Skill: Cloud Platforms (AWS/Azure) | Skill: CI/CD Pipelines | Process: Agile/Scrum | N/A | N/A | 12 | Senior Software Engineer | Engineering

---

Example 2:
Job Title: 'Marketing Manager'

Step-by-step thinking:
- Domain: Sales & Marketing
- Specialisation: Digital Campaigns
- Career level: Manager
- Level title: Manager
- Hierarchical Management: Yes, manages a team of 3
- Functional Management: Yes, leads campaign strategy
- Experience: 4 years
- Innovation: Medium – develops new campaign ideas
- Budget Responsibility: Yes
- Budget Details: 75,000 EUR
- Diploma: Master's Degree in Marketing
- Key Skills: Google Analytics, Salesforce Marketing Cloud, SEO/SEM, Content Marketing
- Internal Job Grade: 11
- Department: Marketing

Generated Output:
Sales & Marketing | Digital Campaign Management | Manager | Manager | Yes, manages a team of 3 | Yes | 4 | Medium – develops new campaign ideas | Yes | 75000 EUR | Master's Degree | Marketing or Business Administration | Tool: Google Analytics | Tool: Salesforce Marketing Cloud | Skill: SEO/SEM Strategy | Skill: Content Marketing | N/A | N/A | 11 | Marketing Manager | Marketing"""


label_counts = df["Reference Job"].value_counts()
minority_labels = label_counts[label_counts < threshold].index.tolist()

# Generate synthetic samples
synthetic_rows = []
for lbl in minority_labels:
   for _ in range(10):
        prompt = (f"{few_shot_examples}\n\n" "Now it's your turn.\n"
    f"Think step-by-step and then generate the structured string "
    f"in the same format for '{lbl}':")
        response = gen_model.generate_content(prompt)
        synthetic_rows.append({
            "text": response.text.strip(),
            "Reference Job": lbl
        })

synthetic_df = pd.DataFrame(synthetic_rows)

# Combine with original training data
train_df = pd.DataFrame({
    "text": X_train.values,
    "Reference Job": y_train_str.values
})
augmented_train_df = pd.concat([train_df, synthetic_df], ignore_index=True)
augmented_train_df.to_csv("augmented_train_few_shot.csv", index=False)

augmented_train_df["label"] = label_encoder.transform(augmented_train_df["Reference Job"])


train_ds = Dataset.from_pandas(augmented_train_df[["text", "label"]]).map(tokenize_fn, batched=True, remove_columns=["text"])

training_args = TrainingArguments(
    output_dir="./aug_results",
    do_train=True,
    do_eval=True,
    save_strategy="no",
    num_train_epochs=2,
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10000,
    fp16=True,
    seed=42,)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,)
trainer.train()

metrics = trainer.evaluate()
print(metrics)

preds_output = trainer.predict(test_ds)
preds = np.argmax(preds_output.predictions, axis=-1)
print(classification_report(
    preds_output.label_ids,
    preds,
    target_names=label_encoder.classes_, 
    zero_division=0))

Map:   0%|          | 0/41877 [00:00<?, ? examples/s]

Step,Training Loss
10000,0.1891


{'eval_loss': 0.08275485783815384, 'eval_accuracy': 0.9776335822518669, 'eval_precision_macro': 0.8847555672977103, 'eval_recall_macro': 0.9006545334570627, 'eval_f1_macro': 0.88852746825425, 'eval_f1_weighted': 0.970158930355125, 'eval_runtime': 102.0926, 'eval_samples_per_second': 270.206, 'eval_steps_per_second': 33.783, 'epoch': 2.0}
                                                precision    recall  f1-score   support

                                      aankoper       1.00      1.00      1.00       191
                               account manager       1.00      1.00      1.00       407
                       administratief bediende       1.00      1.00      1.00       557
                                   agile coach       1.00      1.00      1.00        23
                           allround technieker       0.61      1.00      0.76        22
                             analist developer       1.00      1.00      1.00       268
                                      anima

In [None]:
save_dir = "models/title"  

trainer.save_model(save_dir)

tokenizer.save_pretrained(save_dir)

joblib.dump(label_encoder, f"{save_dir}/label_encoder.pkl")

print(f"Model, tokenizer, and label encoder saved to: {save_dir}/")

NameError: name 'joblib' is not defined