In [1]:
import pandas as pd
data=pd.read_csv('data_with_internal_info.csv', low_memory=False)

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import re
import string
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from collections import Counter
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from transformers import AutoTokenizer, AutoModel
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import joblib

nltk.download("wordnet")
def text_prep(text):
    text = [str(t).lower() for t in text]
    table = str.maketrans('', '', string.punctuation)
    text = [t.translate(table) for t in text]
    # text = [re.sub(r'\d+', 'num', t) for t in text]
    text = [t.replace('\n', ' ') for t in text]
    text = [re.sub(r'[^a-zA-Z0-9._\s]', '', t) for t in text]
    text = [re.sub(r'\s+', ' ', t).strip() for t in text]
    return text

columns_to_combine = [
    'Job Title (en)', 'Job Title (nl)', 'Level Title', 'Function goal', 
    'Key result areas: result area (1)', 'Key result areas: result area (2)',
    'Key result areas: result area (3)', 'Key result areas: result area (4)',
    'Key result areas: result area (5)', 'Key result areas: result area (6)',
    'Key result areas: result area (7)', 'Key result areas: result area (8)',
    'Key result areas: result area (9)', 'Key result areas: result area (10)',
    'Key result areas: result area (11)', 'Specify the budget amounts.', 
    'Diploma Category', 'Speciality', 'Required experience', 'Innovation', 
    'Row 4 - Column 1', 'Row 5 - Column 1',
    'Row 6 - Column 1', 'Row 7 - Column 1', 'Row 8 - Column 1','Internal Job', 'Internal Job Grade', 'Department'
]

[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
from tqdm import tqdm
from transformers import AutoTokenizer
from sklearn.preprocessing import StandardScaler
os.environ["TOKENIZERS_PARALLELISM"] = "false"

data['Yearly Gross Base Salary'] = (
    data['Yearly Gross Base Salary']
        .astype(str)
        .str.replace(',', '.', regex=False))
data['Yearly Gross Base Salary'] = pd.to_numeric(
    data['Yearly Gross Base Salary'],
    errors='coerce')
data['Yearly Gross Base Salary'].fillna(
    data['Yearly Gross Base Salary'].mean())

data = data.dropna(subset=['Compas Grade', 'Yearly Gross Base Salary'])

y_grade  = data['Compas Grade'].astype(float).values.reshape(-1, 1)
y_salary = data['Yearly Gross Base Salary'].values.reshape(-1, 1)

scaler_salary   = StandardScaler()
y_salary_scaled = scaler_salary.fit_transform(y_salary)

y = np.hstack([y_grade, y_salary_scaled])

data[columns_to_combine] = data[columns_to_combine].astype(str)
text_combined           = data[columns_to_combine].agg(' '.join, axis=1).tolist()
text_cleaned            = text_prep(text_combined)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

toks = tokenizer(
    text_cleaned,
    padding='max_length',
    truncation=True,
    max_length=16,
    return_tensors='np'
)

input_ids = toks["input_ids"]
attention_mask = toks["attention_mask"]

np.savez(
    "tokenized_data.npz",
    input_ids=input_ids,
    attention_mask=attention_mask,
    labels=y)
joblib.dump(scaler_salary, "scaler_salary.pkl")

['scaler_salary.pkl']

In [4]:
arr = np.load("tokenized_data.npz")
input_ids      = arr['input_ids']
attention_mask = arr['attention_mask']
labels_all     = arr['labels']           
scaler_salary  = joblib.load("scaler_salary.pkl")


grades = labels_all[:,0].astype(int)
X_ids_train, X_ids_test, X_mask_train, X_mask_test, \
y_train,     y_test = train_test_split(
    input_ids,
    attention_mask,
    labels_all,
    test_size=0.2,
    random_state=42,
    stratify=grades
)

class JobDataset(Dataset):
    def __init__(self, ids, masks, labels):
        self.ids    = ids
        self.masks  = masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids':      torch.tensor(self.ids[idx],   dtype=torch.long),
            'attention_mask': torch.tensor(self.masks[idx], dtype=torch.long),
            'labels':         torch.tensor(self.labels[idx],dtype=torch.float)
        }

train_ds = JobDataset(X_ids_train, X_mask_train, y_train)
test_ds  = JobDataset(X_ids_test,  X_mask_test,  y_test)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True,  num_workers=2)
test_loader  = DataLoader(test_ds,  batch_size=64, shuffle=False, num_workers=2)

model_name = "distilbert-base-uncased"
bert_back  = AutoModel.from_pretrained(model_name)

class BertRegressor(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert      = bert_back
        self.dropout   = nn.Dropout(0.2)
        self.regressor = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        out  = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls  = out.last_hidden_state[:,0,:]
        x    = self.dropout(cls)
        return self.regressor(x) 


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model  = BertRegressor().to(device)  

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn   = nn.MSELoss()


model.train()
for epoch in range(3):
    total_loss = 0
   
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training", unit="batch"):
        ids   = batch['input_ids'].to(device)
        mask  = batch['attention_mask'].to(device)
        labs  = batch['labels'].to(device)

        optimizer.zero_grad()
        preds = model(ids, mask)
        loss  = loss_fn(preds, labs)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} — Train Loss: {avg_loss:.4f}")

model.eval()
all_preds, all_true = [], []
for batch in tqdm(test_loader, desc="Evaluating", unit="batch"):
    with torch.no_grad():
        ids  = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        labs = batch['labels'].cpu().numpy()

        out  = model(ids, mask).cpu().numpy()
        all_true.append(labs)
        all_preds.append(out)

all_true  = np.vstack(all_true)
all_preds = np.vstack(all_preds)

grade_true, salary_true_s = all_true[:,0], all_true[:,1]
grade_pred, salary_pred_s = all_preds[:,0], all_preds[:,1]

salary_true = scaler_salary.inverse_transform(salary_true_s.reshape(-1,1)).ravel()
salary_pred = scaler_salary.inverse_transform(salary_pred_s.reshape(-1,1)).ravel()

def print_metrics(y_t, y_p, name):
    print(f"\n=== {name} ===")
    print("MAE: ", mean_absolute_error(y_t, y_p))
    print("RMSE:", mean_squared_error(y_t, y_p)** 0.5)
    print("R2:  ", r2_score(y_t, y_p))
    print("MAPE:", mean_absolute_percentage_error(y_t, y_p))
print_metrics(grade_true,  grade_pred,  "Compas Grade")
print_metrics(salary_true, salary_pred, "Yearly Salary (EUR)")

2025-11-07 19:24:49.057278: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762543489.079979  262681 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762543489.088267  262681 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1762543489.110125  262681 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1762543489.110156  262681 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1762543489.110158  262681 computation_placer.cc:177] computation placer alr

Epoch 1 — Train Loss: 1.2743


Epoch 2 Training: 100%|██████████| 2941/2941 [27:22<00:00,  1.79batch/s]


Epoch 2 — Train Loss: 0.4737


Epoch 3 Training: 100%|██████████| 2941/2941 [27:25<00:00,  1.79batch/s]


Epoch 3 — Train Loss: 0.4569


Evaluating: 100%|██████████| 736/736 [01:39<00:00,  7.36batch/s]


=== Compas Grade ===
MAE:  0.521529495716095
RMSE: 0.6564356677485514
R2:   0.9488176703453064
MAPE: 0.05226004123687744

=== Yearly Salary (EUR) ===
MAE:  11539.0478515625
RMSE: 15679.535707411747
R2:   0.5710670351982117
MAPE: 0.18734505772590637





In [5]:
sample_job_description=""" 
Working at Randstad is unlike working at any organization. Because at Randstad we put people at the heart of everything we do. This goes for our clients, our talent, our employees and society. We combine our passion for people with the power of today’s technologies. This helps us support people and organizations in realizing their true potential. Learn more about our mission, history and values on our website: www.randstad.com
how you will contribute :

As a Machine Learning Engineer / Data Scientist you will create brand new insights into the global world of work. We use the latest ML and AI techniques to deliver highly accurate models for supporting a variety of business applications and processes, including market demand, talent supply, career patterns, salaries and rates, and pricing.

The focus in this role will be working on our global taxonomy program, that is supporting our systems landscape worldwide. In this program we are creating and maintaining information on all kinds of vital elementary data concepts, such as jobs, occupations, skills, certificates, education and more. We are using the latest ML technology and LLMs to create our models, and present the results via ML pipelines with a balanced set of microservices.

You will be a part of an international and agile team of highly skilled specialists, including data scientists, ML engineers, data engineers, and data analysts. We have a major research background and are publishing some of our greatest work in academic papers.

You will work closely with other colleagues within the Randstad ecosystem, especially in the various global projects. The impact of your work depends not only on the quality of your products, but especially on your ability to communicate the value and proof of concept of your models and analyses.

what you will be doing 

building Machine Learning (ML) models: preparing data from a wide variety of internal and external data sources; creating and evaluating ML models, primarily in the NLP domain, based a.o. on open source LLMs and GenAI; 

● building ML pipelines that automate the training and deployment of models

● defining and appropriating work to the product backlog

● training and coaching junior team members

● collaborating within the data science community within the Randstad ecosystem, on sharing best practices, harmonizing ML models, and grow collective data science skills
who will you work with 

Our global data science team helps our customers make better hiring decisions and helps talent advance in their career paths. We help people find work by creating new insights into the dynamics of the global labor market and integrate those insights into customer- and talent-focused data products. We build and deploy machine learning models and tell stories with those insights. As an example - we create clarity where there are gaps in diversity, what a talent's next job might be, how market conditions should influence a hiring strategy, where we predict demand will go, and much more.

what you will bring 

● Masters degree in Computer science / Data Science, or related technical studies (M.Sc. or equivalent experience)
● min. 2 to 3 years work experience as a ML engineer or data scientist with strong engineering skills
● machine learning (ML) platform: excellent working knowledge of analytical, AI and ML stacks on cloud platforms, preferably on Google Cloud Platform (incl. BigQuery, AI & ML stack) or comparable platforms
● coding: high proficiency in Python (incl. relevant libraries and packages) and SQL
● ML engineering: experience with the creation of ML pipelines and ML Ops related techniques
● ML modeling: relevant experience with NLP, LLMs, and working with the set of tools needed to build models on large and complex datasets (preferably in applied and research setting)

what’s in it for you 

We put people at the heart of everything we do. Our employment conditions reflect this;

● competitive salary
● an additional flexible benefit budget
● attractive bonus scheme
● option to take part in Randstad's success through our employee Share Purchase Plan
● option to go on a sabbatical leave or buy extra holidays
● flexible working hours in the office or hybrid]
● attractive mobility arrangements
● opportunity to give back to local communities through paid volunteering leave
● a generous budget to set up your home office space and a net internet allowance]

● growth and development opportunities in a fast changing global environment

● working with great people and being part of a network where everyone wants you to succeed

● an environment where differences are understood, valued and celebrated

We want our teams and talent to reflect the rich diversity of the societies we serve. We thrive for an environment of belonging, safety and confidence. So everyone can bring their whole selves to work and flourish. Learn more about equity, diversity, inclusion and belonging at randstad here.

If you recognize yourself in the profile above, we invite you to apply for this role.

For more information you can reach out to our recruitment business partner Jenny Roberts .

The recruitment procedure consists of a screening and at least two interviews. Later in the process, an (online) assessment and a job offer conversation take place

"""

In [6]:
# 1) Preprocess
clean_text = text_prep([sample_job_description])

# 2) Tokenize
tok = tokenizer(
    clean_text,
    padding='max_length',
    truncation=True,
    max_length=16,   # must match training
    return_tensors='pt'
)

ids  = tok.input_ids.to(device)
mask = tok.attention_mask.to(device)

# 3) Predict with BERT regressor
model.eval()
with torch.no_grad():
    pred = model(ids, mask).cpu().numpy()[0]

# Outputs: [grade, salary_scaled]
grade_pred  = pred[0]
salary_pred_s = pred[1]

# 4) Convert salary back
salary_pred = scaler_salary.inverse_transform([[salary_pred_s]])[0][0]

print("\n=== Prediction ===")
print("Estimated Compas Grade:", round(grade_pred, 2))
print("Estimated Salary (EUR):", int(salary_pred))



=== Prediction ===
Estimated Compas Grade: 10.67
Estimated Salary (EUR): 61734


In [None]:
import os, csv, datetime as dt

salary_eur   = int(salary_pred)
compas_grade = float(round(grade_pred, 2))

def ask_and_save(salary, grade, job_desc, out_path="outputs/prediction_feedback.csv"):
    while True:
        try:
            resp = input("Do you accept the result? [y/n]: ").strip().lower()
        except EOFError:
            print("No input available (non-interactive). Skipping save.")
            return
        if resp in ("y", "yes", "n", "no"):
            break
        print("Please answer with y/yes or n/no.")

    if resp.startswith("n"):
        folder = os.path.dirname(out_path)
        if folder:
            os.makedirs(folder, exist_ok=True)

        write_header = not os.path.exists(out_path)
        with open(out_path, "a", newline="", encoding="utf-8") as f:
            w = csv.DictWriter(f, fieldnames=["timestamp","salary_eur","compas_grade","job_description"])
            if write_header:
                w.writeheader()
            w.writerow({
                "timestamp": dt.datetime.now().isoformat(timespec="seconds"),
                "salary_eur": salary,
                "compas_grade": grade,
                "job_description": job_desc
            })
        print(f"Saved to {out_path}")
    else:
        print("Not saved.")

ask_and_save(salary_eur, compas_grade, sample_job_description)
