In [None]:
import pandas as pd
import numpy as np
import re
import string
# Load the HR export into a DataFrame and report its dimensions.
hr_data = pd.read_csv('data_with_internal_info.csv', low_memory=False)
rows = hr_data.shape[0]
columns = hr_data.shape[1]
print("Number of rows:", rows)
print("Number of columns:", columns)

Number of rows: 235277
Number of columns: 60


In [None]:
# Sentence-embedding features + RandomForestRegressor baseline
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error)


def text_prep(text):
    """Normalize raw text into lowercase, ASCII-alphanumeric, single-spaced strings.

    Each item is converted with ``str()``, lowercased, stripped of every
    character in ``string.punctuation``, stripped of anything that is not an
    ASCII letter, digit or whitespace, and finally has runs of whitespace
    collapsed to a single space with leading/trailing whitespace removed.

    Args:
        text: An iterable of values (each is passed through ``str()``), or a
            single string. A single string is treated as one document rather
            than being iterated character by character.

    Returns:
        list[str]: One cleaned string per input item, in input order.
    """
    # Guard: iterating a bare str would clean each character separately.
    if isinstance(text, str):
        text = [text]

    punctuation_table = str.maketrans('', '', string.punctuation)

    def _clean(value):
        # Punctuation is deleted, not replaced by a space, so "a/b" becomes "ab".
        cleaned = str(value).lower().translate(punctuation_table)
        cleaned = cleaned.replace('\n', ' ')
        # '.' and '_' were already removed above; in practice this pass drops
        # the remaining non-ASCII characters (e.g. accented letters).
        cleaned = re.sub(r'[^a-zA-Z0-9._\s]', '', cleaned)
        return re.sub(r'\s+', ' ', cleaned).strip()

    # Single pass over the input instead of one full list rebuild per step.
    return [_clean(value) for value in text]


# Columns whose text is concatenated, in this order, into one document per row.
_result_area_columns = [
    f'Key result areas: result area ({idx})' for idx in range(1, 12)
]
_grid_row_columns = [f'Row {idx} - Column 1' for idx in range(4, 9)]

columns_to_combine = [
    'Job Title (en)',
    'Job Title (nl)',
    'Level Title',
    'Function goal',
    *_result_area_columns,
    'Specify the budget amounts.',
    'Diploma Category',
    'Speciality',
    'Required experience',
    'Innovation',
    *_grid_row_columns,
    'Internal Job',
    'Internal Job Grade',
    'Department',
]


# Build one text document per row from the selected columns.
# Missing cells are blanked before the string cast: a bare astype(str) renders
# NaN as the literal text "nan", which would then be embedded as if it were a
# real word in every row with a missing field. hr_data is still updated in
# place so these columns remain str for later cells.
hr_data[columns_to_combine] = hr_data[columns_to_combine].fillna('').astype(str)

# Join the columns with single spaces; text_prep collapses the extra spaces
# left behind by blank cells.
text_combined = hr_data[columns_to_combine].agg(' '.join, axis=1).tolist()
text_cleaned = text_prep(text_combined)


# Encode every cleaned document with the 'all-MiniLM-L6-v2' sentence-transformer.
# This is the slow step of the notebook: the recorded run shows roughly 2h15
# for 460 batches.
model_Sen = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model_Sen.encode(
    text_cleaned, batch_size=512, show_progress_bar=True, convert_to_numpy=True
)

# Rows whose 'Compas Grade' is missing are dropped instead of being filled with
# the column mean. That mean was computed over the whole dataset before the
# split, so the imputed labels leaked test-set information into training and
# the metrics were partly computed against fabricated targets.
# When no target is missing, the resulting split is identical to before.
has_target = hr_data['Compas Grade'].notna().to_numpy()

# One row of embedding dimensions per record; the index keeps the original
# row position so predictions can still be traced back to hr_data rows.
X = pd.DataFrame(embeddings).loc[has_target]
y = hr_data.loc[has_target, ['Compas Grade']].copy()


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=11
)


# Flatten the single-column targets to 1-D arrays for fit() and the metrics.
y_train = y_train.to_numpy().ravel()
y_test = y_test.to_numpy().ravel()


# Baseline: a shallow random forest (100 trees, depth 5) on the embedding
# features. fit() returns the estimator itself, so it can be chained.
model_rf = RandomForestRegressor(
    n_estimators=100, max_depth=5, random_state=11, n_jobs=-1
).fit(X_train, y_train)

# Predictions on the held-out split.
y_pred = model_rf.predict(X_test)


# Report the baseline regression metrics on the held-out split.
print("\n--- Compas Grade Prediction ---")
baseline_scores = (
    ("MAE:", mean_absolute_error(y_test, y_pred)),
    ("RMSE:", mean_squared_error(y_test, y_pred) ** 0.5),  # root of the MSE
    ("R2:", r2_score(y_test, y_pred)),
    ("MAPE:", mean_absolute_percentage_error(y_test, y_pred)),
)
for label, score in baseline_scores:
    print(label, score)

Batches: 100%|██████████| 460/460 [2:15:49<00:00, 17.72s/it]  



--- Compas Grade Prediction ---
MAE: 0.8325540756101526
RMSE: 1.1319117374892878
R2: 0.8473966701309117
MAPE: 0.08042440051375041


In [None]:
# Hyperparameter tuning of RandomForestRegressor with GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Cast the column labels of both splits to strings, in place.
for split_frame in (X_train, X_test):
    split_frame.columns = split_frame.columns.astype(str)

# Search space for the random-forest grid search:
# two values per parameter, 2**5 = 32 candidate combinations.
param_grid_rf = dict(
    n_estimators=[50, 100],
    max_depth=[5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
)

# Exhaustive 3-fold search over param_grid_rf, scored by negated MAE.
# Parallelism is applied at one level only: the search distributes candidate
# fits across all cores (n_jobs=-1) while each forest is built with n_jobs=1.
# Requesting all cores at both levels oversubscribes the CPU (many more
# workers than cores) and slows the search down.
# Note: best_estimator_ inherits n_jobs=1, so the final refit and predict()
# run single-threaded; call best_estimator_.set_params(n_jobs=-1) if needed.
grid_search_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=11, n_jobs=1),
    param_grid=param_grid_rf,
    cv=3,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=2
)


grid_search_rf.fit(X_train, y_train.ravel())

print("Best hyperparameters for RandomForest:", grid_search_rf.best_params_)

# Estimator refit by the search using the best parameter combination.
best_rf = grid_search_rf.best_estimator_

# Score the tuned model on the held-out split.
y_true = y_test.ravel()
y_pred = best_rf.predict(X_test)

print("\n--- Evaluation for Compas Grade (RF tuned) ---")
tuned_scores = (
    ("  MAE: ", mean_absolute_error(y_true, y_pred)),
    ("  RMSE:", mean_squared_error(y_true, y_pred)**0.5),  # root of the MSE
    ("  R²:  ", r2_score(y_true, y_pred)),
    ("  MAPE:", mean_absolute_percentage_error(y_true, y_pred)),
)
for label, score in tuned_scores:
    print(label, score)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
Best hyperparameters for RandomForest: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}

--- Evaluation for Compas Grade (RF tuned) ---
  MAE:  0.3971869129601077
  RMSE: 0.5488356296946326
  R²:   0.9641223921783624
  MAPE: 0.037538573378889196
