In [1]:
import pandas as pd
!pip install seaborn
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"



In [2]:
output=pd.read_csv('data_with_internal_info.csv', low_memory=False)
rows, columns = output.shape
print("Number of rows:", rows)
print("Number of columns:", columns)

Number of rows: 235277
Number of columns: 60


In [3]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [4]:
import string
import re
import nltk
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download("stopwords")

_stopwords = set(stopwords.words("english"))
_lemmatizer = WordNetLemmatizer()

def text_prep(text_list):
    cleaned = []
    table = str.maketrans('', '', string.punctuation)
    for doc in text_list:
        doc = str(doc).lower().translate(table)
        doc = doc.replace('\n', ' ')
        doc = re.sub(r'[^a-z0-9._\s]', '', doc)
        doc = re.sub(r'\s+', ' ', doc).strip()
        tokens = [tok for tok in doc.split() if tok not in _stopwords and len(tok) > 2]
        lemmas = [_lemmatizer.lemmatize(tok) for tok in tokens]
        cleaned.append(" ".join(lemmas))
    return cleaned
columns_to_combine = [
    'Job Title (en)', 'Job Title (nl)', 'Domain (en)', 'Specialisation (en)', 'Career ladder en',
    'Level Title', 'Function goal', 'Key result areas: result area (1)', 'Key result areas: result area (2)',
    'Key result areas: result area (3)', 'Key result areas: result area (4)', 'Key result areas: result area (5)',
    'Key result areas: result area (6)', 'Key result areas: result area (7)', 'Key result areas: result area (8)',
    'Key result areas: result area (9)', 'Key result areas: result area (10)', 'Key result areas: result area (11)',
    'Key result areas: result area (12)', 'Key result areas: result area (13)', 'Key result areas: result area (14)',
    'Key result areas: result area (15)', 'HIERARCHIC MANAGEMENT', 'FUNCTIONAL MANAGEMENT',
    'Leadership | Manage the following job(s)', 'Managed by',
    'Is he/ she responsible for a certain budget/ figure?', 'Specify the budget amounts.', 'Diploma',
    'Speciality', 'Required experience', 'Innovation', 'Distinguishing factors | Row 1 - Column 1',
    'Row 2 - Column 1', 'Row 3 - Column 1', 'Row 4 - Column 1', 'Row 5 - Column 1', 'Row 6 - Column 1',
    'Row 7 - Column 1', 'Row 8 - Column 1', 'Row 9 - Column 1', 'Compas Grade', 'Reference Job', 'Age',
     'Number of employees in Belgium', 'Job Grade Reference Job', 'Diploma Category',
    'Job Level', 'Internal Job', 'Internal Job Grade', 'Department'
]
output[columns_to_combine] = output[columns_to_combine].astype(str)
output['Yearly Gross Base Salary'] = pd.to_numeric(output['Yearly Gross Base Salary'], errors='coerce')

text_combined = output[columns_to_combine].agg(' '.join, axis=1).tolist()
text_cleaned = text_prep(text_combined)

# Text pipeline: TF-IDF + SVD
text_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(
        ngram_range=(1,2),
        max_features=2000,
        sublinear_tf=True,
        smooth_idf=True,
        stop_words='english',
        tokenizer=lambda x: x.split(),
        preprocessor=lambda x: x
    )),
    ('svd', TruncatedSVD(n_components=100, random_state=11))
])

X_text = text_pipe.fit_transform(text_cleaned)
X_text = pd.DataFrame(X_text, index=output.index)
X = X_text

y = output['Yearly Gross Base Salary'].fillna(output['Yearly Gross Base Salary'].mean())

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=11
)

# XGBoost regressor
model = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.05,
    objective='reg:squarederror',
    random_state=11
)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)


print(" Evaluation for Salary prediction(TF-IDF + SVD + XGBoost)")
print(f"MAE:  {mean_absolute_error(y_test, y_pred):.4f}")
print(f"RMSE: {mean_squared_error(y_test, y_pred) ** 0.5:.4f}")
print(f"R²:   {r2_score(y_test, y_pred):.4f}")
print(f"MAPE: {mean_absolute_percentage_error(y_test, y_pred):.4f}\n")

[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


 Evaluation for Salary prediction(TF-IDF + SVD + XGBoost)
MAE:  1983.0597
RMSE: 6832.0659
R²:   0.0815
MAPE: 0.0301



In [5]:
from sklearn.model_selection import GridSearchCV, train_test_split
import xgboost as xgb
model_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',             
    verbosity=1,
    random_state=11)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 1.0]
}
grid_search = GridSearchCV(
    model_xgb,
    param_grid,
    cv=3,
    scoring='neg_mean_absolute_error',
    n_jobs=1,
    verbose=2
    
)
grid_search.fit(X_train, y_train)
print("Best hyperparameters for XGBoost:", grid_search.best_params_)
best_xgb = grid_search.best_estimator_

y_true = y_test.values.flatten()
y_pred = best_xgb.predict(X_test)

print("\n--- Evaluation for Salary prediction with XGBoost (tuned) ---")
print("  MAE: ", mean_absolute_error(y_true, y_pred))
print("  RMSE:", mean_squared_error(y_true, y_pred)**0.5)
print("  R²:  ", r2_score(y_true, y_pred))
print("  MAPE:", mean_absolute_percentage_error(y_true, y_pred))

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.6; total time=   0.6s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.6; total time=   0.8s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.6; total time=   0.7s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.6s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.7s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.6s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.6; total time=   1.2s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.6; total time=   1.0s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.6; total time=   1.2s
[CV] END learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0; total time=   1.0s

In [6]:
#Random Forest
from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=11,
    n_jobs=-1
)
X_train.columns = X_train.columns.astype(str)
X_test .columns = X_test .columns.astype(str)
model_rf.fit(X_train, y_train)

y_pred_rf = model_rf.predict(X_test)


# Evaluation
print("\n Evaluation for SALARY prediction (Random Forest regressors with TF-IDF)")
print("  MAE:", mean_absolute_error(y_test, y_pred_rf))
print("  RMSE:", mean_squared_error(y_test, y_pred_rf) ** 0.5)
print("  R²:", r2_score(y_test, y_pred_rf))
print("  MAPE:", mean_absolute_percentage_error(y_test, y_pred_rf))


 Evaluation for SALARY prediction (Random Forest regressors with TF-IDF)
  MAE: 1868.710950102236
  RMSE: 6880.342796378206
  R²: 0.06848426260917484
  MAPE: 0.02849770239952594


In [7]:
#fine-tuning RF with GridSearchCV

param_grid_rf = {
    'n_estimators': [50, 100],
    'max_depth':    [ 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf':  [1, 2],
    'max_features': ['log2', 'sqrt']}

grid_search_rf = GridSearchCV(
    estimator=model_rf,
    param_grid=param_grid_rf,
    cv=3,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=2)


grid_search_rf.fit(X_train, y_train)


best_rf = grid_search_rf.best_estimator_
print("Best hyperparameters for RandomForest:", grid_search_rf.best_params_)


y_pred_rf = best_rf.predict(X_test)

print("\nEvaluation for Salary prediction (Random Forest with GridSearch)")
print("  MAE: ", mean_absolute_error(y_test, y_pred_rf))
print("  RMSE:", mean_squared_error(y_test, y_pred_rf) ** 0.5)
print("  R²:  ", r2_score(y_test, y_pred_rf))
print("  MAPE:", mean_absolute_percentage_error(y_test, y_pred_rf))

Fitting 3 folds for each of 32 candidates, totalling 96 fits
Best hyperparameters for RandomForest: {'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

Evaluation for Salary prediction (Random Forest with GridSearch)
  MAE:  1688.6755485620752
  RMSE: 6905.437623596581
  R²:   0.061676794583432115
  MAPE: 0.025944877729786407
