In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


data = pd.read_csv('Data/balanced_resumes.csv')

# drop rows with missing values in text columns
text_columns = ['Name', 'Education', 'Work Experience', 'Skills', 'Awards']
data = data.dropna(subset=text_columns)

# extract text data and labels
X_text = data['Name'] + ' ' + data['Education'] + ' ' + data['Work Experience'] + ' ' + data['Skills'] + ' ' + data['Awards']
y = data['Fit']
gender = data['Gender']
ethnicity = data ['Ethnicity']

# vectorize the text data with TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_text_vec = vectorizer.fit_transform(X_text)


# split data
X_train, X_test, y_train, y_test, gender_train, gender_test, ethnicity_train, ethnicity_test = train_test_split(X_text_vec, y, gender, ethnicity, test_size=0.2, random_state=42)


In [2]:
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

svm_classifier = SVC(kernel='linear', C=1.7) 
svm_classifier.fit(X_train, y_train)

# predict on test set
y_pred_svm = svm_classifier.predict(X_test)

# nr of support vectors
n_support_vectors = len(svm_classifier.support_)

# nr of features per support vector
n_features = X_train.shape[1]

# total number of parameters = (number of support vectors * number of features) + bias term
total_params = n_support_vectors * n_features + len(svm_classifier.classes_)

print(f"Number of support vectors: {n_support_vectors}")
print(f"Number of features: {n_features}")
print(f"Total number of trainable parameters: {total_params}")


Number of support vectors: 957
Number of features: 3893
Total number of trainable parameters: 3725606


In [5]:
# train the Logistic Regression classifier

logistic_classifier = LogisticRegression(max_iter=1000)
logistic_classifier.fit(X_train, y_train)

# predict on test set
y_pred_lr = logistic_classifier.predict(X_test)

# nr of features (terms in TF-IDF)
n_features = X_train.shape[1]

# nr of parameters (coefficients + intercept)
total_params = n_features + 1

print(f"Number of features: {n_features}")
print(f"Total number of trainable parameters: {total_params}")

Number of features: 3893
Total number of trainable parameters: 3894


In [6]:
# train Random Forest

rf_classifier = RandomForestClassifier(n_estimators=90)
rf_classifier.fit(X_train, y_train)

# predict on test set
y_pred_rf = rf_classifier.predict(X_test)

num_features = X_train.shape[1]
n_estimators = 90
max_depth = 54

# Approximate calculation of trainable parameters
approx_params_per_tree = 2 * max_depth * num_features  # Assuming a rough estimate
total_trainable_params = n_estimators * approx_params_per_tree

print(f"Number of estimators (trees) in the forest: {n_estimators}")
print(f"Maximum depth of trees: {max_depth}")
print(f"Approximate trainable parameters in the Random Forest: {total_trainable_params}")



Number of estimators (trees) in the forest: 90
Maximum depth of trees: 54
Approximate trainable parameters in the Random Forest: 37839960


In [10]:
# train Gradient Boosting classifier

gb_classifier = GradientBoostingClassifier(n_estimators=90)  
gb_classifier.fit(X_train, y_train)

# predict on test set
y_pred_gb = gb_classifier.predict(X_test)

# estimating trainable parameters
num_features = X_train.shape[1]
n_estimators = gb_classifier.n_estimators_
max_depth = gb_classifier.max_depth

# estimate of params per tree
params_per_tree = 1000 

total_trainable_params = n_estimators * params_per_tree


print(f"Number of estimators (trees) in the ensemble: {n_estimators}")
print(f"Maximum depth of trees: {max_depth}")
print(f"Approximate trainable parameters in the Gradient Boosting model: {total_trainable_params}")


Number of estimators (trees) in the ensemble: 90
Maximum depth of trees: 3
Approximate trainable parameters in the Gradient Boosting model: 90000
