In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import torch
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Download necessary NLTK resources
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# Load dataset
df = pd.read_csv("../data/job_applicant_dataset.csv")

stem_reference = ["algorithm", "neural", "processor", "dataset", "AI", "ML", "deep", "network",
                  "programming", "software", "hardware", "databases", "cybersecurity", "engineering",
                  "science", "physics", "mathematics", "robotics", "biotechnology", "statistics",
                  "machine learning", "data science", "cloud computing", "computer vision", "natural language processing",
                  "big data", "automation", "genomics", "bioinformatics", "quantum computing", "electrical engineering"]

non_stem_reference = ["management", "leadership", "marketing", "sales", "communication", "teamwork",
                       "customer", "business", "strategy", "collaboration", "writing", "counseling",
                       "creativity", "organization", "planning", "support", "human resources",
                       "social work", "public relations", "event planning", "journalism", "advertising",
                       "psychology", "education", "training", "hospitality", "tourism", "real estate"]

# Stoplist to filter out common words that don't provide meaningful classification
stoplist = {"market", "spaces", "directly", "closely", "feedback", "user", "human", "develops", "developed", "capacity"}


def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z ]', '', text)
    words = re.split(r'\s+', text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Extract text data
resumes = df['Resume'].apply(preprocess).tolist()
job_descriptions = df['Job Description'].apply(preprocess).tolist()
all_texts = resumes + job_descriptions

# Load BERT model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")


def get_embeddings_batch(model, tokenizer, text_list, batch_size=32):
    if not text_list:
        return np.array([])  # Return empty array if text list is empty
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id
    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())
    return np.vstack(embeddings) if embeddings else np.array([])

# Extract keywords using TF-IDF before classification
def extract_top_keywords(text_list, max_features=2000):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(text_list)
    feature_names = vectorizer.get_feature_names_out()
    return [word for word in feature_names if word not in stoplist]

# Extract refined keywords
top_keywords = extract_top_keywords(resumes + job_descriptions, max_features=2000)

# Extract keywords using BERT embeddings + cosine similarity classification
def extract_keywords_with_similarity(keywords, num_keywords=2000, similarity_threshold=0.6):
    word_embeddings = get_embeddings_batch(model, tokenizer, keywords)

    # Get reference embeddings
    stem_embeddings = get_embeddings_batch(model, tokenizer, stem_reference)
    non_stem_embeddings = get_embeddings_batch(model, tokenizer, non_stem_reference)

    def avg_similarity(word_emb, reference_emb):
        return np.mean([1 - cosine(word_emb, ref_emb) for ref_emb in reference_emb])

    stem_words, non_stem_words = [], []
    for word, emb in zip(keywords, word_embeddings):
        if emb.size == 0:
            continue  # Skip words without embeddings
        stem_sim = avg_similarity(emb, stem_embeddings)
        non_stem_sim = avg_similarity(emb, non_stem_embeddings)

        if max(stem_sim, non_stem_sim) < similarity_threshold:
            continue  # Skip words that don't strongly associate with either category

        if stem_sim > non_stem_sim:
            stem_words.append(word)
        else:
            non_stem_words.append(word)

    return stem_words[:num_keywords], non_stem_words[:num_keywords]

stem_keywords, non_stem_keywords = extract_keywords_with_similarity(top_keywords, num_keywords=500)

# Print top words from extracted word sets
print("Top STEM Words:", stem_keywords[:20])
print("Top Non-STEM Words:", non_stem_keywords[:20])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\souji\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\souji\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  torch.utils._pytree._register_pytree_node(


Top STEM Words: ['actively', 'adobe', 'architectural', 'artificial', 'aspect', 'basic', 'biological', 'biology', 'biomedical', 'biotech', 'cdmp', 'chemistry', 'circuitry', 'cm', 'component', 'computer', 'computing', 'consultative', 'continuous', 'continuously']
Top Non-STEM Words: ['ability', 'academic', 'academy', 'access', 'accessible', 'accountant', 'accounting', 'accredited', 'accurate', 'ace', 'achieve', 'achieved', 'actionable', 'active', 'acvs', 'adapt', 'adapting', 'addiction', 'additionally', 'address']


In [2]:
# Gender Bias Detection
male_names = df[df['Gender'] == 'Male']['Job Applicant Name'].str.split().str[0].unique().tolist()
female_names = df[df['Gender'] == 'Female']['Job Applicant Name'].str.split().str[0].unique().tolist()
stem_words, non_stem_words = stem_keywords, non_stem_keywords

male_embeddings = get_embeddings_batch(model, tokenizer, male_names)
female_embeddings = get_embeddings_batch(model, tokenizer, female_names)
stem_embeddings = get_embeddings_batch(model, tokenizer, stem_words)
non_stem_embeddings = get_embeddings_batch(model, tokenizer, non_stem_words)

def weat_effect_size_fast(w, a, b, s_x, s_y):
    w_a = np.mean([1 - cosine(w, x) for x in a])
    w_b = np.mean([1 - cosine(w, x) for x in b])
    s_all = np.vstack((s_x, s_y))
    return (w_a - w_b) / np.std([1 - cosine(w, x) for x in s_all])

# Compute WEAT effect sizes for gender bias
stem_male_effect_size = weat_effect_size_fast(np.mean(stem_embeddings, axis=0), male_embeddings, female_embeddings, stem_embeddings, non_stem_embeddings)
stem_female_effect_size = weat_effect_size_fast(np.mean(stem_embeddings, axis=0), female_embeddings, male_embeddings, stem_embeddings, non_stem_embeddings)

non_stem_male_effect_size = weat_effect_size_fast(np.mean(non_stem_embeddings, axis=0), male_embeddings, female_embeddings, non_stem_embeddings, stem_embeddings)
non_stem_female_effect_size = weat_effect_size_fast(np.mean(non_stem_embeddings, axis=0), female_embeddings, male_embeddings, non_stem_embeddings, stem_embeddings)



print(f"WEAT Effect Size (STEM vs. Male): {stem_male_effect_size}")
print(f"WEAT Effect Size (STEM vs. Female): {stem_female_effect_size}")

print(f"WEAT Effect Size (Non-STEM vs. Male): {non_stem_male_effect_size}")
print(f"WEAT Effect Size (Non-STEM vs. Female): {non_stem_female_effect_size}")

WEAT Effect Size (STEM vs. Male): -0.28147757223393344
WEAT Effect Size (STEM vs. Female): 0.28147757223393344
WEAT Effect Size (Non-STEM vs. Male): -0.4041325215060944
WEAT Effect Size (Non-STEM vs. Female): 0.4041325215060944


In [3]:
#First Name Bias (Race Associations)
# Extract first names dynamically
first_names = df['Job Applicant Name'].dropna().str.split().str[0].unique().tolist()

# First Name Bias Detection (Race & Age)
def compute_weat_if_valid(group_a, group_b, s_x, s_y, label):
    if group_a.size == 0 or group_b.size == 0:
        print(f"Warning: {label} has empty embeddings. Skipping WEAT computation.")
        return np.nan
    return weat_effect_size_fast(np.mean(s_x, axis=0), group_a, group_b, s_x, s_y)

# Categorize names by race
white_names = df[df['Race'] == 'White/Caucasian']['Job Applicant Name'].dropna().str.split().str[0].unique().tolist()
black_names = df[df['Race'] == 'Negroid/Black']['Job Applicant Name'].dropna().str.split().str[0].unique().tolist()
asian_names = df[df['Race'] == 'Mongoloid/Asian']['Job Applicant Name'].dropna().str.split().str[0].unique().tolist()

#Compute embeddings
white_embeddings = get_embeddings_batch(model, tokenizer, white_names)
black_embeddings = get_embeddings_batch(model, tokenizer, black_names)
asian_embeddings = get_embeddings_batch(model, tokenizer, asian_names)
first_name_embeddings = get_embeddings_batch(model, tokenizer, first_names)

#compute the weat scores
name_race_bias_white_black = compute_weat_if_valid(white_embeddings, black_embeddings, first_name_embeddings, first_name_embeddings, "First Name Bias - White vs. Black")
name_race_bias_white_asian = compute_weat_if_valid(white_embeddings, asian_embeddings, first_name_embeddings, first_name_embeddings, "First Name Bias - White vs. Asian")
name_race_bias_black_asian = compute_weat_if_valid(black_embeddings, asian_embeddings, first_name_embeddings, first_name_embeddings, "First Name Bias - Black vs. Asian")

# Print results
print(f"WEAT Effect Size (First Name Bias - White vs. Black): {name_race_bias_white_black}")
print(f"WEAT Effect Size (First Name Bias - White vs. Asian): {name_race_bias_white_asian}")
print(f"WEAT Effect Size (First Name Bias - Black vs. Asian): {name_race_bias_black_asian}")



WEAT Effect Size (First Name Bias - White vs. Black): 0.5224782999898759
WEAT Effect Size (First Name Bias - White vs. Asian): 0.9937564587283643
WEAT Effect Size (First Name Bias - Black vs. Asian): 0.4712781587384883


In [4]:
#Race Bias: Investigate
#Test Racial Bias in STEM/Non-STEM Associations Directly

racial_stem_bias_white_black = compute_weat_if_valid(white_embeddings, black_embeddings, stem_embeddings, non_stem_embeddings, "STEM Bias - White vs. Black")
racial_stem_bias_white_asian = compute_weat_if_valid(white_embeddings, asian_embeddings, stem_embeddings, non_stem_embeddings, "STEM Bias - White vs. Asian")
racial_stem_bias_black_asian = compute_weat_if_valid(black_embeddings, asian_embeddings, stem_embeddings, non_stem_embeddings, "STEM Bias - Black vs. Asian")

print(f"WEAT Effect Size (STEM Bias - White vs. Black): {racial_stem_bias_white_black}")
print(f"WEAT Effect Size (STEM Bias - White vs. Asian): {racial_stem_bias_white_asian}")
print(f"WEAT Effect Size (STEM Bias - Black vs. Asian): {racial_stem_bias_black_asian}")


WEAT Effect Size (STEM Bias - White vs. Black): 1.4525107613878447
WEAT Effect Size (STEM Bias - White vs. Asian): 2.9016678816753605
WEAT Effect Size (STEM Bias - Black vs. Asian): 1.4491571202875158


In [5]:
print("White STEM Names Count:", len(df[(df['Race'] == 'White/Caucasian') & (df['Job Roles'].str.contains('|'.join(stem_keywords), case=False, na=False))]))
print("Asian STEM Names Count:", len(df[(df['Race'] == 'Mongoloid/Asian') & (df['Job Roles'].str.contains('|'.join(stem_keywords), case=False, na=False))]))
print("Black STEM Names Count:", len(df[(df['Race'] == 'Negroid/Black') & (df['Job Roles'].str.contains('|'.join(stem_keywords), case=False, na=False))]))


White STEM Names Count: 1393
Asian STEM Names Count: 1389
Black STEM Names Count: 1400


In [6]:
#Best Match Bias (STEM vs. Non-STEM Within Groups)
# Identify STEM and Non-STEM best match names
stem_best_match_names = df[(df['Best Match'] == 1) & (df['Job Roles'].str.contains('|'.join(stem_keywords), case=False, na=False))]['Job Applicant Name'].dropna().str.split().str[0].unique().tolist()
non_stem_best_match_names = df[(df['Best Match'] == 1) & (~df['Job Roles'].str.contains('|'.join(stem_keywords), case=False, na=False))]['Job Applicant Name'].dropna().str.split().str[0].unique().tolist()

# Compute embeddings for STEM vs. Non-STEM best matches
stem_best_match_embeddings = get_embeddings_batch(model, tokenizer, stem_best_match_names)
non_stem_best_match_embeddings = get_embeddings_batch(model, tokenizer, non_stem_best_match_names)

# Compute WEAT for STEM vs. Non-STEM within racial groups
stem_race_white_black = compute_weat_if_valid(white_embeddings, black_embeddings, stem_best_match_embeddings, non_stem_best_match_embeddings, "STEM Best Match Bias - White vs. Black")
stem_race_white_asian = compute_weat_if_valid(white_embeddings, asian_embeddings, stem_best_match_embeddings, non_stem_best_match_embeddings, "STEM Best Match Bias - White vs. Asian")
stem_race_black_asian = compute_weat_if_valid(black_embeddings, asian_embeddings, stem_best_match_embeddings, non_stem_best_match_embeddings, "STEM Best Match Bias - Black vs. Asian")

# Print results
print(f"WEAT Effect Size (STEM Best Match Bias - White vs. Black): {stem_race_white_black}")
print(f"WEAT Effect Size (STEM Best Match Bias - White vs. Asian): {stem_race_white_asian}")
print(f"WEAT Effect Size (STEM Best Match Bias - Black vs. Asian): {stem_race_black_asian}")


WEAT Effect Size (STEM Best Match Bias - White vs. Black): 0.5230289892461037
WEAT Effect Size (STEM Best Match Bias - White vs. Asian): 0.9406286675042435
WEAT Effect Size (STEM Best Match Bias - Black vs. Asian): 0.4175996782581397


In [7]:
df['Age Group'] = df['Age'].apply(lambda x: 'Young' if x <= 35 else 'Old')
# Categorize names by age
young_names = df[df['Age Group'] == 'Young']['Job Applicant Name'].dropna().str.split().str[0].unique().tolist()
old_names = df[df['Age Group'] == 'Old']['Job Applicant Name'].dropna().str.split().str[0].unique().tolist()

# Compute embeddings
young_embeddings = get_embeddings_batch(model, tokenizer, young_names)
old_embeddings = get_embeddings_batch(model, tokenizer, old_names)

In [8]:
#AGE Bias – (Are Young/Old Names Linked to STEM?)
# Compute WEAT for age-STEM association
age_stem_bias = compute_weat_if_valid(young_embeddings, old_embeddings, stem_embeddings, non_stem_embeddings, "Age Bias - STEM vs. Non-STEM")

# Print results
print(f"WEAT Effect Size (Age Bias - STEM vs. Non-STEM): {age_stem_bias}")


WEAT Effect Size (Age Bias - STEM vs. Non-STEM): 0.010240124055470885
