In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import torch
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Download necessary NLTK resources
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load dataset
df = pd.read_csv("job_applicant_dataset_trial4.csv")

stem_reference = ["algorithm", "neural", "processor", "dataset", "AI", "ML", "deep", "network",
                  "programming", "software", "hardware", "databases", "cybersecurity", "engineering",
                  "science", "physics", "mathematics", "robotics", "biotechnology", "statistics",
                  "machine learning", "data science", "cloud computing", "computer vision", "natural language processing",
                  "big data", "automation", "genomics", "bioinformatics", "quantum computing", "electrical engineering"]

non_stem_reference = ["management", "leadership", "marketing", "sales", "communication", "teamwork",
                       "customer", "business", "strategy", "collaboration", "writing", "counseling",
                       "creativity", "organization", "planning", "support", "human resources",
                       "social work", "public relations", "event planning", "journalism", "advertising",
                       "psychology", "education", "training", "hospitality", "tourism", "real estate"]

# Stoplist to filter out common words that don't provide meaningful classification
stoplist = {"market", "spaces", "directly", "closely", "feedback", "user", "human", "develops", "developed", "capacity"}


def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z ]', '', text)
    words = re.split(r'\s+', text)
    words = [word for word in words if word and word not in stop_words]
    return ' '.join(words)

# Extract text data
resumes = df['Resume'].apply(preprocess).tolist()
job_descriptions = df['Job Description'].apply(preprocess).tolist()
all_texts = resumes + job_descriptions

# Load GPT NEO model
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
model = AutoModel.from_pretrained("EleutherAI/gpt-neo-125M")


"""def get_embeddings_batch(model, tokenizer, text_list, batch_size=32):
    if not text_list:
        return np.array([])  # Return empty array if text list is empty
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id
    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())
    return np.vstack(embeddings) if embeddings else np.array([])"""

# Extract keywords using BERT embeddings + clustering
"""def extract_keywords_with_clustering(text_list, num_keywords=500, num_clusters=2):
    keywords = list(set(" ".join(text_list).split()))[:500]  # Get top 500 unique words
    embeddings = get_embeddings_batch(model, tokenizer, keywords)

    if embeddings.shape[0] < num_clusters:
        return keywords[:num_keywords // 2], keywords[num_keywords // 2:num_keywords]  # Simple split fallback

    pca = PCA(n_components=10)
    reduced_embeddings = pca.fit_transform(embeddings)
    clustering = KMeans(n_clusters=num_clusters, random_state=42)
    labels = clustering.fit_predict(reduced_embeddings)

    cluster_0 = [keywords[i] for i in range(len(keywords)) if labels[i] == 0]
    cluster_1 = [keywords[i] for i in range(len(keywords)) if labels[i] == 1]

    return cluster_0[:num_keywords], cluster_1[:num_keywords]

stem_keywords, non_stem_keywords = extract_keywords_with_clustering(resumes + job_descriptions, num_keywords=50)"""


"""def extract_keywords_with_clustering(text_list, num_keywords=1000, num_clusters=2):
    keywords = list(set(" ".join(text_list).split()))[:500]  # Get top 500 unique words
    embeddings = get_embeddings_batch(model, tokenizer, keywords)

    if embeddings.shape[0] < num_clusters:
        return keywords[:num_keywords // 2], keywords[num_keywords // 2:num_keywords]  # Simple split fallback

    pca = PCA(n_components=10)
    reduced_embeddings = pca.fit_transform(embeddings)
    clustering = KMeans(n_clusters=num_clusters, random_state=42)
    labels = clustering.fit_predict(reduced_embeddings)

    cluster_0 = [keywords[i] for i in range(len(keywords)) if labels[i] == 0]
    cluster_1 = [keywords[i] for i in range(len(keywords)) if labels[i] == 1]

    # Assign clusters based on reference words
    ref_embeddings = get_embeddings_batch(model, tokenizer, stem_reference + non_stem_reference)
    ref_labels = KMeans(n_clusters=2, random_state=42).fit_predict(ref_embeddings)

    if np.mean([labels[i] for i in range(len(stem_reference))]) < 0.5:
        stem_words, non_stem_words = cluster_0, cluster_1
    else:
        stem_words, non_stem_words = cluster_1, cluster_0

    return stem_words[:num_keywords], non_stem_words[:num_keywords]

stem_keywords, non_stem_keywords = extract_keywords_with_clustering(resumes + job_descriptions, num_keywords=50)"""


"""def get_embeddings_batch(model, tokenizer, text_list, batch_size=32):
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id
    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())
    return np.vstack(embeddings) if embeddings else np.array([])"""

def get_embeddings_batch(model, tokenizer, text_list, batch_size=32):
    if not text_list:
        return np.array([])  # Return empty array if text list is empty
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id
    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())
    return np.vstack(embeddings) if embeddings else np.array([])

# Extract keywords using TF-IDF before classification
def extract_top_keywords(text_list, max_features=2000):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(text_list)
    feature_names = vectorizer.get_feature_names_out()
    return [word for word in feature_names if word not in stoplist]

# Extract refined keywords
top_keywords = extract_top_keywords(resumes + job_descriptions, max_features=2000)

# Extract keywords using BERT embeddings + cosine similarity classification
def extract_keywords_with_similarity(keywords, num_keywords=2000, similarity_threshold=0.3):
    word_embeddings = get_embeddings_batch(model, tokenizer, keywords)

    # Get reference embeddings
    stem_embeddings = get_embeddings_batch(model, tokenizer, stem_reference)
    non_stem_embeddings = get_embeddings_batch(model, tokenizer, non_stem_reference)

    def avg_similarity(word_emb, reference_emb):
        return np.mean([1 - cosine(word_emb, ref_emb) for ref_emb in reference_emb])

    stem_words, non_stem_words = [], []
    for word, emb in zip(keywords, word_embeddings):
        if emb.size == 0:
            continue  # Skip words without embeddings
        stem_sim = avg_similarity(emb, stem_embeddings)
        non_stem_sim = avg_similarity(emb, non_stem_embeddings)

        if max(stem_sim, non_stem_sim) < similarity_threshold:
            continue  # Skip words that don't strongly associate with either category

        if stem_sim > non_stem_sim:
            stem_words.append(word)
        else:
            non_stem_words.append(word)

    return stem_words[:num_keywords], non_stem_words[:num_keywords]

stem_keywords, non_stem_keywords = extract_keywords_with_similarity(top_keywords, num_keywords=500)

# Print top words from extracted word sets
print("Top STEM Words:", stem_keywords[:20])
print("Top Non-STEM Words:", non_stem_keywords[:20])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\souji\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  torch.utils._pytree._register_pytree_node(
Using pad_token, but it is not set yet.


Top STEM Words: ['abgd', 'academy', 'access', 'ace', 'acls', 'acvs', 'adapt', 'addiction', 'address', 'adhere', 'adjust', 'adobe', 'advocate', 'aesthetics', 'agile', 'ai', 'algorithms', 'align', 'aligns', 'allow']
Top Non-STEM Words: ['abilities', 'ability', 'academic', 'accessible', 'accountant', 'accounting', 'accredited', 'accurate', 'achieve', 'achieved', 'actionable', 'active', 'actively', 'adapting', 'additionally', 'addressing', 'adjustments', 'administering', 'administration', 'administrator']


In [2]:
# Gender Bias Detection
male_names = df[df['Gender'] == 'Male']['Job Applicant Name'].str.split().str[0].unique().tolist()
female_names = df[df['Gender'] == 'Female']['Job Applicant Name'].str.split().str[0].unique().tolist()
stem_words, non_stem_words = stem_keywords, non_stem_keywords

male_embeddings = get_embeddings_batch(model, tokenizer, male_names)
female_embeddings = get_embeddings_batch(model, tokenizer, female_names)
stem_embeddings = get_embeddings_batch(model, tokenizer, stem_words)
non_stem_embeddings = get_embeddings_batch(model, tokenizer, non_stem_words)

def weat_effect_size_fast(w, a, b, s_x, s_y):
    w_a = np.mean([1 - cosine(w, x) for x in a])
    w_b = np.mean([1 - cosine(w, x) for x in b])
    s_all = np.vstack((s_x, s_y))
    return (w_a - w_b) / np.std([1 - cosine(w, x) for x in s_all])

# Compute WEAT effect sizes for gender bias
stem_male_effect_size = weat_effect_size_fast(np.mean(stem_embeddings, axis=0), male_embeddings, female_embeddings, stem_embeddings, non_stem_embeddings)
stem_female_effect_size = weat_effect_size_fast(np.mean(stem_embeddings, axis=0), female_embeddings, male_embeddings, stem_embeddings, non_stem_embeddings)



print(f"WEAT Effect Size (STEM vs. Male): {stem_male_effect_size}")
print(f"WEAT Effect Size (STEM vs. Female): {stem_female_effect_size}")

WEAT Effect Size (STEM vs. Male): -0.16094043653711582
WEAT Effect Size (STEM vs. Female): 0.16094043653711582


In [3]:
df['Age Group'] = df['Age'].apply(lambda x: 'Young' if x <= 35 else 'Old')

# Extract first names dynamically
first_names = df['Job Applicant Name'].dropna().str.split().str[0].unique().tolist()
first_name_embeddings = get_embeddings_batch(model, tokenizer, first_names)

# First Name Bias Detection (Race & Age)
def compute_weat_if_valid(group_a, group_b, s_x, s_y, label):
    if group_a.size == 0 or group_b.size == 0:
        print(f"Warning: {label} has empty embeddings. Skipping WEAT computation.")
        return np.nan
    return weat_effect_size_fast(np.mean(s_x, axis=0), group_a, group_b, s_x, s_y)

white_names = df[df['Race'] == 'White/Caucasian']['Job Applicant Name'].dropna().str.split().str[0].unique().tolist()
black_names = df[df['Race'] == 'Negroid/Black']['Job Applicant Name'].dropna().str.split().str[0].unique().tolist()

young_names = df[df['Age Group'] == 'Young']['Job Applicant Name'].dropna().str.split().str[0].unique().tolist()
old_names = df[df['Age Group'] == 'Old']['Job Applicant Name'].dropna().str.split().str[0].unique().tolist()

white_embeddings = get_embeddings_batch(model, tokenizer, white_names)
black_embeddings = get_embeddings_batch(model, tokenizer, black_names)
young_embeddings = get_embeddings_batch(model, tokenizer, young_names)
old_embeddings = get_embeddings_batch(model, tokenizer, old_names)

name_race_bias_effect_size = compute_weat_if_valid(white_embeddings, black_embeddings, first_name_embeddings, first_name_embeddings, "First Name Bias - Race")
name_age_bias_effect_size = compute_weat_if_valid(young_embeddings, old_embeddings, first_name_embeddings, first_name_embeddings, "First Name Bias - Age")

print(f"WEAT Effect Size (First Name Bias - Race): {name_race_bias_effect_size}")
print(f"WEAT Effect Size (First Name Bias - Age): {name_age_bias_effect_size}")

# Race and Age Bias Detection Using Best Match Column
best_match_names = df[df['Best Match'] == 1]['Job Applicant Name'].dropna().str.split().str[0].unique().tolist()
not_best_match_names = df[df['Best Match'] == 0]['Job Applicant Name'].dropna().str.split().str[0].unique().tolist()

best_match_embeddings = get_embeddings_batch(model, tokenizer, best_match_names)
not_best_match_embeddings = get_embeddings_batch(model, tokenizer, not_best_match_names)

racial_best_match_bias = compute_weat_if_valid(white_embeddings, black_embeddings, best_match_embeddings, not_best_match_embeddings, "Best Match Bias: White vs. Black")
age_best_match_bias = compute_weat_if_valid(young_embeddings, old_embeddings, best_match_embeddings, not_best_match_embeddings, "Best Match Bias: Young vs. Old")

print(f"WEAT Effect Size (Best Match Bias: White vs. Black): {racial_best_match_bias}")
print(f"WEAT Effect Size (Best Match Bias: Young vs. Old): {age_best_match_bias}")



WEAT Effect Size (First Name Bias - Race): -0.3927058801960939
WEAT Effect Size (First Name Bias - Age): 0.05941708806103723
WEAT Effect Size (Best Match Bias: White vs. Black): -0.3915682949151334
WEAT Effect Size (Best Match Bias: Young vs. Old): 0.05941682572298802
