In [None]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

def cosine_similarity_matrix(a, b):
    a_norm = a / np.linalg.norm(a, axis=1, keepdims=True)
    b_norm = b / np.linalg.norm(b, axis=1, keepdims=True)
    return np.dot(a_norm, b_norm.T)

def weat_effect_size_fast(w, a, b, s_x, s_y):
    w_a = np.mean(cosine_similarity_matrix(w.reshape(1, -1), a))
    w_b = np.mean(cosine_similarity_matrix(w.reshape(1, -1), b))
    s_all = np.vstack((s_x, s_y))
    return (w_a - w_b) / np.std(cosine_similarity_matrix(w.reshape(1, -1), s_all))

def get_embeddings_batch(model, tokenizer, text_list, batch_size=32):
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id
    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())
    return np.vstack(embeddings)

def get_word_embeddings(model, tokenizer, words):
    return get_embeddings_batch(model, tokenizer, words)

def extract_keywords_fast(texts, num_keywords=50):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    keywords = []
    for row in tfidf_matrix:
        top_indices = np.argsort(row.toarray()[0])[-num_keywords:]
        keywords.append([feature_names[idx] for idx in top_indices])
    return keywords

def create_word_sets_fast(model, tokenizer, keywords_list, top_n=100):
    all_keywords = list(set(word for sublist in keywords_list for word in sublist))
    embeddings = get_word_embeddings(model, tokenizer, all_keywords)

    # Apply K-Means clustering
    kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
    labels = kmeans.fit_predict(embeddings)

    # Assign clusters to technical and non-technical categories
    cluster_0 = [all_keywords[i] for i in range(len(labels)) if labels[i] == 0]
    cluster_1 = [all_keywords[i] for i in range(len(labels)) if labels[i] == 1]

    # Heuristic: Assume the cluster with more domain-specific words is technical
    technical_words, non_technical_words = (cluster_0, cluster_1) if len(cluster_0) < len(cluster_1) else (cluster_1, cluster_0)

    return technical_words[:top_n], non_technical_words[:top_n]

male_terms = ["man", "male", "he", "him"]
female_terms = ["woman", "female", "she", "her"]

models = {"BERT": ("bert-base-uncased", AutoModel)}

data = pd.read_csv("/content/job_applicant_dataset_trial4.csv")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
resume_keywords = extract_keywords_fast(data['Resume'].tolist())
job_desc_keywords = extract_keywords_fast(data['Job Description'].tolist())
technical_words, non_technical_words = create_word_sets_fast(model, tokenizer, resume_keywords + job_desc_keywords)

for model_name, (model_path, model_class) in models.items():
    print(f"Running WEAT for {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = model_class.from_pretrained(model_path)
    male_embeddings = get_embeddings_batch(model, tokenizer, male_terms)
    female_embeddings = get_embeddings_batch(model, tokenizer, female_terms)
    technical_embeddings = get_embeddings_batch(model, tokenizer, technical_words)
    non_technical_embeddings = get_embeddings_batch(model, tokenizer, non_technical_words)
    avg_male_embedding = np.mean(male_embeddings, axis=0)
    avg_female_embedding = np.mean(female_embeddings, axis=0)
    avg_technical_embedding = np.mean(technical_embeddings, axis=0)
    avg_non_technical_embedding = np.mean(non_technical_embeddings, axis=0)
    technical_male_effect_size = weat_effect_size_fast(avg_technical_embedding, male_embeddings, female_embeddings, technical_embeddings, non_technical_embeddings)
    technical_female_effect_size = weat_effect_size_fast(avg_technical_embedding, female_embeddings, male_embeddings, technical_embeddings, non_technical_embeddings)
    non_technical_male_effect_size = weat_effect_size_fast(avg_non_technical_embedding, male_embeddings, female_embeddings, technical_embeddings, non_technical_embeddings)
    non_technical_female_effect_size = weat_effect_size_fast(avg_non_technical_embedding, female_embeddings, male_embeddings, technical_embeddings, non_technical_embeddings)
    print(f"WEAT Effect Size (Technical vs. Male): {technical_male_effect_size}")
    print(f"WEAT Effect Size (Technical vs. Female): {technical_female_effect_size}")
    print(f"WEAT Effect Size (Non-Technical vs. Male): {non_technical_male_effect_size}")
    print(f"WEAT Effect Size (Non-Technical vs. Female): {non_technical_female_effect_size}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Running WEAT for BERT
WEAT Effect Size (Technical vs. Male): -0.19896414875984192
WEAT Effect Size (Technical vs. Female): 0.19896414875984192
WEAT Effect Size (Non-Technical vs. Male): -0.11526905000209808
WEAT Effect Size (Non-Technical vs. Female): 0.11526905000209808


In [None]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

def cosine_similarity_matrix(a, b):
    a_norm = a / np.linalg.norm(a, axis=1, keepdims=True)
    b_norm = b / np.linalg.norm(b, axis=1, keepdims=True)
    return np.dot(a_norm, b_norm.T)

def weat_effect_size_fast(w, a, b, s_x, s_y):
    w_a = np.mean(cosine_similarity_matrix(w.reshape(1, -1), a))
    w_b = np.mean(cosine_similarity_matrix(w.reshape(1, -1), b))
    s_all = np.vstack((s_x, s_y))
    return (w_a - w_b) / np.std(cosine_similarity_matrix(w.reshape(1, -1), s_all))

def get_embeddings_batch(model, tokenizer, text_list, batch_size=32):
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id
    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())
    return np.vstack(embeddings)

def get_word_embeddings(model, tokenizer, words):
    return get_embeddings_batch(model, tokenizer, words)

def extract_keywords_fast(texts, num_keywords=50):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    keywords = []
    for row in tfidf_matrix:
        top_indices = np.argsort(row.toarray()[0])[-num_keywords:]
        keywords.append([feature_names[idx] for idx in top_indices])
    return keywords

def create_word_sets_fast(model, tokenizer, keywords_list, top_n=100):
    all_keywords = list(set(word for sublist in keywords_list for word in sublist))
    embeddings = get_word_embeddings(model, tokenizer, all_keywords)

    # Reduce dimensionality using PCA for better clustering
    pca = PCA(n_components=10)
    reduced_embeddings = pca.fit_transform(embeddings)

    # Apply DBSCAN clustering
    clustering = DBSCAN(eps=0.5, min_samples=2, metric='euclidean').fit(reduced_embeddings)
    labels = clustering.labels_

    # Separate words based on clusters
    cluster_0 = [all_keywords[i] for i in range(len(labels)) if labels[i] == 0]
    cluster_1 = [all_keywords[i] for i in range(len(labels)) if labels[i] == 1]

    # Assign clusters
    technical_words, non_technical_words = (cluster_0, cluster_1) if len(cluster_0) < len(cluster_1) else (cluster_1, cluster_0)

    print("Sample Technical Words:", technical_words[:10])
    print("Sample Non-Technical Words:", non_technical_words[:10])

    return technical_words[:top_n], non_technical_words[:top_n]

male_terms = ["man", "male", "he", "him"]
female_terms = ["woman", "female", "she", "her"]

models = {"BERT": ("bert-base-uncased", AutoModel)}

data = pd.read_csv("/content/job_applicant_dataset_trial4.csv")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
resume_keywords = extract_keywords_fast(data['Resume'].tolist())
job_desc_keywords = extract_keywords_fast(data['Job Description'].tolist())
technical_words, non_technical_words = create_word_sets_fast(model, tokenizer, resume_keywords + job_desc_keywords)

for model_name, (model_path, model_class) in models.items():
    print(f"Running WEAT for {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = model_class.from_pretrained(model_path)
    male_embeddings = get_embeddings_batch(model, tokenizer, male_terms)
    female_embeddings = get_embeddings_batch(model, tokenizer, female_terms)
    technical_embeddings = get_embeddings_batch(model, tokenizer, technical_words)
    non_technical_embeddings = get_embeddings_batch(model, tokenizer, non_technical_words)
    avg_male_embedding = np.mean(male_embeddings, axis=0)
    avg_female_embedding = np.mean(female_embeddings, axis=0)
    avg_technical_embedding = np.mean(technical_embeddings, axis=0)
    avg_non_technical_embedding = np.mean(non_technical_embeddings, axis=0)
    technical_male_effect_size = weat_effect_size_fast(avg_technical_embedding, male_embeddings, female_embeddings, technical_embeddings, non_technical_embeddings)
    technical_female_effect_size = weat_effect_size_fast(avg_technical_embedding, female_embeddings, male_embeddings, technical_embeddings, non_technical_embeddings)
    non_technical_male_effect_size = weat_effect_size_fast(avg_non_technical_embedding, male_embeddings, female_embeddings, technical_embeddings, non_technical_embeddings)
    non_technical_female_effect_size = weat_effect_size_fast(avg_non_technical_embedding, female_embeddings, male_embeddings, technical_embeddings, non_technical_embeddings)
    print(f"WEAT Effect Size (Technical vs. Male): {technical_male_effect_size}")
    print(f"WEAT Effect Size (Technical vs. Female): {technical_female_effect_size}")
    print(f"WEAT Effect Size (Non-Technical vs. Male): {non_technical_male_effect_size}")
    print(f"WEAT Effect Size (Non-Technical vs. Female): {non_technical_female_effect_size}")


Sample Technical Words: ['harvard', 'stanford']
Sample Non-Technical Words: ['structural', 'nutritional']
Running WEAT for BERT
WEAT Effect Size (Technical vs. Male): -0.3001048266887665
WEAT Effect Size (Technical vs. Female): 0.3001048266887665
WEAT Effect Size (Non-Technical vs. Male): -0.3287501037120819
WEAT Effect Size (Non-Technical vs. Female): 0.3287501037120819


In [None]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

def cosine_similarity_matrix(a, b):
    a_norm = a / np.linalg.norm(a, axis=1, keepdims=True)
    b_norm = b / np.linalg.norm(b, axis=1, keepdims=True)
    return np.dot(a_norm, b_norm.T)

def weat_effect_size_fast(w, a, b, s_x, s_y):
    w_a = np.mean(cosine_similarity_matrix(w.reshape(1, -1), a))
    w_b = np.mean(cosine_similarity_matrix(w.reshape(1, -1), b))
    s_all = np.vstack((s_x, s_y))
    return (w_a - w_b) / np.std(cosine_similarity_matrix(w.reshape(1, -1), s_all))

def get_embeddings_batch(model, tokenizer, text_list, batch_size=32):
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id
    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())
    return np.vstack(embeddings)

def get_word_embeddings(model, tokenizer, words):
    return get_embeddings_batch(model, tokenizer, words)

def extract_keywords_fast(texts, num_keywords=50):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    keywords = []
    for row in tfidf_matrix:
        top_indices = np.argsort(row.toarray()[0])[-num_keywords:]
        keywords.append([feature_names[idx] for idx in top_indices])
    return keywords

def create_word_sets_fast(model, tokenizer, keywords_list, top_n=100):
    all_keywords = list(set(word for sublist in keywords_list for word in sublist))
    embeddings = get_word_embeddings(model, tokenizer, all_keywords)

    # Reduce dimensionality using PCA for better clustering
    pca = PCA(n_components=10)
    reduced_embeddings = pca.fit_transform(embeddings)

    # Apply DBSCAN clustering with cosine similarity
    clustering = DBSCAN(eps=0.7, min_samples=2, metric='cosine').fit(reduced_embeddings)
    labels = clustering.labels_

    # Separate words based on clusters
    unique_labels = set(labels)
    if len(unique_labels) < 2:
        print("Warning: DBSCAN formed only one cluster. Falling back to K-Means.")
        kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
        labels = kmeans.fit_predict(reduced_embeddings)

    cluster_0 = [all_keywords[i] for i in range(len(labels)) if labels[i] == 0]
    cluster_1 = [all_keywords[i] for i in range(len(labels)) if labels[i] == 1]

    print(f"Cluster 0 Size: {len(cluster_0)}, Cluster 1 Size: {len(cluster_1)}")
    print("Sample Cluster 0 Words:", cluster_0[:10])
    print("Sample Cluster 1 Words:", cluster_1[:10])

    # Assign clusters based on size heuristic
    technical_words, non_technical_words = (cluster_0, cluster_1) if len(cluster_0) < len(cluster_1) else (cluster_1, cluster_0)

    print("Final Technical Words:", technical_words[:10])
    print("Final Non-Technical Words:", non_technical_words[:10])

    return technical_words[:top_n], non_technical_words[:top_n]

male_terms = ["man", "male", "he", "him"]
female_terms = ["woman", "female", "she", "her"]

models = {"BERT": ("bert-base-uncased", AutoModel)}

data = pd.read_csv("/content/job_applicant_dataset_trial4.csv")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
resume_keywords = extract_keywords_fast(data['Resume'].tolist())
job_desc_keywords = extract_keywords_fast(data['Job Description'].tolist())
technical_words, non_technical_words = create_word_sets_fast(model, tokenizer, resume_keywords + job_desc_keywords)

for model_name, (model_path, model_class) in models.items():
    print(f"Running WEAT for {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = model_class.from_pretrained(model_path)
    male_embeddings = get_embeddings_batch(model, tokenizer, male_terms)
    female_embeddings = get_embeddings_batch(model, tokenizer, female_terms)
    technical_embeddings = get_embeddings_batch(model, tokenizer, technical_words)
    non_technical_embeddings = get_embeddings_batch(model, tokenizer, non_technical_words)
    avg_male_embedding = np.mean(male_embeddings, axis=0)
    avg_female_embedding = np.mean(female_embeddings, axis=0)
    avg_technical_embedding = np.mean(technical_embeddings, axis=0)
    avg_non_technical_embedding = np.mean(non_technical_embeddings, axis=0)
    technical_male_effect_size = weat_effect_size_fast(avg_technical_embedding, male_embeddings, female_embeddings, technical_embeddings, non_technical_embeddings)
    technical_female_effect_size = weat_effect_size_fast(avg_technical_embedding, female_embeddings, male_embeddings, technical_embeddings, non_technical_embeddings)
    non_technical_male_effect_size = weat_effect_size_fast(avg_non_technical_embedding, male_embeddings, female_embeddings, technical_embeddings, non_technical_embeddings)
    non_technical_female_effect_size = weat_effect_size_fast(avg_non_technical_embedding, female_embeddings, male_embeddings, technical_embeddings, non_technical_embeddings)
    print(f"WEAT Effect Size (Technical vs. Male): {technical_male_effect_size}")
    print(f"WEAT Effect Size (Technical vs. Female): {technical_female_effect_size}")
    print(f"WEAT Effect Size (Non-Technical vs. Male): {non_technical_male_effect_size}")
    print(f"WEAT Effect Size (Non-Technical vs. Female): {non_technical_female_effect_size}")


Cluster 0 Size: 578, Cluster 1 Size: 877
Sample Cluster 0 Words: ['cisp', 'physiology', 'aligns', 'dba', 'architects', 'javascript', 'media', 'advanced', 'methodology', 'hypotheses']
Sample Cluster 1 Words: ['adhere', 'solution', 'sales', 'abilities', 'think', 'excellent', 'ongoing', 'prescribe', 'hospital', 'engine']
Final Technical Words: ['cisp', 'physiology', 'aligns', 'dba', 'architects', 'javascript', 'media', 'advanced', 'methodology', 'hypotheses']
Final Non-Technical Words: ['adhere', 'solution', 'sales', 'abilities', 'think', 'excellent', 'ongoing', 'prescribe', 'hospital', 'engine']
Running WEAT for BERT
WEAT Effect Size (Technical vs. Male): -0.2007535994052887
WEAT Effect Size (Technical vs. Female): 0.2007535994052887
WEAT Effect Size (Non-Technical vs. Male): -0.1131492331624031
WEAT Effect Size (Non-Technical vs. Female): 0.1131492331624031


In [None]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def cosine_similarity_matrix(a, b):
    a_norm = a / np.linalg.norm(a, axis=1, keepdims=True)
    b_norm = b / np.linalg.norm(b, axis=1, keepdims=True)
    return np.dot(a_norm, b_norm.T)

def weat_effect_size_fast(w, a, b, s_x, s_y):
    w_a = np.mean(cosine_similarity_matrix(w.reshape(1, -1), a))
    w_b = np.mean(cosine_similarity_matrix(w.reshape(1, -1), b))
    s_all = np.vstack((s_x, s_y))
    return (w_a - w_b) / np.std(cosine_similarity_matrix(w.reshape(1, -1), s_all))

def get_embeddings_batch(model, tokenizer, text_list, batch_size=32):
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id
    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())
    return np.vstack(embeddings)

def get_word_embeddings(model, tokenizer, words):
    return get_embeddings_batch(model, tokenizer, words)

def extract_keywords_fast(texts, num_keywords=50):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    keywords = []
    for row in tfidf_matrix:
        top_indices = np.argsort(row.toarray()[0])[-num_keywords:]
        keywords.append([feature_names[idx] for idx in top_indices if feature_names[idx] not in stop_words])
    return keywords

def create_word_sets_fast(model, tokenizer, keywords_list, top_n=100):
    all_keywords = list(set(word for sublist in keywords_list for word in sublist if word not in stop_words))
    embeddings = get_word_embeddings(model, tokenizer, all_keywords)

    # Reduce dimensionality using PCA for better clustering
    pca = PCA(n_components=10)
    reduced_embeddings = pca.fit_transform(embeddings)

    # Apply Hierarchical Clustering with correct metric parameter
    clustering = AgglomerativeClustering(n_clusters=2, metric='euclidean', linkage='ward')
    labels = clustering.fit_predict(reduced_embeddings)

    # Separate words based on clusters
    cluster_0 = [all_keywords[i] for i in range(len(labels)) if labels[i] == 0]
    cluster_1 = [all_keywords[i] for i in range(len(labels)) if labels[i] == 1]

    print(f"Cluster 0 Size: {len(cluster_0)}, Cluster 1 Size: {len(cluster_1)}")
    print("Sample Cluster 0 Words:", cluster_0[:10])
    print("Sample Cluster 1 Words:", cluster_1[:10])

    # Assign clusters based on size heuristic
    technical_words, non_technical_words = (cluster_0, cluster_1) if len(cluster_0) < len(cluster_1) else (cluster_1, cluster_0)

    print("Final Technical Words:", technical_words[:10])
    print("Final Non-Technical Words:", non_technical_words[:10])

    return technical_words[:top_n], non_technical_words[:top_n]

male_terms = ["man", "male", "he", "him"]
female_terms = ["woman", "female", "she", "her"]

models = {"BERT": ("bert-base-uncased", AutoModel)}

data = pd.read_csv("/content/job_applicant_dataset_trial4.csv")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
resume_keywords = extract_keywords_fast(data['Resume'].tolist())
job_desc_keywords = extract_keywords_fast(data['Job Description'].tolist())
technical_words, non_technical_words = create_word_sets_fast(model, tokenizer, resume_keywords + job_desc_keywords)

for model_name, (model_path, model_class) in models.items():
    print(f"Running WEAT for {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = model_class.from_pretrained(model_path)
    male_embeddings = get_embeddings_batch(model, tokenizer, male_terms)
    female_embeddings = get_embeddings_batch(model, tokenizer, female_terms)
    technical_embeddings = get_embeddings_batch(model, tokenizer, technical_words)
    non_technical_embeddings = get_embeddings_batch(model, tokenizer, non_technical_words)
    avg_male_embedding = np.mean(male_embeddings, axis=0)
    avg_female_embedding = np.mean(female_embeddings, axis=0)
    avg_technical_embedding = np.mean(technical_embeddings, axis=0)
    avg_non_technical_embedding = np.mean(non_technical_embeddings, axis=0)
    technical_male_effect_size = weat_effect_size_fast(avg_technical_embedding, male_embeddings, female_embeddings, technical_embeddings, non_technical_embeddings)
    technical_female_effect_size = weat_effect_size_fast(avg_technical_embedding, female_embeddings, male_embeddings, technical_embeddings, non_technical_embeddings)
    non_technical_male_effect_size = weat_effect_size_fast(avg_non_technical_embedding, male_embeddings, female_embeddings, technical_embeddings, non_technical_embeddings)
    non_technical_female_effect_size = weat_effect_size_fast(avg_non_technical_embedding, female_embeddings, male_embeddings, technical_embeddings, non_technical_embeddings)
    print(f"WEAT Effect Size (Technical vs. Male): {technical_male_effect_size}")
    print(f"WEAT Effect Size (Technical vs. Female): {technical_female_effect_size}")
    print(f"WEAT Effect Size (Non-Technical vs. Male): {non_technical_male_effect_size}")
    print(f"WEAT Effect Size (Non-Technical vs. Female): {non_technical_female_effect_size}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Cluster 0 Size: 886, Cluster 1 Size: 568
Sample Cluster 0 Words: ['workflows', 'improvement', 'mitigate', 'coach', 'cisp', 'communities', 'gsec', 'counseling', 'public', 'documentation']
Sample Cluster 1 Words: ['planners', 'pastry', 'solving', 'target', 'meets', 'making', 'lives', 'producing', 'administering', 'response']
Final Technical Words: ['planners', 'pastry', 'solving', 'target', 'meets', 'making', 'lives', 'producing', 'administering', 'response']
Final Non-Technical Words: ['workflows', 'improvement', 'mitigate', 'coach', 'cisp', 'communities', 'gsec', 'counseling', 'public', 'documentation']
Running WEAT for BERT
WEAT Effect Size (Technical vs. Male): -0.13780327141284943
WEAT Effect Size (Technical vs. Female): 0.13780327141284943
WEAT Effect Size (Non-Technical vs. Male): -0.20972082018852234
WEAT Effect Size (Non-Technical vs. Female): 0.20972082018852234


In [None]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk
from nltk.corpus import stopwords
from scipy.spatial.distance import cosine

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def cosine_similarity_matrix(a, b):
    a_norm = a / np.linalg.norm(a, axis=1, keepdims=True)
    b_norm = b / np.linalg.norm(b, axis=1, keepdims=True)
    return np.dot(a_norm, b_norm.T)

def weat_effect_size_fast(w, a, b, s_x, s_y):
    w_a = np.mean(cosine_similarity_matrix(w.reshape(1, -1), a))
    w_b = np.mean(cosine_similarity_matrix(w.reshape(1, -1), b))
    s_all = np.vstack((s_x, s_y))
    return (w_a - w_b) / np.std(cosine_similarity_matrix(w.reshape(1, -1), s_all))

def get_embeddings_batch(model, tokenizer, text_list, batch_size=32):
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id
    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())
    return np.vstack(embeddings)

def get_word_embeddings(model, tokenizer, words):
    return get_embeddings_batch(model, tokenizer, words)

def extract_keywords_fast(texts, num_keywords=150):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    keywords = []
    for row in tfidf_matrix:
        top_indices = np.argsort(row.toarray()[0])[-num_keywords:]
        keywords.append([feature_names[idx] for idx in top_indices if feature_names[idx] not in stop_words])
    return keywords

def refine_clusters(technical_words, non_technical_words, word_embeddings):
    tech_mean = np.mean([word_embeddings[word] for word in technical_words], axis=0)
    non_tech_mean = np.mean([word_embeddings[word] for word in non_technical_words], axis=0)

    refined_technical = []
    refined_non_technical = []

    for word, embedding in word_embeddings.items():
        if cosine(embedding, tech_mean) < cosine(embedding, non_tech_mean):
            refined_technical.append(word)
        else:
            refined_non_technical.append(word)

    return refined_technical, refined_non_technical

def create_word_sets_fast(model, tokenizer, keywords_list, top_n=100):
    all_keywords = list(set(word for sublist in keywords_list for word in sublist if word not in stop_words))
    word_embeddings = {word: get_word_embeddings(model, tokenizer, [word])[0] for word in all_keywords}

    # Reduce dimensionality using PCA for better clustering
    pca = PCA(n_components=10)
    reduced_embeddings = pca.fit_transform(list(word_embeddings.values()))

    # Apply Hierarchical Clustering
    clustering = AgglomerativeClustering(n_clusters=2, metric='euclidean', linkage='ward')
    labels = clustering.fit_predict(reduced_embeddings)

    # Separate words based on clusters
    cluster_0 = [all_keywords[i] for i in range(len(labels)) if labels[i] == 0]
    cluster_1 = [all_keywords[i] for i in range(len(labels)) if labels[i] == 1]

    print(f"Cluster 0 Size: {len(cluster_0)}, Cluster 1 Size: {len(cluster_1)}")
    print("Sample Cluster 0 Words:", cluster_0[:10])
    print("Sample Cluster 1 Words:", cluster_1[:10])

    # Assign clusters based on size heuristic
    technical_words, non_technical_words = (cluster_0, cluster_1) if len(cluster_0) < len(cluster_1) else (cluster_1, cluster_0)

    # Refine clusters based on embedding similarity
    technical_words, non_technical_words = refine_clusters(technical_words, non_technical_words, word_embeddings)

    print("Final Technical Words:", technical_words[:10])
    print("Final Non-Technical Words:", non_technical_words[:10])

    return technical_words[:top_n], non_technical_words[:top_n]

male_terms = ["man", "male", "he", "him"]
female_terms = ["woman", "female", "she", "her"]

models = {"BERT": ("bert-base-uncased", AutoModel)}

data = pd.read_csv("/content/job_applicant_dataset_trial4.csv")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
resume_keywords = extract_keywords_fast(data['Resume'].tolist())
job_desc_keywords = extract_keywords_fast(data['Job Description'].tolist())
technical_words, non_technical_words = create_word_sets_fast(model, tokenizer, resume_keywords + job_desc_keywords)

for model_name, (model_path, model_class) in models.items():
    print(f"Running WEAT for {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = model_class.from_pretrained(model_path)
    male_embeddings = get_embeddings_batch(model, tokenizer, male_terms)
    female_embeddings = get_embeddings_batch(model, tokenizer, female_terms)
    technical_embeddings = get_embeddings_batch(model, tokenizer, technical_words)
    non_technical_embeddings = get_embeddings_batch(model, tokenizer, non_technical_words)
    avg_male_embedding = np.mean(male_embeddings, axis=0)
    avg_female_embedding = np.mean(female_embeddings, axis=0)
    avg_technical_embedding = np.mean(technical_embeddings, axis=0)
    avg_non_technical_embedding = np.mean(non_technical_embeddings, axis=0)
    technical_male_effect_size = weat_effect_size_fast(avg_technical_embedding, male_embeddings, female_embeddings, technical_embeddings, non_technical_embeddings)
    technical_female_effect_size = weat_effect_size_fast(avg_technical_embedding, female_embeddings, male_embeddings, technical_embeddings, non_technical_embeddings)
    non_technical_male_effect_size = weat_effect_size_fast(avg_non_technical_embedding, male_embeddings, female_embeddings, technical_embeddings, non_technical_embeddings)
    non_technical_female_effect_size = weat_effect_size_fast(avg_non_technical_embedding, female_embeddings, male_embeddings, technical_embeddings, non_technical_embeddings)
    print(f"WEAT Effect Size (Technical vs. Male): {technical_male_effect_size}")
    print(f"WEAT Effect Size (Technical vs. Female): {technical_female_effect_size}")
    print(f"WEAT Effect Size (Non-Technical vs. Male): {non_technical_male_effect_size}")
    print(f"WEAT Effect Size (Non-Technical vs. Female): {non_technical_female_effect_size}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cluster 0 Size: 680, Cluster 1 Size: 774
Sample Cluster 0 Words: ['progress', 'human', 'cbp', 'maximum', 'dmd', 'optimize', 'pedagogy', 'semrush', 'comprehensive', 'aviation']
Sample Cluster 1 Words: ['learn', 'maintain', 'handler', 'responding', 'tested', 'launches', 'emergencies', 'google', 'sensors', 'reporting']
Final Technical Words: ['human', 'cbp', 'maximum', 'dmd', 'optimize', 'pedagogy', 'semrush', 'comprehensive', 'aviation', 'weather']
Final Non-Technical Words: ['progress', 'learn', 'maintain', 'handler', 'responding', 'tested', 'launches', 'emergencies', 'google', 'sensors']
Running WEAT for BERT
WEAT Effect Size (Technical vs. Male): -0.16136282682418823
WEAT Effect Size (Technical vs. Female): 0.16136282682418823
WEAT Effect Size (Non-Technical vs. Male): -0.1031954288482666
WEAT Effect Size (Non-Technical vs. Female): 0.1031954288482666


In [None]:
# Agglomerative Clustering
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk
from nltk.corpus import stopwords
from scipy.spatial.distance import cosine

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def cosine_similarity_matrix(a, b):
    a_norm = a / np.linalg.norm(a, axis=1, keepdims=True)
    b_norm = b / np.linalg.norm(b, axis=1, keepdims=True)
    return np.dot(a_norm, b_norm.T)

def weat_effect_size_fast(w, a, b, s_x, s_y):
    w_a = np.mean(cosine_similarity_matrix(w.reshape(1, -1), a))
    w_b = np.mean(cosine_similarity_matrix(w.reshape(1, -1), b))
    s_all = np.vstack((s_x, s_y))
    return (w_a - w_b) / np.std(cosine_similarity_matrix(w.reshape(1, -1), s_all))

def get_embeddings_batch(model, tokenizer, text_list, batch_size=32):
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id
    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())
    return np.vstack(embeddings)

def extract_keywords_fast(texts, num_keywords=1000):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    keywords = []
    for row in tfidf_matrix:
        top_indices = np.argsort(row.toarray()[0])[-num_keywords:]
        keywords.extend([feature_names[idx] for idx in top_indices if feature_names[idx] not in stop_words])
    return list(set(keywords))  # Ensure unique keywords

def create_stem_word_sets(model, tokenizer, stem_keywords, non_stem_keywords, top_n=100):
    all_keywords = list(set(stem_keywords + non_stem_keywords))  # Flatten lists if needed
    word_embeddings = {word: get_embeddings_batch(model, tokenizer, [word])[0] for word in all_keywords if isinstance(word, str) and word.strip()}

    pca = PCA(n_components=10)
    reduced_embeddings = pca.fit_transform(list(word_embeddings.values()))

    clustering = AgglomerativeClustering(n_clusters=2, metric='euclidean', linkage='ward')
    labels = clustering.fit_predict(reduced_embeddings)

    cluster_0 = [word for i, word in enumerate(all_keywords) if labels[i] == 0]
    cluster_1 = [word for i, word in enumerate(all_keywords) if labels[i] == 1]

    stem_words, non_stem_words = (cluster_0, cluster_1) if len(cluster_0) < len(cluster_1) else (cluster_1, cluster_0)
    print("Final Technical Words:", stem_words[:20])
    print("Final Non-Technical Words:", non_stem_words[:20])
    return stem_words[:top_n], non_stem_words[:top_n]

male_terms = ["man", "male", "he", "him"]
female_terms = ["woman", "female", "she", "her"]

models = {"BERT": ("bert-base-uncased", AutoModel)}

data = pd.read_csv("/content/job_applicant_dataset_trial4.csv")

stem_jobs = ["Software Engineer", "Machine Learning Engineer", "AI Researcher", "Robotics Engineer", "Mechanical Engineer", "Cloud Architect", "Cybersecurity Analyst", "Web Developer", "Database Administrator", "Data Analyst", "Research Scientist", "Environmental Scientist", "Biomedical Engineer", "Physician", "Pharmacist", "Dentist", "Veterinarian", "Nurse", "Civil Engineer", "Systems Analyst", "AI Specialist", "SEO Specialist"]

stem_resumes = data[data['Job Roles'].isin(stem_jobs)]['Resume'].tolist()
non_stem_resumes = data[~data['Job Roles'].isin(stem_jobs)]['Resume'].tolist()

stem_keywords = extract_keywords_fast(stem_resumes)
non_stem_keywords = extract_keywords_fast(non_stem_resumes)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

stem_words, non_stem_words = create_stem_word_sets(model, tokenizer, stem_keywords, non_stem_keywords)

for model_name, (model_path, model_class) in models.items():
    print(f"Running WEAT for {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = model_class.from_pretrained(model_path)
    male_embeddings = get_embeddings_batch(model, tokenizer, male_terms)
    female_embeddings = get_embeddings_batch(model, tokenizer, female_terms)
    stem_embeddings = get_embeddings_batch(model, tokenizer, stem_words)
    non_stem_embeddings = get_embeddings_batch(model, tokenizer, non_stem_words)
    avg_stem_embedding = np.mean(stem_embeddings, axis=0)
    avg_non_stem_embedding = np.mean(non_stem_embeddings, axis=0)

    stem_male_effect_size = weat_effect_size_fast(avg_stem_embedding, male_embeddings, female_embeddings, stem_embeddings, non_stem_embeddings)
    stem_female_effect_size = weat_effect_size_fast(avg_stem_embedding, female_embeddings, male_embeddings, stem_embeddings, non_stem_embeddings)

    print(f"WEAT Effect Size (STEM vs. Male): {stem_male_effect_size}")
    print(f"WEAT Effect Size (STEM vs. Female): {stem_female_effect_size}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Final Technical Words: ['roadmap', 'cbp', 'dmd', 'scrummaster', 'pedagogy', 'semrush', 'palliative', 'cbap', '13485', 'cfi', 'autocad', 'vts', 'cpce', 'xd', 'cism', 'scalability', 'fms', 'journeyman', 'tesol', 'chc']
Final Non-Technical Words: ['pharmaceuticals', 'degree', 'psychologist', 'creation', 'private', 'owner', 'human', 'geographic', 'compliance', 'handler', 'design', 'service', 'ethical', 'medicine', 'equipment', 'illustrator', 'feedback', 'psychology', 'weather', 'google']
Running WEAT for BERT
WEAT Effect Size (STEM vs. Male): -0.02400081604719162
WEAT Effect Size (STEM vs. Female): 0.02400081604719162


In [None]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk
from nltk.corpus import stopwords
from scipy.spatial.distance import cosine

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def cosine_similarity_matrix(a, b):
    a_norm = a / np.linalg.norm(a, axis=1, keepdims=True)
    b_norm = b / np.linalg.norm(b, axis=1, keepdims=True)
    return np.dot(a_norm, b_norm.T)

def weat_effect_size_fast(w, a, b, s_x, s_y):
    w_a = np.mean(cosine_similarity_matrix(w.reshape(1, -1), a))
    w_b = np.mean(cosine_similarity_matrix(w.reshape(1, -1), b))
    s_all = np.vstack((s_x, s_y))
    return (w_a - w_b) / np.std(cosine_similarity_matrix(w.reshape(1, -1), s_all))

def get_embeddings_batch(model, tokenizer, text_list, batch_size=32):
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id
    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())
    return np.vstack(embeddings)

def extract_keywords_fast(texts, num_keywords=50):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    keywords = []
    for row in tfidf_matrix:
        top_indices = np.argsort(row.toarray()[0])[-num_keywords:]
        keywords.extend([feature_names[idx] for idx in top_indices if feature_names[idx] not in stop_words])
    return list(set(keywords))  # Ensure unique keywords

from sklearn.cluster import KMeans

def create_stem_word_sets(model, tokenizer, stem_keywords, non_stem_keywords, top_n=100):
    all_keywords = list(set(stem_keywords + non_stem_keywords))  # Flatten lists if needed
    word_embeddings = {word: get_embeddings_batch(model, tokenizer, [word])[0] for word in all_keywords if isinstance(word, str) and word.strip()}

    pca = PCA(n_components=10)
    reduced_embeddings = pca.fit_transform(list(word_embeddings.values()))

    clustering = KMeans(n_clusters=2, random_state=42)
    labels = clustering.fit_predict(reduced_embeddings)

    cluster_0 = [word for i, word in enumerate(all_keywords) if labels[i] == 0]
    cluster_1 = [word for i, word in enumerate(all_keywords) if labels[i] == 1]

    stem_words, non_stem_words = (cluster_0, cluster_1) if len(cluster_0) < len(cluster_1) else (cluster_1, cluster_0)

    return stem_words[:top_n], non_stem_words[:top_n]

male_terms = ["man", "male", "he", "him"]
female_terms = ["woman", "female", "she", "her"]

models = {"BERT": ("bert-base-uncased", AutoModel)}

data = pd.read_csv("/content/job_applicant_dataset_trial4.csv")

stem_jobs = ["Software Engineer", "Machine Learning Engineer", "AI Researcher", "Robotics Engineer", "Mechanical Engineer", "Cloud Architect", "Cybersecurity Analyst", "Web Developer", "Database Administrator", "Data Analyst", "Research Scientist", "Environmental Scientist", "Biomedical Engineer", "Physician", "Pharmacist", "Dentist", "Veterinarian", "Nurse", "Civil Engineer", "Systems Analyst", "AI Specialist", "SEO Specialist"]

stem_resumes = data[data['Job Roles'].isin(stem_jobs)]['Resume'].tolist()
non_stem_resumes = data[~data['Job Roles'].isin(stem_jobs)]['Resume'].tolist()

stem_keywords = extract_keywords_fast(stem_resumes)
non_stem_keywords = extract_keywords_fast(non_stem_resumes)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

stem_words, non_stem_words = create_stem_word_sets(model, tokenizer, stem_keywords, non_stem_keywords)

for model_name, (model_path, model_class) in models.items():
    print(f"Running WEAT for {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = model_class.from_pretrained(model_path)
    male_embeddings = get_embeddings_batch(model, tokenizer, male_terms)
    female_embeddings = get_embeddings_batch(model, tokenizer, female_terms)
    stem_embeddings = get_embeddings_batch(model, tokenizer, stem_words)
    non_stem_embeddings = get_embeddings_batch(model, tokenizer, non_stem_words)
    avg_stem_embedding = np.mean(stem_embeddings, axis=0)
    avg_non_stem_embedding = np.mean(non_stem_embeddings, axis=0)

    stem_male_effect_size = weat_effect_size_fast(avg_stem_embedding, male_embeddings, female_embeddings, stem_embeddings, non_stem_embeddings)
    stem_female_effect_size = weat_effect_size_fast(avg_stem_embedding, female_embeddings, male_embeddings, stem_embeddings, non_stem_embeddings)

    print(f"WEAT Effect Size (STEM vs. Male): {stem_male_effect_size}")
    print(f"WEAT Effect Size (STEM vs. Female): {stem_female_effect_size}")
    print("Top STEM Words:", stem_words[:20])
    print("Top Non-STEM Words:", non_stem_words[:20])
    print(f"WEAT Effect Size (STEM vs. Female): {stem_female_effect_size}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Running WEAT for BERT
WEAT Effect Size (STEM vs. Male): -0.2138620913028717
WEAT Effect Size (STEM vs. Female): 0.2138620913028717
Top STEM Words: ['pharmaceuticals', 'roadmap', 'psychologist', 'scrummaster', 'medicine', 'pedagogy', 'equipment', 'psychology', 'weather', 'auditing', 'facebook', 'goal', 'certifications', 'leadership', 'gphr', 'palliative', 'transportation', 'health', 'hygiene', 'autocad']
Top Non-STEM Words: ['degree', 'creation', 'private', 'owner', 'human', 'geographic', 'cbp', 'compliance', 'handler', 'design', 'dmd', 'service', 'ethical', 'illustrator', 'feedback', 'semrush', 'google', 'insertion', 'cross', 'academy']
WEAT Effect Size (STEM vs. Female): 0.2138620913028717


In [None]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk
from nltk.corpus import stopwords
from scipy.spatial.distance import cosine

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def cosine_similarity_matrix(a, b):
    a_norm = a / np.linalg.norm(a, axis=1, keepdims=True)
    b_norm = b / np.linalg.norm(b, axis=1, keepdims=True)
    return np.dot(a_norm, b_norm.T)

def weat_effect_size_fast(w, a, b, s_x, s_y):
    w_a = np.mean(cosine_similarity_matrix(w.reshape(1, -1), a))
    w_b = np.mean(cosine_similarity_matrix(w.reshape(1, -1), b))
    s_all = np.vstack((s_x, s_y))
    return (w_a - w_b) / np.std(cosine_similarity_matrix(w.reshape(1, -1), s_all))

def get_embeddings_batch(model, tokenizer, text_list, batch_size=32):
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id
    embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())
    return np.vstack(embeddings)

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def extract_keywords_fast(texts, num_keywords=50):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    keywords = []
    for row in tfidf_matrix:
        top_indices = np.argsort(row.toarray()[0])[-num_keywords:]
        keywords.extend([feature_names[idx] for idx in top_indices if feature_names[idx] not in stop_words])
        filtered_keywords = []
    for word in set(keywords):
        tagged_word = pos_tag([word])
        if tagged_word[0][1] in ['NN', 'NNS', 'JJ']:  # Keep only nouns and adjectives
            filtered_keywords.append(word)
    return filtered_keywords  # Ensure unique keywords

from sklearn.cluster import KMeans

def create_stem_word_sets(model, tokenizer, stem_keywords, non_stem_keywords, top_n=100):
    all_keywords = list(set(stem_keywords + non_stem_keywords))  # Flatten lists if needed
    word_embeddings = {word: get_embeddings_batch(model, tokenizer, [word])[0] for word in all_keywords if isinstance(word, str) and word.strip()}

    pca = PCA(n_components=10)
    reduced_embeddings = pca.fit_transform(list(word_embeddings.values()))

    clustering = KMeans(n_clusters=2, random_state=42)
    labels = clustering.fit_predict(reduced_embeddings)

    cluster_0 = [word for i, word in enumerate(all_keywords) if labels[i] == 0]
    cluster_1 = [word for i, word in enumerate(all_keywords) if labels[i] == 1]

    stem_words, non_stem_words = (cluster_0, cluster_1) if len(cluster_0) < len(cluster_1) else (cluster_1, cluster_0)

    return stem_words[:top_n], non_stem_words[:top_n]

male_terms = ["man", "male", "he", "him"]
female_terms = ["woman", "female", "she", "her"]

models = {"BERT": ("bert-base-uncased", AutoModel)}

data = pd.read_csv("/content/job_applicant_dataset_trial4.csv")

stem_jobs = ["Software Engineer", "Machine Learning Engineer", "AI Researcher", "Robotics Engineer", "Mechanical Engineer", "Cloud Architect", "Cybersecurity Analyst", "Web Developer", "Database Administrator", "Data Analyst", "Research Scientist", "Environmental Scientist", "Biomedical Engineer", "Physician", "Pharmacist", "Dentist", "Veterinarian", "Nurse", "Civil Engineer", "Systems Analyst", "AI Specialist", "SEO Specialist"]

stem_resumes = data[data['Job Roles'].isin(stem_jobs)]['Resume'].tolist()
non_stem_resumes = data[~data['Job Roles'].isin(stem_jobs)]['Resume'].tolist()

stem_keywords = extract_keywords_fast(stem_resumes)
non_stem_keywords = extract_keywords_fast(non_stem_resumes)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

stem_words, non_stem_words = create_stem_word_sets(model, tokenizer, stem_keywords, non_stem_keywords)

import random

def bootstrap_weat_test(model, tokenizer, male_terms, female_terms, stem_words, non_stem_words, num_samples=100):
    effect_sizes = []
    for _ in range(num_samples):
        random.shuffle(stem_words)
        random.shuffle(non_stem_words)
        stem_sample = stem_words[:50]
        non_stem_sample = non_stem_words[:50]

        stem_embeddings = get_embeddings_batch(model, tokenizer, stem_sample)
        non_stem_embeddings = get_embeddings_batch(model, tokenizer, non_stem_sample)
        avg_stem_embedding = np.mean(stem_embeddings, axis=0)
        avg_non_stem_embedding = np.mean(non_stem_embeddings, axis=0)

        effect_size = weat_effect_size_fast(avg_stem_embedding, male_embeddings, female_embeddings, stem_embeddings, non_stem_embeddings)
        effect_sizes.append(effect_size)
    return np.mean(effect_sizes), np.std(effect_sizes)

for model_name, (model_path, model_class) in models.items():
    print(f"Running WEAT for {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = model_class.from_pretrained(model_path)
    male_embeddings = get_embeddings_batch(model, tokenizer, male_terms)
    female_embeddings = get_embeddings_batch(model, tokenizer, female_terms)
    stem_embeddings = get_embeddings_batch(model, tokenizer, stem_words)
    non_stem_embeddings = get_embeddings_batch(model, tokenizer, non_stem_words)
    avg_stem_embedding = np.mean(stem_embeddings, axis=0)
    avg_non_stem_embedding = np.mean(non_stem_embeddings, axis=0)

    stem_male_effect_size = weat_effect_size_fast(avg_stem_embedding, male_embeddings, female_embeddings, stem_embeddings, non_stem_embeddings)
    stem_female_effect_size = weat_effect_size_fast(avg_stem_embedding, female_embeddings, male_embeddings, stem_embeddings, non_stem_embeddings)

    mean_effect, std_effect = bootstrap_weat_test(model, tokenizer, male_terms, female_terms, stem_words, non_stem_words)
    print(f"WEAT Effect Size (STEM vs. Male): {stem_male_effect_size} (±{std_effect})")
    print(f"WEAT Effect Size (STEM vs. Female): {stem_female_effect_size}")
    print("Top STEM Words:", stem_words[:20])
    print("Top Non-STEM Words:", non_stem_words[:20])
    print(f"WEAT Effect Size (STEM vs. Female): {stem_female_effect_size}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger_eng[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger_eng')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger_eng/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
