# Import Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import random
from tqdm import tqdm

# Pre-trained Embeddings

## Load embeddings

We load pre-trained embeddings of GLoVe [1] and Word2Vec [2] from their sources and save them in our memory. From [1] and [2] we observe that the optimal dimension for saving the embeddings is 300 (which is implemented in this code as well).

For GLoVe embeddings source, go [here](https://nlp.stanford.edu/projects/glove/) (You can directly download the embeddings by running the below cell).

For Word2Vec embeddings, download the zip file from [here](https://www.kaggle.com/datasets/pkugoodspeed/nlpword2vecembeddingspretrained) and store it in a folder named word2vec. The extraction and loading of embeddings will be done below.


### Load GLoVe

Before proceeding further, confirm if you have downloaded a total of 400000 word embeddings.

In [None]:
import numpy as np
import requests
import zipfile
import os
import pickle

def download_glove(dim=300, save_dir='/content/drive/MyDrive/Colab Notebooks/code/models/pre-trained/glove'):
    if dim not in [50, 100, 200, 300]:
        raise ValueError("Embedding dimension must be one of: 50, 100, 200, 300")

    os.makedirs(save_dir, exist_ok=True)

    url = "https://nlp.stanford.edu/data/glove.6B.zip"
    zip_path = os.path.join(save_dir, "glove.6B.zip")

    if not os.path.exists(zip_path):
        print(f"Downloading GloVe embeddings from {url}...")
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(zip_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Downloaded file to {zip_path}")
    else:
        print(f"File already exists at {zip_path}")

    embed_file = f"glove.6B.{dim}d.txt"
    extract_path = os.path.join(save_dir, embed_file)

    if not os.path.exists(extract_path):
        print(f"Extracting {embed_file}...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            matching_files = [f for f in zip_ref.namelist() if f.endswith(f"{dim}d.txt")]
            if not matching_files:
                raise ValueError(f"No embedding file found with dimension {dim}")

            zip_ref.extract(matching_files[0], save_dir)
            extracted_file = os.path.join(save_dir, matching_files[0])
            if extracted_file != extract_path:
                os.rename(extracted_file, extract_path)
        print(f"Extracted to {extract_path}")
    else:
        print(f"Embeddings already extracted at {extract_path}")

    embeddings = {}
    with open(extract_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            values = line.strip().split()
            word = values[0]
            vector = np.array(values[1:], dtype=np.float32)
            embeddings[word] = vector
            if (i + 1) % 10000 == 0:
                print(f"Loaded {i + 1} embeddings")

    print(f"Successfully loaded {len(embeddings)} embeddings.")

    pkl_path = os.path.join(save_dir, f"glove_6B_{dim}d.pkl")
    with open(pkl_path, 'wb') as f:
        pickle.dump(embeddings, f)
    print(f"Saved embeddings to {pkl_path}")

    return embeddings

if __name__ == "__main__":
    glove_embeddings = download_glove(dim=300)

Downloading GloVe embeddings from https://nlp.stanford.edu/data/glove.6B.zip...
Downloaded file to /content/drive/MyDrive/Colab Notebooks/code/models/pre-trained/glove/glove.6B.zip
Extracting glove.6B.300d.txt...
Extracted to /content/drive/MyDrive/Colab Notebooks/code/models/pre-trained/glove/glove.6B.300d.txt
Loaded 10000 embeddings
Loaded 20000 embeddings
Loaded 30000 embeddings
Loaded 40000 embeddings
Loaded 50000 embeddings
Loaded 60000 embeddings
Loaded 70000 embeddings
Loaded 80000 embeddings
Loaded 90000 embeddings
Loaded 100000 embeddings
Loaded 110000 embeddings
Loaded 120000 embeddings
Loaded 130000 embeddings
Loaded 140000 embeddings
Loaded 150000 embeddings
Loaded 160000 embeddings
Loaded 170000 embeddings
Loaded 180000 embeddings
Loaded 190000 embeddings
Loaded 200000 embeddings
Loaded 210000 embeddings
Loaded 220000 embeddings
Loaded 230000 embeddings
Loaded 240000 embeddings
Loaded 250000 embeddings
Loaded 260000 embeddings
Loaded 270000 embeddings
Loaded 280000 embeddi

### Load Word2Vec embeddings

There are a total of 3000000 word embeddings in Word2Vec, so confirm it before proceeding further.

In [None]:
import numpy as np
import struct
import os
import pickle
import zipfile

def load_word2vec_binary(file_path, save_path=None):
    embeddings = {}

    print(f"Loading embeddings from {file_path}")
    with open(file_path, 'rb') as f:
        header = f.readline().decode('utf-8').strip().split()
        vocab_size = int(header[0])
        vector_size = int(header[1])

        print(f"Vocabulary size: {vocab_size}, Vector dimension: {vector_size}")

        for i in range(vocab_size):
            if i % 10000 == 0:
                print(f"Loaded {i}/{vocab_size} word vectors")

            word = b''
            char = f.read(1)
            while char != b' ' and char != b'\n':
                word += char
                char = f.read(1)

            word = word.decode('utf-8', errors='ignore')
            vector = np.array(struct.unpack(f'{vector_size}f', f.read(vector_size * 4)), dtype=np.float32)
            embeddings[word] = vector

    print(f"Successfully loaded {len(embeddings)} word vectors")

    if save_path:
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print(f"Saving embeddings to {save_path}")
        with open(save_path, 'wb') as f:
            pickle.dump(embeddings, f)
        print("Embeddings saved successfully")

    return embeddings

def extract_word2vec_from_zip(zip_path, extract_dir=None):
    if extract_dir is None:
        extract_dir = os.path.dirname(zip_path)

    print(f"Extracting {zip_path} to {extract_dir}...")

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        file_list = zip_ref.namelist()
        print(f"Files in archive: {file_list}")

        bin_files = [f for f in file_list if f.endswith('.bin')]
        if not bin_files:
            raise ValueError("No .bin file found in the zip archive")

        bin_file = bin_files[0]
        print(f"Extracting {bin_file}...")

        zip_ref.extract(bin_file, extract_dir)

    bin_path = os.path.join(extract_dir, bin_file)
    print(f"Extracted to {bin_path}")

    return bin_path

if __name__ == "__main__":
    save_dir = '/content/drive/MyDrive/Colab Notebooks/code/models/pre-trained/word2vec'
    os.makedirs(save_dir, exist_ok=True)
    data_dir = '/content/drive/MyDrive/Colab Notebooks/code/models/pre-trained'
    os.makedirs(data_dir, exist_ok=True)

    zip_path = os.path.join(data_dir, "GoogleNews-vectors-negative300.bin.zip")
    bin_path = extract_word2vec_from_zip(zip_path, extract_dir=data_dir)
    save_path = os.path.join(save_dir, "word2vec_googleNews_300d.pkl")
    word2vec_embeddings = load_word2vec_binary(bin_path, save_path=save_path)
    print(f"Loaded and saved {len(word2vec_embeddings)} Word2Vec embeddings to Google Drive")

Extracting /content/drive/MyDrive/Colab Notebooks/code/models/pre-trained/GoogleNews-vectors-negative300.bin.zip to /content/drive/MyDrive/Colab Notebooks/code/models/pre-trained...
Files in archive: ['GoogleNews-vectors-negative300.bin']
Extracting GoogleNews-vectors-negative300.bin...
Extracted to /content/drive/MyDrive/Colab Notebooks/code/models/pre-trained/GoogleNews-vectors-negative300.bin
Loading embeddings from /content/drive/MyDrive/Colab Notebooks/code/models/pre-trained/GoogleNews-vectors-negative300.bin
Vocabulary size: 3000000, Vector dimension: 300
Loaded 0/3000000 word vectors
Loaded 10000/3000000 word vectors
Loaded 20000/3000000 word vectors
Loaded 30000/3000000 word vectors
Loaded 40000/3000000 word vectors
Loaded 50000/3000000 word vectors
Loaded 60000/3000000 word vectors
Loaded 70000/3000000 word vectors
Loaded 80000/3000000 word vectors
Loaded 90000/3000000 word vectors
Loaded 100000/3000000 word vectors
Loaded 110000/3000000 word vectors
Loaded 120000/3000000 wor

### Defining paths

Copy the paths of newly created.pkl files of GLoVe and Word2Vec embeddings in GLOVE_PATH and WORD2VEC_PATH respectively and define two variables to easily reference the paths in the future.

In [2]:
GLOVE_PATH = '/content/drive/MyDrive/Colab Notebooks/code/models/pre-trained/glove/glove_6B_300d.pkl'
WORD2VEC_PATH = '/content/drive/MyDrive/Colab Notebooks/code/models/pre-trained/word2vec/word2vec_googleNews_300d.pkl'

def load_embeddings(path):
    print(f"Loading embeddings from {path}...")
    with open(path, 'rb') as f:
        embeddings = pickle.load(f)
    print(f"Loaded {len(embeddings)} word vectors")
    return embeddings

glove = load_embeddings(GLOVE_PATH)
word2vec = load_embeddings(WORD2VEC_PATH)

Loading embeddings from /content/drive/MyDrive/Colab Notebooks/code/models/pre-trained/glove/glove_6B_300d.pkl...
Loaded 400000 word vectors
Loading embeddings from /content/drive/MyDrive/Colab Notebooks/code/models/pre-trained/word2vec/word2vec_googleNews_300d.pkl...
Loaded 3000000 word vectors


### Inspect embeddings

You can load the path of embeddings file in last line of below cell to inspect its structure and get a better idea of shape and size of the file.

In [3]:
import pickle
import numpy as np

def inspect_pkl_embeddings(filepath):
    with open(filepath, 'rb') as f:
        embeddings = pickle.load(f)

    sample_word, sample_vector = next(iter(embeddings.items()))

    print(f"Total embeddings: {len(embeddings)}")
    print(f"Vector dimension: {sample_vector.shape[0]}")
    print(f"Sample word: '{sample_word}' → Vector shape: {sample_vector.shape}")

inspect_pkl_embeddings("/content/drive/MyDrive/Colab Notebooks/code/models/pre-trained/word2vec/word2vec_googleNews_300d.pkl")

Total embeddings: 3000000
Vector dimension: 300
Sample word: '</s>' → Vector shape: (300,)


# Evaluation

## Helper functions

We define a few helper functions to carry out essential tasks needed for further analysis, so that we don't have to code each function separately every time.

In [4]:
'''
Find and return the vector representation of each word from the pre-trained embeddings dictionary
'''
def get_word_vector(word, embeddings):
    if word in embeddings:
        return embeddings[word]
    if word.lower() in embeddings:
        return embeddings[word.lower()]
    if word.capitalize() in embeddings:
        return embeddings[word.capitalize()]
    if word.upper() in embeddings:
        return embeddings[word.upper()]
    return None

In [5]:
'''
Measure the cosine similarity between two vectors to measure similarity between them
'''
def cosine_sim(vec1, vec2):
    if vec1 is None or vec2 is None:
        return None
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:
        return 0
    return np.dot(vec1, vec2) / (norm1 * norm2)

In [6]:
'''
Checks the words against a given embedding dictionary to identify what words are missing ('Out of Vocabulary' words).
'''
def check_vocabulary(word_list, embeddings, embedding_name):
    missing = [word for word in word_list if get_word_vector(word, embeddings) is None]
    if missing:
        print(f"Words missing in {embedding_name}: {missing}")
    return len(word_list) - len(missing)

## WEAT test



*   Word Embedding Association Test (WEAT) is a test to examine the associations in word embeddings between concepts (like bias) [3]. WEAT uses the Implicit Association Test (IAT) from psychology to word embeddings, providing a statistical framework to detect bias [4].
*   With the help of [5], the following test is done for two sets of target words (X,Y) and attribute words (A,B) and effect_size (which measures bias magnitude) and p-value (statistical significance) are computed.

*   The format 'Target, Attribute' is chosen to facilitate easy evaluation of how strongly the concepts(gender specific roles) are associated with its attributes (profession, feeling etc).









### WEAT test function

In [None]:
def weat_test(X, Y, A, B, embeddings):
    X_valid = [x for x in X if get_word_vector(x, embeddings) is not None]
    Y_valid = [y for y in Y if get_word_vector(y, embeddings) is not None]
    A_valid = [a for a in A if get_word_vector(a, embeddings) is not None]
    B_valid = [b for b in B if get_word_vector(b, embeddings) is not None]

    if len(X_valid) == 0 or len(Y_valid) == 0 or len(A_valid) == 0 or len(B_valid) == 0:
        return None, None

    # association between each target word and attribute sets
    def association(w, A, B):
        w_vec = get_word_vector(w, embeddings)
        A_sims = [cosine_sim(w_vec, get_word_vector(a, embeddings)) for a in A_valid]
        B_sims = [cosine_sim(w_vec, get_word_vector(b, embeddings)) for b in B_valid]
        return np.mean(A_sims) - np.mean(B_sims)

    X_associations = [association(x, A_valid, B_valid) for x in X_valid]
    Y_associations = [association(y, A_valid, B_valid) for y in Y_valid]
    test_statistic = np.mean(X_associations) - np.mean(Y_associations)

    # effect size
    std_dev = np.std(X_associations + Y_associations)
    effect_size = test_statistic / std_dev if std_dev > 0 else 0

    all_targets = X_valid + Y_valid
    size_X = len(X_valid)
    n_samples = 1000
    permutation_scores = []

    for _ in range(n_samples):
        np.random.shuffle(all_targets)
        X_i = all_targets[:size_X]
        Y_i = all_targets[size_X:]
        X_i_associations = [association(x, A_valid, B_valid) for x in X_i]
        Y_i_associations = [association(y, A_valid, B_valid) for y in Y_i]
        permutation_scores.append(np.mean(X_i_associations) - np.mean(Y_i_associations))

    # p-value
    p_value = np.mean([score >= test_statistic for score in permutation_scores])

    return effect_size, p_value

### WEAT test sets for selected biases - Gender and Racial

In [None]:
# Define WEAT test sets for gender
weat_gender = {
    'target': {
        'X': ['man', 'boy', 'father', 'male', 'gentleman', 'son', 'he', 'his', 'him'],
        'Y': ['woman', 'girl', 'mother', 'female', 'lady', 'daughter', 'she', 'hers', 'her']
    },
    'attribute': {
        'A': ['math', 'algebra', 'geometry', 'calculus', 'equations', 'computation', 'numbers', 'addition'],
        'B': ['literature', 'arts', 'poetry', 'dance', 'literature', 'novel', 'symphony', 'drama']
    }
}

In [None]:
# Define WEAT test sets for race
weat_race = {
    'target': {
        'X': ['european', 'caucasian', 'white'],
        'Y': ['african', 'black']
    },
    'attribute': {
        'A': ['pleasant', 'nice', 'good', 'wonderful', 'excellent', 'happy', 'joy'],
        'B': ['unpleasant', 'bad', 'terrible', 'horrible', 'awful', 'sad', 'anger']
    }
}

In [None]:
gender_targets = {
    'male_terms': ['male', 'man', 'boy', 'brother', 'he', 'father', 'son', 'uncle'],
    'female_terms': ['female', 'woman', 'girl', 'sister', 'she', 'mother', 'daughter', 'aunt']
}

gender_attributes = {
    'career': ['executive', 'management', 'professional', 'salary', 'office', 'boss', 'business'],
    'family': ['home', 'parents', 'children', 'family', 'marriage', 'wedding', 'kitchen']
}

racial_targets = {
    'european_american': ['Adam', 'Harry', 'Roger', 'Alan', 'Ryan', 'Neil', 'Brad'],
    'african_american': ['Darnell', 'Hakim', 'Jermaine', 'Tyrone', 'Leroy', 'Rasheed', 'DeShawn']
}

racial_attributes = {
    'pleasant': ['joy', 'love', 'peace', 'wonderful', 'pleasure', 'happy', 'laughter'],
    'unpleasant': ['agony', 'terrible', 'horrible', 'nasty', 'evil', 'war', 'awful']
}

### Running the test

We call the WEAT test function and run bias test for Gender and Race separately. We evaluate the bias for these two features on the pre-trained embeddings of GLoVe and Word2vec.

In [None]:
def run_weat_tests(embeddings, name):
    print(f"\nRunning WEAT tests on {name}:")

    # Gender test
    effect_size, p_value = weat_test(
        weat_gender['target']['X'],
        weat_gender['target']['Y'],
        weat_gender['attribute']['A'],
        weat_gender['attribute']['B'],
        embeddings
    )
    print(f"Gender bias (Career vs Family): Effect size = {effect_size:.4f}, p-value = {p_value:.4f}")

    # Race test
    effect_size, p_value = weat_test(
        weat_race['target']['X'],
        weat_race['target']['Y'],
        weat_race['attribute']['A'],
        weat_race['attribute']['B'],
        embeddings
    )
    print(f"Racial bias (Pleasant vs Unpleasant): Effect size = {effect_size:.4f}, p-value = {p_value:.4f}")

if __name__ == "__main__":
    run_weat_tests(word2vec, 'Word2Vec')
    run_weat_tests(glove, 'GloVe')


Running WEAT tests on Word2Vec:
Gender bias (Career vs Family): Effect size = 1.2232, p-value = 0.0020
Racial bias (Pleasant vs Unpleasant): Effect size = 1.5022, p-value = 0.1070

Running WEAT tests on GloVe:
Gender bias (Career vs Family): Effect size = 0.7009, p-value = 0.0720
Racial bias (Pleasant vs Unpleasant): Effect size = 0.0913, p-value = 0.6170


## Direct bias

We measure direct bias in word embeddings by projecting specific words onto gender direction here. This approach is based on the paper [4] (Bolukbasi et al.) where we calculate gender direction and normalize it, select few words to evaluate the bias for each category.

A projection is defined as the dot product with the calculated gender direction.

**How direct bias works** ? We define a custom bias direction in the vector space and measure how strongly the defined words project onto this direction.

### Gender Bias

We use direct bias to first define a gender direction using word pairs, calculate a gender direction and then calculate bias scores by averaging the gender direction [9][10].

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

def measure_direct_bias(embeddings, name):
    # define gender direction using word pairs
    gender_pairs = [
        ('he', 'she'),
        ('man', 'woman'),
        ('father', 'mother'),
        ('boy', 'girl'),
        ('son', 'daughter'),
        ('husband', 'wife'),
        ('gentleman', 'lady'),
        ('uncle', 'aunt')
    ]

    # calculate gender direction
    gender_vectors = []
    for male, female in gender_pairs:
        male_vec = get_word_vector(male, embeddings)
        female_vec = get_word_vector(female, embeddings)

        if male_vec is not None and female_vec is not None:
            male_vec = male_vec / np.linalg.norm(male_vec) # normalize vectors
            female_vec = female_vec / np.linalg.norm(female_vec)
            gender_vectors.append(male_vec - female_vec)

    if not gender_vectors:
        print("Could not calculate gender direction: no valid gender pairs found")
        return None

    # average gender direction
    gender_direction = np.mean(gender_vectors, axis=0)
    gender_direction = gender_direction / np.linalg.norm(gender_direction)

    # selecting few words to evaluate for bias
    profession_words = [
        'doctor', 'nurse', 'engineer', 'teacher', 'programmer', 'artist',
        'scientist', 'writer', 'ceo', 'assistant', 'manager', 'secretary',
        'carpenter', 'chef', 'designer', 'accountant', 'lawyer', 'banker',
        'receptionist', 'journalist', 'professor', 'researcher', 'pilot', 'dancer'
    ]

    # calculate direct bias
    results = []
    for word in profession_words:
        vec = get_word_vector(word, embeddings)
        if vec is not None:
            vec = vec / np.linalg.norm(vec)
            bias = np.dot(vec, gender_direction) # project onto gender direction
            results.append({'word': word, 'bias': bias})

    if not results:
        print("No valid profession words found for direct bias calculation")
        return None

    df = pd.DataFrame(results)
    df = df.sort_values(by='bias', ascending=False)

    print("\nMost male-associated professions:") # output most biased words
    for _, row in df.head(5).iterrows():
        print(f"  {row['word']}: {row['bias']:.4f}")

    print("\nMost female-associated professions:")
    for _, row in df.tail(5).iloc[::-1].iterrows():
        print(f"  {row['word']}: {row['bias']:.4f}")

    avg_abs_bias = df['bias'].abs().mean()
    print(f"\nAverage absolute direct bias: {avg_abs_bias:.4f}")

    plt.figure(figsize=(12, 8))
    colors = ['blue' if x > 0 else 'red' for x in df['bias']]
    bars = plt.bar(df['word'], df['bias'], color=colors)
    plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    plt.xlabel('Profession')
    plt.ylabel('Direct Bias (projection onto gender direction)')
    plt.title(f'Direct Gender Bias in {name} Embeddings')
    plt.xticks(rotation=45, ha='right')
    plt.legend([plt.Rectangle((0,0),1,1,color='blue'),
                plt.Rectangle((0,0),1,1,color='red')],
               ['Male-associated', 'Female-associated'])
    plt.tight_layout()
    plt.savefig(f'direct_bias_{name}.png')
    plt.close()
    return df

In [None]:
if __name__ == "__main__":
    glove_bias = measure_direct_bias(glove, 'GloVe')
    word2vec_bias = measure_direct_bias(word2vec, 'Word2Vec')

    if glove_bias is not None and word2vec_bias is not None:
        merged = pd.merge(
            glove_bias.rename(columns={'bias': 'glove_bias'}),
            word2vec_bias.rename(columns={'bias': 'word2vec_bias'}),
            on='word'
        )

        correlation = np.corrcoef(merged['glove_bias'], merged['word2vec_bias'])[0, 1]
        print(f"\nCorrelation between GloVe and Word2Vec direct bias: {correlation:.4f}")

        plt.figure(figsize=(10, 8))
        plt.scatter(merged['glove_bias'], merged['word2vec_bias'], alpha=0.7)

        min_val = min(merged['glove_bias'].min(), merged['word2vec_bias'].min())
        max_val = max(merged['glove_bias'].max(), merged['word2vec_bias'].max())
        plt.plot([min_val, max_val], [min_val, max_val], 'k--', alpha=0.5)

        for _, row in merged.iterrows():
            plt.annotate(row['word'],
                         (row['glove_bias'], row['word2vec_bias']),
                         xytext=(5, 5), textcoords='offset points')

        plt.xlabel('GloVe Direct Bias')
        plt.ylabel('Word2Vec Direct Bias')
        plt.title('Comparison of Direct Gender Bias Between Embeddings')
        plt.axhline(y=0, color='gray', linestyle='-', alpha=0.3)
        plt.axvline(x=0, color='gray', linestyle='-', alpha=0.3)
        plt.grid(alpha=0.3)
        plt.tight_layout()
        plt.savefig('direct_bias_comparison.png')
        plt.close()



Most male-associated professions:
  engineer: 0.2168
  programmer: 0.1506
  banker: 0.1381
  pilot: 0.1376
  ceo: 0.1205

Most female-associated professions:
  nurse: -0.2720
  receptionist: -0.2309
  dancer: -0.1845
  researcher: -0.0962
  teacher: -0.0705

Average absolute direct bias: 0.0918

Most male-associated professions:
  carpenter: 0.2207
  engineer: 0.1882
  ceo: 0.0723
  banker: 0.0652
  manager: 0.0513

Most female-associated professions:
  nurse: -0.3150
  receptionist: -0.2902
  dancer: -0.2148
  teacher: -0.1628
  designer: -0.0722

Average absolute direct bias: 0.0887

Correlation between GloVe and Word2Vec direct bias: 0.9006


### Racial Bias

We use Direct bias to first create a racial direction and then normalize the direction vectors to determine the projection scores [11]. Positive scores indicate association with one racial group while negative scores indicate association with other racial group.

In [10]:
def measure_racial_direct_bias(embeddings, name):
    print(f"\nMeasuring direct racial bias in {name} embeddings:")
    # define racial direction using pairs
    race_pairs = [
        ('white', 'black'),
        ('caucasian', 'african'),
        ('european', 'african-american'),
        ('white', 'asian'),
        ('caucasian', 'hispanic'),
        ('european', 'latino')
    ]

    race_vectors = [] # calculate racial direction

    print("Building racial direction vector...")
    for race1, race2 in race_pairs:
        race1_vec = get_word_vector(race1, embeddings)
        race2_vec = get_word_vector(race2, embeddings)

        if race1_vec is not None and race2_vec is not None:
            race1_vec = race1_vec / np.linalg.norm(race1_vec) # normalize vectors
            race2_vec = race2_vec / np.linalg.norm(race2_vec)
            race_vectors.append(race1_vec - race2_vec)
            print(f"  Added direction: {race1} - {race2}")

    if not race_vectors:
        print("Could not calculate racial direction: no valid race pairs found")
        return None

    racial_direction = np.mean(race_vectors, axis=0)
    racial_direction = racial_direction / np.linalg.norm(racial_direction)

    eval_words = [
        'intelligent', 'smart', 'educated', 'skilled', 'capable', 'competent',
        'hardworking', 'ambitious', 'successful', 'wealthy', 'rich',
        'poor', 'criminal', 'dangerous', 'violent', 'lazy', 'uneducated', 'welfare',
        'professional', 'leader', 'doctor', 'lawyer', 'scientist', 'engineer',
        'teacher', 'nurse', 'athlete', 'musician', 'artist', 'journalist',
        'police', 'chef', 'worker', 'janitor', 'assistant', 'manager',
        'trustworthy', 'honest', 'religious', 'family', 'community', 'urban',
        'rural', 'friendly', 'aggressive', 'loud', 'quiet', 'disciplined'
    ]

    results = []
    for word in eval_words:
        vec = get_word_vector(word, embeddings)
        if vec is not None:
            vec = vec / np.linalg.norm(vec)
            bias = np.dot(vec, racial_direction)
            results.append({'word': word, 'bias': bias})

    if not results:
        print("No valid evaluation words found for direct bias calculation")
        return None

    df = pd.DataFrame(results)
    df = df.sort_values(by='bias', ascending=False)

    print("\nWords most associated with White/European (positive projection):")
    for _, row in df.head(10).iterrows():
        print(f"  {row['word']}: {row['bias']:.4f}")

    print("\nWords most associated with Black/African American groups (negative projection):")
    for _, row in df.tail(10).iloc[::-1].iterrows():
        print(f"  {row['word']}: {row['bias']:.4f}")

    avg_abs_bias = df['bias'].abs().mean()
    print(f"\nAverage absolute direct racial bias: {avg_abs_bias:.4f}")

    # categorize words for visualization
    categories = {
        'Positive Attributes': ['intelligent', 'smart', 'educated', 'skilled', 'capable',
                                'competent', 'hardworking', 'ambitious', 'successful',
                                'wealthy', 'rich'],
        'Negative Attributes': ['poor', 'criminal', 'dangerous', 'violent', 'lazy',
                               'uneducated', 'welfare'],
        'Professions': ['professional', 'leader', 'doctor', 'lawyer', 'scientist',
                        'engineer', 'teacher', 'nurse', 'athlete', 'musician', 'artist',
                        'journalist', 'police', 'chef', 'worker', 'janitor', 'assistant',
                        'manager'],
        'Personality': ['trustworthy', 'honest', 'religious', 'family', 'community',
                        'urban', 'rural', 'friendly', 'aggressive', 'loud', 'quiet',
                        'disciplined']
    }

    df['category'] = 'Other'
    for category, words in categories.items():
        df.loc[df['word'].isin(words), 'category'] = category

    plt.figure(figsize=(14, 10))
    sns.set_palette("Set1") # plots bias values by category
    ax = sns.barplot(x='word', y='bias', hue='category', data=df)
    plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    plt.xlabel('Word')
    plt.ylabel('Direct Racial Bias')
    plt.title(f'Direct Racial Bias in {name} Embeddings')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(f'direct_racial_bias_{name}.png')
    plt.close()

    # plots average bias by category
    category_avg = df.groupby('category')['bias'].mean().reset_index()
    category_std = df.groupby('category')['bias'].std().reset_index()

    plt.figure(figsize=(10, 6))
    bars = plt.bar(category_avg['category'], category_avg['bias'])

    plt.errorbar(x=category_avg['category'], y=category_avg['bias'],
                 yerr=category_std['bias'], fmt='none', capsize=5, color='black')

    for i, bar in enumerate(bars):
        if category_avg['bias'].iloc[i] > 0:
            bar.set_color('darkorange')
        else:
            bar.set_color('steelblue')

    plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    plt.title(f'Average Racial Bias by Category in {name} Embeddings')
    plt.ylabel('Average Bias (+ = White/European Association)')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(f'racial_bias_by_category_{name}.png')
    plt.close()
    return df

In [11]:
def compare_direct_racial_bias(glove_bias, word2vec_bias):
    if glove_bias is None or word2vec_bias is None:
        print("Cannot compare biases: one or both bias measurements are missing")
        return

    merged = pd.merge(
        glove_bias[['word', 'bias', 'category']].rename(columns={'bias': 'glove_bias'}),
        word2vec_bias[['word', 'bias']].rename(columns={'bias': 'word2vec_bias'}),
        on='word', how='inner'
    )

    # calculate correlation
    correlation = np.corrcoef(merged['glove_bias'], merged['word2vec_bias'])[0, 1]
    print(f"\nCorrelation between GloVe and Word2Vec direct racial bias: {correlation:.4f}")

    plt.figure(figsize=(12, 10))

    # plot by category
    categories = merged['category'].unique()
    colors = sns.color_palette("Set1", len(categories))

    for i, category in enumerate(categories):
        category_data = merged[merged['category'] == category]
        plt.scatter(
            category_data['glove_bias'],
            category_data['word2vec_bias'],
            label=category,
            color=colors[i],
            alpha=0.7,
            s=70
        )

    m, b = np.polyfit(merged['glove_bias'], merged['word2vec_bias'], 1)
    x_range = np.array([merged['glove_bias'].min(), merged['glove_bias'].max()])
    plt.plot(x_range, m*x_range + b, 'k--', alpha=0.7, linewidth=2)

    min_val = min(merged['glove_bias'].min(), merged['word2vec_bias'].min())
    max_val = max(merged['glove_bias'].max(), merged['word2vec_bias'].max())
    plt.plot([min_val, max_val], [min_val, max_val], 'k-', alpha=0.3)

    for i, row in merged.iterrows():
        if (abs(row['glove_bias']) > 0.15 or
            abs(row['word2vec_bias']) > 0.15 or
            abs(row['glove_bias'] - row['word2vec_bias']) > 0.1):
            plt.annotate(row['word'],
                        (row['glove_bias'], row['word2vec_bias']),
                        xytext=(5, 5), textcoords='offset points',
                        fontsize=9)

    plt.xlabel('GloVe Direct Racial Bias')
    plt.ylabel('Word2Vec Direct Racial Bias')
    plt.title('Comparison of Direct Racial Bias Between Embeddings')
    plt.axhline(y=0, color='gray', linestyle='-', alpha=0.3)
    plt.axvline(x=0, color='gray', linestyle='-', alpha=0.3)
    plt.grid(alpha=0.3)
    plt.legend(title="Category")
    plt.tight_layout()
    plt.savefig('direct_racial_bias_comparison.png')
    plt.close()

    # Finding which embedding shows stronger bias for each word
    merged['abs_glove'] = np.abs(merged['glove_bias'])
    merged['abs_word2vec'] = np.abs(merged['word2vec_bias'])
    merged['stronger_bias'] = np.where(
        merged['abs_glove'] > merged['abs_word2vec'],
        'GloVe',
        'Word2Vec'
    )

    # count by category
    bias_by_category = merged.groupby(['category', 'stronger_bias']).size().unstack()
    print("\nWords with stronger bias by category and embedding type:")
    print(bias_by_category)

    # calculate agreement in bias direction
    merged['same_direction'] = np.sign(merged['glove_bias']) == np.sign(merged['word2vec_bias'])
    agreement = merged['same_direction'].mean() * 100
    print(f"\nPercent of words with same bias direction in both embeddings: {agreement:.1f}%")

    # calculate agreements by category
    agreement_by_category = merged.groupby('category')['same_direction'].mean() * 100
    print("\nAgreement in bias direction by category:")
    for category, agreement in agreement_by_category.items():
        print(f"  {category}: {agreement:.1f}%")

    return merged

In [12]:
if __name__ == "__main__":
  glove_bias = measure_racial_direct_bias(glove, 'GloVe')
  word2vec_bias = measure_racial_direct_bias(word2vec, 'Word2Vec')
  comparison = compare_direct_racial_bias(glove_bias, word2vec_bias)
  print("\nRacial bias evaluation completed successfully!")


Measuring direct racial bias in GloVe embeddings:
Building racial direction vector...
  Added direction: white - black
  Added direction: caucasian - african
  Added direction: european - african-american
  Added direction: white - asian
  Added direction: caucasian - hispanic
  Added direction: european - latino

Words most associated with White/European (positive projection):
  friendly: 0.0938
  dangerous: 0.0455
  aggressive: 0.0455
  quiet: 0.0310
  manager: 0.0231
  leader: 0.0144
  loud: 0.0081
  engineer: 0.0069
  lazy: 0.0057
  criminal: 0.0006

Words most associated with Black/African American groups (negative projection):
  community: -0.2328
  urban: -0.2295
  uneducated: -0.1894
  rural: -0.1695
  musician: -0.1668
  poor: -0.1535
  skilled: -0.1443
  hardworking: -0.1357
  successful: -0.1136
  artist: -0.1135

Average absolute direct racial bias: 0.0724

Measuring direct racial bias in Word2Vec embeddings:
Building racial direction vector...
  Added direction: white - b

## t-SNE

*  t-distributed Stochastic Neighbor Embedding, is an algorithm used for visualizing high-dimensional data by reducing it to two (in this case) dimensions while preserving the relationships between data points.
*   We perform t-SNE to understand the clustering patterns between words that indicate bias, showing how certain words cluster with gender-biased/racial-biased professions or attributes.



In [None]:
def cluster_analysis_tsne(embeddings, target_words, title, filename):
    word_vectors = []
    labels = []
    categories = []

    for category, words in target_words.items():
        for word in words:
            vector = get_word_vector(word, embeddings)
            if vector is not None:
                word_vectors.append(vector)
                labels.append(word)
                categories.append(category)

    if len(word_vectors) < 5:
        print(f"Not enough words found in embeddings for {title}")
        return

    # t-SNE
    word_vectors = np.array(word_vectors)
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(word_vectors)-1))
    reduced_vectors = tsne.fit_transform(word_vectors)

    # create DataFrame for plotting
    df = pd.DataFrame({
        'x': reduced_vectors[:, 0],
        'y': reduced_vectors[:, 1],
        'word': labels,
        'category': categories
    })

    plt.figure(figsize=(12, 10))

    palette = sns.color_palette("husl", len(target_words))
    categories_list = list(target_words.keys())
    color_dict = {cat: palette[i] for i, cat in enumerate(categories_list)}

    for category in categories_list:
        category_df = df[df['category'] == category]
        plt.scatter(category_df['x'], category_df['y'], label=category,
                   color=color_dict[category], alpha=0.7)

    for i, row in df.iterrows():
        plt.annotate(row['word'], (row['x'], row['y']), fontsize=9,
                    xytext=(5, 5), textcoords='offset points')

    plt.title(title)
    plt.legend()
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()

# define word sets for cluster analysis
gender_occupation_words = {
    'male': ['man', 'boy', 'father', 'brother', 'son', 'husband', 'uncle', 'grandfather'],
    'female': ['woman', 'girl', 'mother', 'sister', 'daughter', 'wife', 'aunt', 'grandmother'],
    'stem': ['scientist', 'engineer', 'mathematician', 'programmer', 'researcher', 'doctor', 'physicist', 'chemist'],
    'humanities': ['artist', 'writer', 'poet', 'dancer', 'singer', 'actor', 'designer', 'philosopher'],
    'leadership': ['ceo', 'boss', 'executive', 'director', 'manager', 'supervisor', 'leader', 'president']
}

race_attribute_words = {
    'white': ['european', 'caucasian', 'white', 'western', 'anglo'],
    'black': ['african', 'black', 'afro'],
    'asian': ['asian', 'chinese', 'japanese', 'korean', 'eastern'],
    'positive': ['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior'],
    'negative': ['bad', 'awful', 'terrible', 'negative', 'unfortunate', 'wrong', 'inferior']
}

In [None]:
def run_cluster_analysis(embeddings, name):
    print(f"\nRunning t-SNE cluster analysis on {name}:")

    # Gender-occupation analysis
    cluster_analysis_tsne(
        embeddings,
        gender_occupation_words,
        f'Gender, Occupation & Leadership Clusters in {name}',
        f'tsne_gender_occupation_{name}.png'
    )

    # Race-attribute analysis
    cluster_analysis_tsne(
        embeddings,
        race_attribute_words,
        f'Race & Attribute Clusters in {name}',
        f'tsne_race_attribute_{name}.png'
    )

if __name__=="__main__":
  run_cluster_analysis(word2vec, 'Word2Vec')
  run_cluster_analysis(glove, 'GLoVe')


Running t-SNE cluster analysis on Word2Vec:

Running t-SNE cluster analysis on GLoVe:


## Cross Embedding Comparison

#### Gender Bias

In [None]:
def compare_embeddings(glove_embeddings, word2vec_embeddings):
    gender_pairs = [
        ('he', 'she'),
        ('man', 'woman'),
        ('father', 'mother'),
        ('boy', 'girl'),
        ('son', 'daughter'),
        ('husband', 'wife'),
        ('gentleman', 'lady'),
        ('uncle', 'aunt')
    ]

    profession_words = [
        'doctor', 'nurse', 'engineer', 'teacher', 'programmer', 'artist',
        'scientist', 'writer', 'ceo', 'assistant', 'manager', 'secretary'
    ]

    # gender bias for each profession across embedding types
    results = []

    for profession in profession_words:
        glove_gender_biases = []
        word2vec_gender_biases = []

        for male, female in gender_pairs:
            prof_vec_glove = get_word_vector(profession, glove_embeddings)
            male_vec_glove = get_word_vector(male, glove_embeddings)
            female_vec_glove = get_word_vector(female, glove_embeddings)

            if prof_vec_glove is not None and male_vec_glove is not None and female_vec_glove is not None:
                male_sim_glove = cosine_sim(prof_vec_glove, male_vec_glove)
                female_sim_glove = cosine_sim(prof_vec_glove, female_vec_glove)
                glove_gender_biases.append(male_sim_glove - female_sim_glove)

            prof_vec_w2v = get_word_vector(profession, word2vec_embeddings)
            male_vec_w2v = get_word_vector(male, word2vec_embeddings)
            female_vec_w2v = get_word_vector(female, word2vec_embeddings)

            if prof_vec_w2v is not None and male_vec_w2v is not None and female_vec_w2v is not None:
                male_sim_w2v = cosine_sim(prof_vec_w2v, male_vec_w2v)
                female_sim_w2v = cosine_sim(prof_vec_w2v, female_vec_w2v)
                word2vec_gender_biases.append(male_sim_w2v - female_sim_w2v)

        if glove_gender_biases and word2vec_gender_biases:
            results.append({
                'profession': profession,
                'glove_bias': np.mean(glove_gender_biases),
                'word2vec_bias': np.mean(word2vec_gender_biases)
            })

    if results:
        df = pd.DataFrame(results)

        plt.figure(figsize=(12, 8))

        x = np.arange(len(df))
        width = 0.35

        plt.bar(x - width/2, df['glove_bias'], width, label='GloVe')
        plt.bar(x + width/2, df['word2vec_bias'], width, label='Word2Vec')

        plt.axhline(y=0, color='r', linestyle='--')
        plt.xlabel('Profession')
        plt.ylabel('Gender Bias (Male - Female Similarity)')
        plt.title('Comparison of Gender Bias Across Embedding Types')
        plt.xticks(x, df['profession'], rotation=45)
        plt.legend()
        plt.tight_layout()
        plt.savefig('embedding_comparison.png')
        plt.close()

        correlation = np.corrcoef(df['glove_bias'], df['word2vec_bias'])[0, 1]
        print(f"Correlation between GloVe and Word2Vec gender biases: {correlation:.4f}")

        print("\nMost male-biased professions:")
        for embed_type in ['glove_bias', 'word2vec_bias']:
            top_male_biased = df.sort_values(by=embed_type, ascending=False).head(3)
            embed_name = 'GloVe' if embed_type == 'glove_bias' else 'Word2Vec'
            print(f"\n{embed_name}:")
            for _, row in top_male_biased.iterrows():
                print(f"  {row['profession']}: {row[embed_type]:.4f}")

        print("\nMost female-biased professions:")
        for embed_type in ['glove_bias', 'word2vec_bias']:
            top_female_biased = df.sort_values(by=embed_type, ascending=True).head(3)
            embed_name = 'GloVe' if embed_type == 'glove_bias' else 'Word2Vec'
            print(f"\n{embed_name}:")
            for _, row in top_female_biased.iterrows():
                print(f"  {row['profession']}: {row[embed_type]:.4f}")

        glove_avg_bias = df['glove_bias'].abs().mean()
        word2vec_avg_bias = df['word2vec_bias'].abs().mean()
        print(f"\nAverage absolute gender bias:")
        print(f"GloVe: {glove_avg_bias:.4f}")
        print(f"Word2Vec: {word2vec_avg_bias:.4f}")

        stronger_bias_counts = {'GloVe': 0, 'Word2Vec': 0, 'Equal': 0}
        for _, row in df.iterrows():
            glove_abs = abs(row['glove_bias'])
            word2vec_abs = abs(row['word2vec_bias'])

            if abs(glove_abs - word2vec_abs) < 0.01:
                stronger_bias_counts['Equal'] += 1
            elif glove_abs > word2vec_abs:
                stronger_bias_counts['GloVe'] += 1
            else:
                stronger_bias_counts['Word2Vec'] += 1

        print("\nWhich embedding shows stronger bias:")
        for embed, count in stronger_bias_counts.items():
            print(f"{embed}: {count} professions")

    else:
        print("No results to display. Check if words exist in both embeddings.")

    return df if results else None

if __name__=="__main__":
  compare_embeddings(glove, word2vec)

Correlation between GloVe and Word2Vec gender biases: 0.9506

Most male-biased professions:

GloVe:
  engineer: 0.1175
  programmer: 0.0816
  ceo: 0.0653

Word2Vec:
  engineer: 0.0936
  ceo: 0.0359
  manager: 0.0255

Most female-biased professions:

GloVe:
  nurse: -0.1474
  teacher: -0.0382
  writer: -0.0026

Word2Vec:
  nurse: -0.1566
  teacher: -0.0809
  artist: -0.0248

Average absolute gender bias:
GloVe: 0.0502
Word2Vec: 0.0421

Which embedding shows stronger bias:
GloVe: 5 professions
Word2Vec: 4 professions
Equal: 3 professions


#### Racial bias

In [None]:
def compare_racial_bias(glove_embeddings, word2vec_embeddings):
    print("\nComparing racial bias across embedding types:")

    race_pairs = [
        ('white', 'black'),
        ('caucasian', 'african'),
        ('european', 'african-american'),
        ('white', 'asian'),
        ('caucasian', 'hispanic'),
        ('european', 'latino')
    ]

    attribute_words = [
        'intelligent', 'smart', 'educated', 'skilled', 'capable', 'competent',
        'hardworking', 'ambitious', 'successful', 'wealthy', 'rich', 'poor',
        'criminal', 'dangerous', 'violent', 'lazy', 'uneducated', 'welfare',
        'professional', 'leader', 'doctor', 'lawyer', 'scientist', 'engineer',
        'trustworthy', 'honest', 'religious', 'athletic', 'musical', 'artistic'
    ]

    results = []

    for attribute in attribute_words:
        glove_race_biases = []
        word2vec_race_biases = []

        for race1, race2 in race_pairs:
            attr_vec_glove = get_word_vector(attribute, glove_embeddings)
            race1_vec_glove = get_word_vector(race1, glove_embeddings)
            race2_vec_glove = get_word_vector(race2, glove_embeddings)

            if attr_vec_glove is not None and race1_vec_glove is not None and race2_vec_glove is not None:
                race1_sim_glove = cosine_sim(attr_vec_glove, race1_vec_glove)
                race2_sim_glove = cosine_sim(attr_vec_glove, race2_vec_glove)
                glove_race_biases.append(race1_sim_glove - race2_sim_glove)

            attr_vec_w2v = get_word_vector(attribute, word2vec_embeddings)
            race1_vec_w2v = get_word_vector(race1, word2vec_embeddings)
            race2_vec_w2v = get_word_vector(race2, word2vec_embeddings)

            if attr_vec_w2v is not None and race1_vec_w2v is not None and race2_vec_w2v is not None:
                race1_sim_w2v = cosine_sim(attr_vec_w2v, race1_vec_w2v)
                race2_sim_w2v = cosine_sim(attr_vec_w2v, race2_vec_w2v)
                word2vec_race_biases.append(race1_sim_w2v - race2_sim_w2v)

        if glove_race_biases and word2vec_race_biases:
            results.append({
                'attribute': attribute,
                'glove_bias': np.mean(glove_race_biases),
                'word2vec_bias': np.mean(word2vec_race_biases)
            })

    if results:
        df = pd.DataFrame(results)
        df['avg_bias'] = (df['glove_bias'] + df['word2vec_bias']) / 2
        df = df.sort_values(by='avg_bias', ascending=False).drop('avg_bias', axis=1)

        plt.figure(figsize=(14, 10))
        x = np.arange(len(df))
        width = 0.35

        plt.bar(x - width/2, df['glove_bias'], width, label='GloVe')
        plt.bar(x + width/2, df['word2vec_bias'], width, label='Word2Vec')

        plt.axhline(y=0, color='r', linestyle='--')
        plt.xlabel('Attribute')
        plt.ylabel('Racial Bias (White/European - Black/African/Asian/Latino Similarity)')
        plt.title('Comparison of Racial Bias Across Embedding Types')
        plt.xticks(x, df['attribute'], rotation=45, ha='right')
        plt.legend()
        plt.tight_layout()
        plt.savefig('racial_bias_comparison.png')
        plt.close()

        correlation = np.corrcoef(df['glove_bias'], df['word2vec_bias'])[0, 1]
        print(f"Correlation between GloVe and Word2Vec racial biases: {correlation:.4f}")

        print("\nAttributes most biased toward White/European people:")
        for embed_type in ['glove_bias', 'word2vec_bias']:
            top_biased = df.sort_values(by=embed_type, ascending=False).head(5)
            embed_name = 'GloVe' if embed_type == 'glove_bias' else 'Word2Vec'
            print(f"\n{embed_name}:")
            for _, row in top_biased.iterrows():
                print(f"  {row['attribute']}: {row[embed_type]:.4f}")

        print("\nAttributes most biased toward Black/African/Asian/Latino people:")
        for embed_type in ['glove_bias', 'word2vec_bias']:
            bottom_biased = df.sort_values(by=embed_type, ascending=True).head(5)
            embed_name = 'GloVe' if embed_type == 'glove_bias' else 'Word2Vec'
            print(f"\n{embed_name}:")
            for _, row in bottom_biased.iterrows():
                print(f"  {row['attribute']}: {row[embed_type]:.4f}")

        glove_avg_bias = df['glove_bias'].abs().mean()
        word2vec_avg_bias = df['word2vec_bias'].abs().mean()
        print(f"\nAverage absolute racial bias:")
        print(f"GloVe: {glove_avg_bias:.4f}")
        print(f"Word2Vec: {word2vec_avg_bias:.4f}")
    else:
        print("No results to display. Check if words exist in both embeddings.")

    return df if results else None

if __name__=="__main__":
  compare_racial_bias(glove, word2vec)


Comparing racial bias across embedding types:
Correlation between GloVe and Word2Vec racial biases: 0.4319

Attributes most biased toward White/European people:

GloVe:
  dangerous: 0.0277
  leader: 0.0088
  engineer: 0.0042
  lazy: 0.0034
  criminal: 0.0004

Word2Vec:
  capable: 0.0638
  trustworthy: 0.0501
  lawyer: 0.0488
  dangerous: 0.0449
  competent: 0.0413

Attributes most biased toward Black/African/Asian/Latino people:

GloVe:
  uneducated: -0.1154
  poor: -0.0935
  skilled: -0.0879
  hardworking: -0.0827
  musical: -0.0764

Word2Vec:
  artistic: -0.0299
  uneducated: -0.0285
  honest: -0.0283
  poor: -0.0258
  ambitious: -0.0234

Average absolute racial bias:
GloVe: 0.0392
Word2Vec: 0.0245


## Extrinsic Evaluation via Sentiment Analysis



*   We'll evaluate bias in downstream tasks by analyzing how embeddings impact sentiment analysis predictions.
*   We create standardardized templates with gender/race terms and particular topics and use them to test gender-specific or race-specific words against these templates.

*   This is done to understand how classification accuracy differs across genders/races and observe if models predict better for certain groups.










### Gender Bias

In [None]:
def get_sentiment_dataset():
    templates = [
        "The {gender} talked about the {topic}.",
        "The {gender} considered the {topic}.",
        "The {gender} reviewed the {topic}.",
        "The {gender} mentioned the {topic}."
    ]

    male_terms = ["man", "father", "brother", "son", "uncle", "grandfather", "husband", "boyfriend"]
    female_terms = ["woman", "mother", "sister", "daughter", "aunt", "grandmother", "wife", "girlfriend"]

    positive_terms = ["happy", "pleased", "satisfied", "delighted", "glad", "excited", "content", "optimistic"]
    negative_terms = ["sad", "displeased", "dissatisfied", "unhappy", "disappointed", "upset", "distressed", "pessimistic"]

    topics = ["movie", "dinner", "meeting", "project", "trip", "news", "presentation", "article"]

    data = []

    for template in templates:
        for gender_list, gender_label in [(male_terms, "male"), (female_terms, "female")]:
            for gender in gender_list:
                for positive in positive_terms:
                    for topic in topics:
                        sentence = template.format(gender=gender, sentiment=positive, topic=topic)
                        data.append((sentence, 1, gender_label))  # 1 = positive

                for negative in negative_terms:
                    for topic in topics:
                        sentence = template.format(gender=gender, sentiment=negative, topic=topic)
                        data.append((sentence, 0, gender_label))  # 0 = negative

    random.shuffle(data)
    sentences, labels, gender_labels = zip(*data)
    return sentences, np.array(labels), np.array(gender_labels)

In [None]:
def sentence_to_embedding(sentence, embeddings):
    words = sentence.lower().split()
    vectors = [get_word_vector(word, embeddings) for word in words]
    vectors = [v for v in vectors if v is not None]
    if not vectors:
        return None

    return np.mean(vectors, axis=0)

In [None]:
def evaluate_sentiment_bias(embeddings, name):
    print(f"\nEvaluating sentiment analysis bias with {name}:")

    sentences, labels, gender_labels = get_sentiment_dataset()
    X = []
    valid_indices = []

    for i, sentence in enumerate(sentences):
        embedding = sentence_to_embedding(sentence, embeddings)
        if embedding is not None:
            X.append(embedding)
            valid_indices.append(i)

    if not X:
        print(f"No valid embeddings found for sentences using {name}")
        return

    X = np.array(X)
    y = labels[valid_indices]
    genders = gender_labels[valid_indices]

    X_train, X_test, y_train, y_test, genders_train, genders_test = train_test_split(
        X, y, genders, test_size=0.2, random_state=42, stratify=y
    )

    clf = LogisticRegression(random_state=42, max_iter=1000)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    print(f"Overall accuracy: {accuracy_score(y_test, y_pred):.4f}")

    male_indices = (genders_test == 'male')
    female_indices = (genders_test == 'female')

    male_acc = accuracy_score(y_test[male_indices], y_pred[male_indices])
    female_acc = accuracy_score(y_test[female_indices], y_pred[female_indices])

    print(f"Male accuracy: {male_acc:.4f}")
    print(f"Female accuracy: {female_acc:.4f}")

    # analyze misclassifications by sentiment and gender
    male_positive_indices = np.logical_and(male_indices, y_test == 1)
    male_negative_indices = np.logical_and(male_indices, y_test == 0)
    female_positive_indices = np.logical_and(female_indices, y_test == 1)
    female_negative_indices = np.logical_and(female_indices, y_test == 0)

    male_positive_acc = accuracy_score(y_test[male_positive_indices], y_pred[male_positive_indices])
    male_negative_acc = accuracy_score(y_test[male_negative_indices], y_pred[male_negative_indices])
    female_positive_acc = accuracy_score(y_test[female_positive_indices], y_pred[female_positive_indices])
    female_negative_acc = accuracy_score(y_test[female_negative_indices], y_pred[female_negative_indices])

    print(f"Male positive accuracy: {male_positive_acc:.4f}")
    print(f"Male negative accuracy: {male_negative_acc:.4f}")
    print(f"Female positive accuracy: {female_positive_acc:.4f}")
    print(f"Female negative accuracy: {female_negative_acc:.4f}")

    plt.figure(figsize=(10, 6))
    categories = ['Overall', 'Male', 'Female', 'Male+', 'Male-', 'Female+', 'Female-']
    accuracies = [
        accuracy_score(y_test, y_pred),
        male_acc,
        female_acc,
        male_positive_acc,
        male_negative_acc,
        female_positive_acc,
        female_negative_acc
    ]

    plt.bar(categories, accuracies, color='skyblue')
    plt.axhline(y=0.5, color='r', linestyle='--', label='Random Baseline')
    plt.ylim(0.4, 1.0)
    plt.title(f'Sentiment Classification Accuracy by Gender using {name}')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.tight_layout()
    plt.savefig(f'sentiment_bias_{name}.png')
    plt.close()
    return accuracy_score(y_test, y_pred), male_acc, female_acc

if __name__=="__main__":
  evaluate_sentiment_bias(word2vec, 'Word2vec')
  evaluate_sentiment_bias(glove, 'GLoVe')


Evaluating sentiment analysis bias with Word2vec:
Overall accuracy: 0.4545
Male accuracy: 0.4430
Female accuracy: 0.4663
Male positive accuracy: 0.5533
Male negative accuracy: 0.3373
Female positive accuracy: 0.3413
Female negative accuracy: 0.5965

Evaluating sentiment analysis bias with GLoVe:
Overall accuracy: 0.4680
Male accuracy: 0.4743
Female accuracy: 0.4610
Male positive accuracy: 0.3782
Male negative accuracy: 0.5718
Female positive accuracy: 0.5464
Female negative accuracy: 0.3772


### Racial Bias

In [None]:
def get_racial_sentiment_dataset():
    templates = [
        "The {race} perspective on {topic} leans {sentiment}.",
        "{race} individuals found the {topic} somewhat {sentiment}.",
        "The {topic} was met with {sentiment} views by {race} communities.",
        "{race} responses to the {topic} had {sentiment} undertones.",
        "The {topic}'s reception among {race} groups seemed {sentiment}."
    ]

    white_terms = ["white", "caucasian", "european", "western"]
    black_terms = ["black", "african", "african-american"]
    asian_terms = ["asian", "east asian", "southeast asian"]
    latino_terms = ["latino", "latina", "hispanic", "latinx"]

    positive_terms = ["happy", "pleased", "satisfied", "delighted", "glad", "excited", "content", "optimistic"]
    negative_terms = ["sad", "displeased", "dissatisfied", "unhappy", "disappointed", "upset", "distressed", "pessimistic"]

    topics = ["event", "meeting", "discussion", "interview", "application", "interaction", "conversation", "experience"]

    data = []

    racial_groups = [
        (white_terms, "white"),
        (black_terms, "black"),
        (asian_terms, "asian"),
        (latino_terms, "latino")
    ]

    for template in templates:
        for race_terms, race_label in racial_groups:
            for race in race_terms:
                for positive in positive_terms:
                    for topic in topics:
                        sentence = template.format(race=race, sentiment=positive, topic=topic)
                        data.append((sentence, 1, race_label))
                for negative in negative_terms:
                    for topic in topics:
                        sentence = template.format(race=race, sentiment=negative, topic=topic)
                        data.append((sentence, 0, race_label))

    random.shuffle(data)

    sentences, labels, race_labels = zip(*data)

    return sentences, np.array(labels), np.array(race_labels)

In [None]:
def evaluate_racial_sentiment_bias(embeddings, name):
    print(f"\nEvaluating racial sentiment bias with {name}:")

    sentences, labels, race_labels = get_racial_sentiment_dataset()

    X = []
    valid_indices = []

    for i, sentence in enumerate(sentences):
        embedding = sentence_to_embedding(sentence, embeddings)
        if embedding is not None:
            X.append(embedding)
            valid_indices.append(i)

    if not X:
        print(f"No valid embeddings found for sentences using {name}")
        return

    X = np.array(X)
    y = labels[valid_indices]
    races = race_labels[valid_indices]

    X_train, X_test, y_train, y_test, races_train, races_test = train_test_split(
        X, y, races, test_size=0.2, random_state=42, stratify=y
    )

    clf = LogisticRegression(random_state=42, max_iter=1000, C=0.1)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    print(f"Overall accuracy: {accuracy_score(y_test, y_pred):.4f}")

    unique_races = np.unique(races_test)
    race_accuracies = {}

    for race in unique_races:
        race_indices = (races_test == race)
        race_acc = accuracy_score(y_test[race_indices], y_pred[race_indices])
        race_accuracies[race] = race_acc
        print(f"{race.capitalize()} accuracy: {race_acc:.4f}")

    results = []
    for race in unique_races:
        race_indices = (races_test == race)

        race_positive_indices = np.logical_and(race_indices, y_test == 1)
        race_negative_indices = np.logical_and(race_indices, y_test == 0)

        if sum(race_positive_indices) > 0:
            pos_acc = accuracy_score(y_test[race_positive_indices], y_pred[race_positive_indices])
            print(f"{race.capitalize()} positive accuracy: {pos_acc:.4f}")
            results.append({
                'race': race,
                'sentiment': 'positive',
                'accuracy': pos_acc
            })

        if sum(race_negative_indices) > 0:
            neg_acc = accuracy_score(y_test[race_negative_indices], y_pred[race_negative_indices])
            print(f"{race.capitalize()} negative accuracy: {neg_acc:.4f}")
            results.append({
                'race': race,
                'sentiment': 'negative',
                'accuracy': neg_acc
            })

    df = pd.DataFrame(results)
    pivot_df = df.pivot(index='race', columns='sentiment', values='accuracy')

    plt.figure(figsize=(10, 6))
    ax = sns.heatmap(pivot_df, annot=True, cmap='Blues', vmin=0.5, vmax=1.0)
    plt.title(f'Sentiment Classification Accuracy by Race using {name}')
    plt.tight_layout()
    plt.savefig(f'racial_sentiment_bias_{name}.png')
    plt.close()

    max_diff = max(race_accuracies.values()) - min(race_accuracies.values())
    print(f"Maximum accuracy difference between racial groups: {max_diff:.4f}")

    return race_accuracies, results

In [None]:
if __name__=="__main__":
  evaluate_racial_sentiment_bias(word2vec, 'Word2vec')
  evaluate_racial_sentiment_bias(glove, 'GLoVe')


Evaluating racial sentiment bias with Word2vec:
Overall accuracy: 0.6842
Asian accuracy: 0.6817
Black accuracy: 0.6797
Latino accuracy: 0.6913
White accuracy: 0.6813
Asian positive accuracy: 0.5051
Asian negative accuracy: 0.8729
Black positive accuracy: 0.6576
Black negative accuracy: 0.7000
Latino positive accuracy: 0.7681
Latino negative accuracy: 0.6151
White positive accuracy: 0.6792
White negative accuracy: 0.6835
Maximum accuracy difference between racial groups: 0.0116

Evaluating racial sentiment bias with GLoVe:
Overall accuracy: 0.6931
Asian accuracy: 0.7047
Black accuracy: 0.7107
Latino accuracy: 0.6863
White accuracy: 0.6804
Asian positive accuracy: 0.7072
Asian negative accuracy: 0.7022
Black positive accuracy: 0.6328
Black negative accuracy: 0.7849
Latino positive accuracy: 0.6466
Latino negative accuracy: 0.7295
White positive accuracy: 0.6875
White negative accuracy: 0.6736
Maximum accuracy difference between racial groups: 0.0304


# References

1. Pennington, J., Socher, R., & Manning, C. D. (2014, October). Glove: Global vectors for word representation. In Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP) (pp. 1532-1543).

2. Mikolov, T., Chen, K., Corrado, G., & Dean, J. (2013). Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781.

3. Caliskan, A., Bryson, J. J., & Narayanan, A. (2017). Semantics derived automatically from language corpora contain human-like biases. Science, 356(6334), 183-186.

4. Text Embedding Models Contain Bias. Here's Why That Matters. https://developers.googleblog.com/en/text-embedding-models-contain-bias-heres-why-that-matters/

5. chadaeun/weat_replication. https://github.com/chadaeun/weat_replication

6. PLN-FaMAF/Bias-in-word-embeddings. https://github.com/PLN-FaMAF/Bias-in-word-embeddings

7. Introduction to t-SNE. https://www.datacamp.com/tutorial/introduction-t-sne

8. Word Embedding Fairness Evaluation. https://www.kdnuggets.com/2020/08/word-embedding-fairness-evaluation.html

9. Bolukbasi, T., Chang, K. W., Zou, J. Y., Saligrama, V., & Kalai, A. T. (2016). Man is to computer programmer as woman is to homemaker? debiasing word embeddings. Advances in neural information processing systems, 29.

10. Garg, N., Schiebinger, L., Jurafsky, D., & Zou, J. (2018). Word embeddings quantify 100 years of gender and ethnic stereotypes. Proceedings of the National Academy of Sciences, 115(16), E3635-E3644.

11. Manzini, T., Lim, Y. C., Tsvetkov, Y., & Black, A. W. (2019). Black is to criminal as caucasian is to police: Detecting and removing multiclass bias in word embeddings. arXiv preprint arXiv:1904.04047.

