In [2]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install torch

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


## Setup

In [5]:
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
model = BertModel.from_pretrained("bert-base-multilingual-uncased")

# Tokenize words and get embeddings
def get_word_embeddings(word_list):
    tokens = tokenizer(word_list, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**tokens)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Average pooling over tokens
    return embeddings

# Word Sets and Embeddings
AF_Names = ["Reginald", "Kameron", "Kendrick", "Javon", "Tyrell", "Jamar", "Camron", "Tyree", "Jamari", "Reggie", "Jada", 
            "Latoya", "Jayla", "Tamika", "Latoyna", "Journey", "Tameka", "Journee", "Lawanda", "Janiya"]
AF_Embeddings = get_word_embeddings(AF_Names)

EU_Names = ["James", "John", "Robert", "Michael", "William", "David", "Joseph", "Richard", "Charles", "Thomas", "Mary", 
            "Elizabeth", "Patricia", "Jennifer", "Linda", "Barbara", "Margaret", "Susan", "Sarah", "Jessica"]
EU_Embeddings = get_word_embeddings(EU_Names)

LX_Names = ["Paul", "Vincent", "Victor", "Adrian", "Marcus", "Leo", "Miles", "Roman", "Sergio", "Felix", "Patricia", "Laura", 
            "Amanda", "Victoria", "Julia", "Gloria", "Diana", "Clara", "Paula", "Norma"]
LX_Embeddings = get_word_embeddings(LX_Names)

CH_Names = ["Lian", "Shan", "Lew", "Long", "Quan", "Jun", "Tou", "Jin", "Cai", "Chan", "Lue", "China", "Lu", "Maylee", 
            "Tennie", "Maylin", "Chynna", "Jia", "Mei", "Tylee"]
CH_Embeddings = get_word_embeddings(CH_Names)

Male_Names = ["James", "John", "Robert", "Michael", "William", "David", "Joseph", "Richard", "Charles", "Thomas", 
              "Christopher", "Daniel", "Matthew","George", "Anthony", "Donald", "Paul", "Mark", "Andrew", "Edward"]
Male_Embeddings = get_word_embeddings(Male_Names)

Female_Names = ["Mary", "Elizabeth", "Patricia", "Jennifer", "Linda", "Barbara", "Margaret", "Susan", "Dorothy", "Sarah", 
                "Jessica", "Helen", "Nancy", "Betty", "Karen", "Lisa", "Anna", "Sandra", "Emily", "Ashley"]
Female_Embeddings = get_word_embeddings(Female_Names)

Pleasant_Words = ["happy", "agreeable", "polite", "civil", "charming", "gracious", "gentle", "approachable", "love", "cool"]
Pleasant_Embeddings = get_word_embeddings(Pleasant_Words)

Unpleasant_Words = ["rude", "lazy", "disagreeable", "lousy", "sad", "hate", "violent", "bitter", "harsh", "angry"]
Unpleasant_Embeddings = get_word_embeddings(Unpleasant_Words)

STEM_Careers = ["Software Developer", "Nurse Practitioner", "Health Services Manager", "Physicians Assistant", 
                "Security Analyst", "IT Manager", "Web Developer", "Dentist", "Orthodontist", "Computer Systems Analyst"]
STEM_Embeddings = get_word_embeddings(STEM_Careers)

Non_STEM_Careers = ["Artist", "Marketing Manager", "Social Worker", "Attorney", "Journalist", "Musician", "Teacher", 
                    "Media Manager", "Graphic Designer", "Judge"]
Non_STEM_Embeddings = get_word_embeddings(Non_STEM_Careers)

print("DONE!")

  from .autonotebook import tqdm as notebook_tqdm


DONE!


# TEST 1: Racial Biases

In [6]:
# African American Names
# Pleasant Words
similarities_AFvP = cosine_similarity(AF_Embeddings, Pleasant_Embeddings)
# Unpleasant Words
similarities_AFvU = cosine_similarity(AF_Embeddings, Unpleasant_Embeddings)

print("Cosine Similarity Matrix: African American Names vs Pleasant Words")
print(similarities_AFvP)
print("Cosine Similarity Matrix: African American Names vs Unpleasant Words")
print(similarities_AFvU)

Cosine Similarity Matrix: African American Names vs Pleasant Words
[[0.8134762  0.85982513 0.8563899  0.8315817  0.8841947  0.871416
  0.88070023 0.8453571  0.8831556  0.88618684]
 [0.74660766 0.8509965  0.85048735 0.7990546  0.8791512  0.8640443
  0.86246574 0.8474316  0.8402779  0.8390809 ]
 [0.7924584  0.85051835 0.84985155 0.8434801  0.8939191  0.90058297
  0.8849774  0.845729   0.9123274  0.9136672 ]
 [0.77177906 0.86485314 0.87226915 0.800704   0.8917396  0.88012445
  0.87637305 0.8610756  0.8485935  0.8531549 ]
 [0.7782118  0.87664175 0.88032067 0.8207685  0.9100627  0.90613735
  0.9017594  0.8651554  0.897918   0.89170134]
 [0.7392798  0.8458283  0.8617611  0.79580677 0.8697676  0.8687115
  0.8698578  0.8408196  0.8611048  0.8475254 ]
 [0.7462009  0.8520186  0.8631096  0.80364186 0.8887155  0.865667
  0.87118596 0.85849667 0.8430755  0.83057857]
 [0.77071714 0.88336504 0.9025403  0.8211647  0.90508366 0.9030818
  0.9133194  0.87244093 0.89556664 0.88599044]
 [0.7585109  0.85432

In [7]:
# European American Names
# Pleasant Words
similarities_EUvP = cosine_similarity(EU_Embeddings, Pleasant_Embeddings)
# Unpleasant Words
similarities_EUvU = cosine_similarity(EU_Embeddings, Unpleasant_Embeddings)

print("Cosine Similarity Matrix: European American Names vs Pleasant Words")
print(similarities_EUvP)
print("Cosine Similarity Matrix: European American Names vs Unpleasant Words")
print(similarities_EUvU)

Cosine Similarity Matrix: European American Names vs Pleasant Words
[[0.78008866 0.87823427 0.90413445 0.8325355  0.9140416  0.90965647
  0.91631824 0.87236005 0.91113913 0.9044971 ]
 [0.86129594 0.8031054  0.8409316  0.7517073  0.84522915 0.8536236
  0.84691703 0.8065502  0.8390242  0.8408264 ]
 [0.79313576 0.850875   0.8781846  0.8388362  0.8950795  0.8850014
  0.89338964 0.85531294 0.8895473  0.90231293]
 [0.83733547 0.84394693 0.8734568  0.78583306 0.8905132  0.8937583
  0.88495314 0.8423112  0.8812009  0.88185555]
 [0.83185744 0.75680125 0.803638   0.7132309  0.81424874 0.8129848
  0.8160622  0.7652221  0.7868273  0.79572   ]
 [0.79423124 0.87168455 0.9007801  0.83882    0.9054402  0.9020063
  0.9040654  0.8700323  0.90398663 0.9037557 ]
 [0.85579014 0.7732829  0.80921626 0.7613956  0.8237679  0.82344395
  0.8172748  0.7636154  0.81330335 0.8130952 ]
 [0.78951097 0.73873556 0.7690939  0.7119328  0.7940921  0.76844245
  0.78383005 0.75261545 0.76886237 0.765395  ]
 [0.7895665  0.86

In [8]:
# Latin American Names
# Pleasant Words
similarities_LXvP = cosine_similarity(LX_Embeddings, Pleasant_Embeddings)
# Unpleasant Words
similarities_LXvU = cosine_similarity(LX_Embeddings, Unpleasant_Embeddings)

print("Cosine Similarity Matrix: Latin American Names vs Pleasant Words")
print(similarities_LXvP)
print("Cosine Similarity Matrix: Latin American Names vs Unpleasant Words")
print(similarities_LXvU)

Cosine Similarity Matrix: Latin American Names vs Pleasant Words
[[0.85078734 0.80097556 0.8327857  0.75177276 0.84714925 0.85120475
  0.8417214  0.80543303 0.8357094  0.8333886 ]
 [0.7513669  0.8662106  0.8906306  0.82565296 0.90174115 0.9018651
  0.8995851  0.8527075  0.89776397 0.8870137 ]
 [0.79131716 0.88570285 0.8994376  0.84372663 0.9234686  0.91699904
  0.9114222  0.885553   0.9186387  0.91061854]
 [0.78979033 0.7729111  0.80101025 0.7098502  0.80913234 0.8079092
  0.80260074 0.7754073  0.78347605 0.7942625 ]
 [0.7980977  0.8899151  0.90455276 0.82874054 0.9081893  0.92171097
  0.9071392  0.88497317 0.90692174 0.9006839 ]
 [0.8256417  0.8516485  0.87997186 0.7992805  0.8882265  0.8884722
  0.8860451  0.8489282  0.87666285 0.87588084]
 [0.85339665 0.78994536 0.8107215  0.72201526 0.83634055 0.8443401
  0.8335186  0.7885115  0.82200444 0.8210181 ]
 [0.7802546  0.8776142  0.905973   0.8453485  0.9128933  0.9041271
  0.91173387 0.881117   0.9219688  0.9071083 ]
 [0.7774778  0.75763

In [9]:
# Chinese American Names
# Pleasant Words
similarities_CHvP = cosine_similarity(CH_Embeddings, Pleasant_Embeddings)
# Unpleasant Words
similarities_CHvU = cosine_similarity(CH_Embeddings, Unpleasant_Embeddings)

print("Cosine Similarity Matrix: Chinese American Names vs Pleasant Words")
print(similarities_CHvP)
print("Cosine Similarity Matrix: Chinese American Names vs Unpleasant Words")
print(similarities_CHvU)

Cosine Similarity Matrix: Chinese American Names vs Pleasant Words
[[0.75303143 0.8275163  0.842675   0.8026942  0.8496729  0.8464596
  0.8537992  0.82595754 0.8399949  0.84420913]
 [0.7739128  0.8174176  0.8241838  0.8131418  0.85167813 0.83332145
  0.8338187  0.8129686  0.84013134 0.8547971 ]
 [0.7915603  0.8537741  0.8664398  0.8556288  0.8761788  0.8656914
  0.8728341  0.84607106 0.88247323 0.88090825]
 [0.80844176 0.87332153 0.8490622  0.86933446 0.8757879  0.87492526
  0.88408524 0.8708894  0.8890867  0.8910142 ]
 [0.71030605 0.6298757  0.6628208  0.6829097  0.6810226  0.68409634
  0.66898626 0.6441484  0.6601359  0.6893813 ]
 [0.6817981  0.63098985 0.6431471  0.6307205  0.6454624  0.643916
  0.6552001  0.6348238  0.64169925 0.663043  ]
 [0.7708735  0.8178625  0.82930946 0.8026773  0.836743   0.81598973
  0.8339206  0.80831313 0.84325016 0.85852516]
 [0.66634965 0.6168954  0.6196513  0.6073481  0.65024865 0.6270895
  0.63716483 0.6082848  0.6389085  0.6679878 ]
 [0.7416495  0.811

# TEST 2: Gender Biases for Favorability

In [10]:
# Male Names
# Pleasant Words
similarities_MvP = cosine_similarity(Male_Embeddings, Pleasant_Embeddings)
# Unpleasant Words
similarities_MvU = cosine_similarity(Male_Embeddings, Unpleasant_Embeddings)

print("Cosine Similarity Matrix: Male Names vs Pleasant Words")
print(similarities_MvP)
print("Cosine Similarity Matrix: Male Names vs Unpleasant Words")
print(similarities_MvU)

Cosine Similarity Matrix: Male Names vs Pleasant Words
[[0.78008866 0.87823427 0.90413445 0.8325355  0.9140416  0.90965647
  0.91631824 0.87236005 0.91113913 0.9044971 ]
 [0.86129594 0.8031054  0.8409316  0.7517073  0.84522915 0.8536236
  0.84691703 0.8065502  0.8390242  0.8408264 ]
 [0.79313576 0.850875   0.8781846  0.8388362  0.8950795  0.8850014
  0.89338964 0.85531294 0.8895473  0.90231293]
 [0.83733547 0.84394693 0.8734568  0.78583306 0.8905132  0.8937583
  0.88495314 0.8423112  0.8812009  0.88185555]
 [0.83185744 0.75680125 0.803638   0.7132309  0.81424874 0.8129848
  0.8160622  0.7652221  0.7868273  0.79572   ]
 [0.79423124 0.87168455 0.9007801  0.83882    0.9054402  0.9020063
  0.9040654  0.8700323  0.90398663 0.9037557 ]
 [0.85579014 0.7732829  0.80921626 0.7613956  0.8237679  0.82344395
  0.8172748  0.7636154  0.81330335 0.8130952 ]
 [0.78951097 0.73873556 0.7690939  0.7119328  0.7940921  0.76844245
  0.78383005 0.75261545 0.76886237 0.765395  ]
 [0.7895665  0.8687279  0.9032

In [11]:
# Female Names
# Pleasant Words
similarities_FvP = cosine_similarity(Female_Embeddings, Pleasant_Embeddings)
# Unpleasant Words
similarities_FvU = cosine_similarity(Female_Embeddings, Unpleasant_Embeddings)

print("Cosine Similarity Matrix: Female Names vs Pleasant Words")
print(similarities_FvP)
print("Cosine Similarity Matrix: Female Names vs Unpleasant Words")
print(similarities_FvU)

Cosine Similarity Matrix: Female Names vs Pleasant Words
[[0.77241194 0.8874492  0.91445    0.8182267  0.9154941  0.91819495
  0.9228681  0.8778442  0.9213349  0.8945196 ]
 [0.79140663 0.8789166  0.8888475  0.826098   0.9011874  0.9033481
  0.9023589  0.8661128  0.9001003  0.885003  ]
 [0.82457644 0.8055421  0.8288559  0.7393075  0.8420528  0.8448453
  0.83687687 0.8164454  0.8155703  0.8144788 ]
 [0.7192494  0.8457006  0.8528291  0.7957823  0.8650768  0.861014
  0.86445725 0.8354276  0.85524595 0.86271447]
 [0.80500984 0.8173133  0.83203053 0.7697827  0.853451   0.85556614
  0.84473956 0.81557983 0.85367334 0.85243714]
 [0.7975146  0.8769878  0.9026653  0.84750915 0.9087918  0.9152535
  0.9124298  0.87553    0.9152951  0.9014149 ]
 [0.7930287  0.89375114 0.907877   0.8165986  0.92070556 0.92212105
  0.9201193  0.88373667 0.9158665  0.89431584]
 [0.8320427  0.7782539  0.8055254  0.70001715 0.8218345  0.8206645
  0.8153338  0.7819461  0.80587685 0.7987678 ]
 [0.81689286 0.88477874 0.889

# TEST 3: Gender Biases in Careers

In [12]:
# Male Names
# STEM Careers
similarities_MvS = cosine_similarity(Male_Embeddings, STEM_Embeddings)
# Non-STEM Careers
similarities_MvN = cosine_similarity(Male_Embeddings, Non_STEM_Embeddings)

print("Cosine Similarity Matrix: Male Names vs STEM Careers")
print(similarities_MvS)
print("Cosine Similarity Matrix: Male Names vs Non-STEM Careers")
print(similarities_MvN)

Cosine Similarity Matrix: Male Names vs STEM Careers
[[0.8203724  0.65222776 0.73349035 0.7540183  0.70988345 0.78903675
  0.80439985 0.8459546  0.64623046 0.6101084 ]
 [0.78375304 0.6344947  0.69291854 0.70175177 0.66869247 0.7485939
  0.75889707 0.8257787  0.61733896 0.57952875]
 [0.82485294 0.6720514  0.74535525 0.7581947  0.7480014  0.8091172
  0.8081469  0.85375714 0.6433012  0.6439165 ]
 [0.8139354  0.6480269  0.7262765  0.7462451  0.709648   0.7838341
  0.79381096 0.8349706  0.63510686 0.61739373]
 [0.74355227 0.6087141  0.6560384  0.67528045 0.626903   0.7027526
  0.7227942  0.79092395 0.5833207  0.5363296 ]
 [0.8364801  0.67725873 0.75423527 0.7690805  0.743348   0.80641985
  0.8179033  0.85703814 0.66062284 0.6438209 ]
 [0.7756483  0.6390547  0.6933552  0.7104774  0.65685785 0.7423656
  0.7580725  0.8133435  0.61338764 0.55500376]
 [0.730317   0.5952121  0.6385132  0.6614349  0.6186557  0.6931881
  0.71528256 0.75417256 0.55270267 0.5324599 ]
 [0.81927466 0.67270994 0.7434767

In [13]:
# Female Names
# STEM Careers
similarities_FvS = cosine_similarity(Female_Embeddings, STEM_Embeddings)
# Non-STEM Careers
similarities_FvN = cosine_similarity(Female_Embeddings, Non_STEM_Embeddings)

print("Cosine Similarity Matrix: Female Names vs STEM Careers")
print(similarities_FvS)
print("Cosine Similarity Matrix: Female Names vs Non-STEM Careers")
print(similarities_FvN)

Cosine Similarity Matrix: Female Names vs STEM Careers
[[0.8142064  0.67873406 0.7431177  0.7659333  0.7101369  0.7887654
  0.7918061  0.8525562  0.64951897 0.62119067]
 [0.8198781  0.680997   0.7561102  0.7554907  0.72098744 0.7979771
  0.7984233  0.8466257  0.6629082  0.642285  ]
 [0.7627859  0.64816725 0.6868555  0.7091337  0.6490469  0.731277
  0.73986286 0.8061701  0.6113295  0.564561  ]
 [0.7920973  0.6435542  0.7321856  0.723623   0.70502484 0.7740035
  0.77995086 0.8009208  0.65776885 0.62029326]
 [0.7800318  0.6592653  0.70163846 0.71885717 0.6819724  0.7412895
  0.7555852  0.80565065 0.6040303  0.60939497]
 [0.8222853  0.67984915 0.75426275 0.77365875 0.73737645 0.8021358
  0.80088866 0.8555753  0.6579007  0.6370573 ]
 [0.8233293  0.68512887 0.75724876 0.7777984  0.7241913  0.80687904
  0.8041497  0.86120164 0.6506966  0.63184214]
 [0.75863683 0.6340595  0.6730537  0.6863864  0.6515636  0.71179163
  0.72880924 0.7921731  0.5951297  0.5703169 ]
 [0.81777155 0.67582524 0.736412

In [15]:
import pandas as pd

#Mean cosine similarity of each test

dataframes_dict = {
    'AFvP': similarities_AFvP,
    'AFvU': similarities_AFvU,
    'EUvP': similarities_EUvP,
    'EUvU': similarities_EUvU,
    'LXvP': similarities_LXvP,
    'LXvU': similarities_LXvU,
    'CHvP': similarities_CHvP,
    'CHvU': similarities_CHvU,
    'MvP': similarities_MvP,
    'MvU': similarities_MvU,
    'FvP': similarities_FvP,
    'FvU': similarities_FvU,
    'MvS': similarities_MvS,
    'MvN': similarities_MvN,
    'FvS': similarities_FvS,
    'FvN': similarities_FvN
}

# Create a dictionary to store the means
mean_dict = {}

# Calculate the mean for each DataFrame and store it in the mean_dict
for df_name, df in dataframes_dict.items():
    df = pd.DataFrame(df)
    mean_value = df.values.mean()
    mean_dict[df_name] = mean_value

# Create a new DataFrame from the mean_dict
mean_df = pd.DataFrame(list(mean_dict.items()), columns=['DataFrame', 'avgCS_BERT_base_multilingual_uncased'])

# Print the new DataFrame
print(mean_df)

#Save to .csv
mean_df.to_csv('BERT_base_multilingual_uncased_meanCosSim.csv', index = False)

   DataFrame  avgCS_BERT_base_multilingual_uncased
0       AFvP                              0.857088
1       AFvU                              0.752056
2       EUvP                              0.847191
3       EUvU                              0.730802
4       LXvP                              0.848878
5       LXvU                              0.727940
6       CHvP                              0.796481
7       CHvU                              0.734306
8        MvP                              0.831929
9        MvU                              0.720304
10       FvP                              0.863760
11       FvU                              0.737971
12       MvS                              0.707461
13       MvN                              0.812305
14       FvS                              0.731626
15       FvN                              0.836586
