In [1]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install torch

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


# Setup

In [5]:
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

# Tokenize words and get embeddings
def get_word_embeddings(word_list):
    tokens = tokenizer(word_list, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**tokens)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Average pooling over tokens
    return embeddings

# Word Sets and Embeddings
AF_Names = ["Reginald", "Kameron", "Kendrick", "Javon", "Tyrell", "Jamar", "Camron", "Tyree", "Jamari", "Reggie", "Jada", 
            "Latoya", "Jayla", "Tamika", "Latoyna", "Journey", "Tameka", "Journee", "Lawanda", "Janiya"]
AF_Embeddings = get_word_embeddings(AF_Names)

EU_Names = ["James", "John", "Robert", "Michael", "William", "David", "Joseph", "Richard", "Charles", "Thomas", "Mary", 
            "Elizabeth", "Patricia", "Jennifer", "Linda", "Barbara", "Margaret", "Susan", "Sarah", "Jessica"]
EU_Embeddings = get_word_embeddings(EU_Names)

LX_Names = ["Paul", "Vincent", "Victor", "Adrian", "Marcus", "Leo", "Miles", "Roman", "Sergio", "Felix", "Patricia", "Laura", 
            "Amanda", "Victoria", "Julia", "Gloria", "Diana", "Clara", "Paula", "Norma"]
LX_Embeddings = get_word_embeddings(LX_Names)

CH_Names = ["Lian", "Shan", "Lew", "Long", "Quan", "Jun", "Tou", "Jin", "Cai", "Chan", "Lue", "China", "Lu", "Maylee", 
            "Tennie", "Maylin", "Chynna", "Jia", "Mei", "Tylee"]
CH_Embeddings = get_word_embeddings(CH_Names)

Male_Names = ["James", "John", "Robert", "Michael", "William", "David", "Joseph", "Richard", "Charles", "Thomas", 
              "Christopher", "Daniel", "Matthew","George", "Anthony", "Donald", "Paul", "Mark", "Andrew", "Edward"]
Male_Embeddings = get_word_embeddings(Male_Names)

Female_Names = ["Mary", "Elizabeth", "Patricia", "Jennifer", "Linda", "Barbara", "Margaret", "Susan", "Dorothy", "Sarah", 
                "Jessica", "Helen", "Nancy", "Betty", "Karen", "Lisa", "Anna", "Sandra", "Emily", "Ashley"]
Female_Embeddings = get_word_embeddings(Female_Names)

Pleasant_Words = ["happy", "agreeable", "polite", "civil", "charming", "gracious", "gentle", "approachable", "love", "cool"]
Pleasant_Embeddings = get_word_embeddings(Pleasant_Words)

Unpleasant_Words = ["rude", "lazy", "disagreeable", "lousy", "sad", "hate", "violent", "bitter", "harsh", "angry"]
Unpleasant_Embeddings = get_word_embeddings(Unpleasant_Words)

STEM_Careers = ["Software Developer", "Nurse Practitioner", "Health Services Manager", "Physicians Assistant", 
                "Security Analyst", "IT Manager", "Web Developer", "Dentist", "Orthodontist", "Computer Systems Analyst"]
STEM_Embeddings = get_word_embeddings(STEM_Careers)

Non_STEM_Careers = ["Artist", "Marketing Manager", "Social Worker", "Attorney", "Journalist", "Musician", "Teacher", 
                    "Media Manager", "Graphic Designer", "Judge"]
Non_STEM_Embeddings = get_word_embeddings(Non_STEM_Careers)

print("DONE!")

DONE!


# TEST 1: Racial Biases

In [7]:
# African American Names
# Pleasant Words
similarities_AFvP = cosine_similarity(AF_Embeddings, Pleasant_Embeddings)
# Unpleasant Words
similarities_AFvU = cosine_similarity(AF_Embeddings, Unpleasant_Embeddings)

print("Cosine Similarity Matrix: African American Names vs Pleasant Words")
print(similarities_AFvP)
print("Cosine Similarity Matrix: African American Names vs Unpleasant Words")
print(similarities_AFvU)

Cosine Similarity Matrix: African American Names vs Pleasant Words
[[0.73828185 0.6498773  0.77740085 0.56378174 0.83819747 0.5324205
  0.79903626 0.69827986 0.6550108  0.7486142 ]
 [0.60467076 0.5630037  0.6241868  0.48426557 0.66570246 0.5179579
  0.6517494  0.61952484 0.5845643  0.6047308 ]
 [0.62041605 0.55904174 0.6582182  0.5574348  0.68914247 0.47029594
  0.6872313  0.6025049  0.5953624  0.65218925]
 [0.5684547  0.541189   0.56418765 0.40959305 0.6435384  0.515993
  0.59437394 0.5424759  0.54156303 0.587583  ]
 [0.5905937  0.52627885 0.59866625 0.5260875  0.6260965  0.49406964
  0.6569525  0.5384466  0.607139   0.55964005]
 [0.717938   0.62320375 0.6966279  0.5558896  0.74571633 0.51893246
  0.73763436 0.71930754 0.62937284 0.7025498 ]
 [0.6571582  0.58970666 0.662114   0.5441679  0.6871207  0.48501593
  0.6966513  0.6733535  0.6291841  0.67300844]
 [0.64262927 0.6105879  0.6365538  0.4759775  0.7099918  0.5164905
  0.6805997  0.64701724 0.6363895  0.64249545]
 [0.7002426  0.603

In [8]:
# European American Names
# Pleasant Words
similarities_EUvP = cosine_similarity(EU_Embeddings, Pleasant_Embeddings)
# Unpleasant Words
similarities_EUvU = cosine_similarity(EU_Embeddings, Unpleasant_Embeddings)

print("Cosine Similarity Matrix: European American Names vs Pleasant Words")
print(similarities_EUvP)
print("Cosine Similarity Matrix: European American Names vs Unpleasant Words")
print(similarities_EUvU)

Cosine Similarity Matrix: African American Names vs Pleasant Words
[[0.74580085 0.6609976  0.74456453 0.6156603  0.8057695  0.5967151
  0.7959496  0.73821795 0.7111749  0.750686  ]
 [0.70138204 0.6602277  0.7152402  0.6560726  0.7546242  0.5786077
  0.7563137  0.72542334 0.6728202  0.7206048 ]
 [0.7362276  0.6616829  0.73306036 0.6239346  0.7922344  0.5771165
  0.7937003  0.7659415  0.68994963 0.7411611 ]
 [0.7780455  0.66512096 0.7579025  0.6219342  0.83038586 0.5914754
  0.81679666 0.7539427  0.7264643  0.7821056 ]
 [0.71586764 0.6706762  0.7363739  0.6188588  0.78880394 0.5843812
  0.7907635  0.7488339  0.66702306 0.74834764]
 [0.7484337  0.64090985 0.7217906  0.5965635  0.78893006 0.5764973
  0.7828473  0.727506   0.6986522  0.7405306 ]
 [0.7395806  0.67184633 0.71936154 0.5929353  0.78631806 0.5822444
  0.7869751  0.7497704  0.69610286 0.7300803 ]
 [0.7305021  0.6643358  0.7256195  0.6302404  0.7797779  0.58798885
  0.77405906 0.7413183  0.68346655 0.72389865]
 [0.71784854 0.69163

In [9]:
# Latin American Names
# Pleasant Words
similarities_LXvP = cosine_similarity(LX_Embeddings, Pleasant_Embeddings)
# Unpleasant Words
similarities_LXvU = cosine_similarity(LX_Embeddings, Unpleasant_Embeddings)

print("Cosine Similarity Matrix: Latin American Names vs Pleasant Words")
print(similarities_LXvP)
print("Cosine Similarity Matrix: Latin American Names vs Unpleasant Words")
print(similarities_LXvU)

Cosine Similarity Matrix: Latin American Names vs Pleasant Words
[[0.74855554 0.67013717 0.7437303  0.61428493 0.8084895  0.5795258
  0.79746354 0.7440177  0.7015193  0.7578967 ]
 [0.76347876 0.67851734 0.7412702  0.61482054 0.7998061  0.5799122
  0.79411125 0.7545784  0.7003759  0.7488786 ]
 [0.77257687 0.66979694 0.73305035 0.61440754 0.8213943  0.604414
  0.8054178  0.73590887 0.72348964 0.77316713]
 [0.7546182  0.66147506 0.7445504  0.58676565 0.82679653 0.58245766
  0.7794368  0.7132256  0.71700424 0.76241255]
 [0.75325733 0.6423924  0.71977603 0.56766284 0.8057996  0.58367413
  0.7714841  0.67203754 0.7172682  0.7711266 ]
 [0.7645882  0.68026423 0.7384871  0.6159693  0.8067279  0.59211147
  0.8049896  0.72346    0.7119449  0.7907958 ]
 [0.75261074 0.64518756 0.7199197  0.55246997 0.81144726 0.5809467
  0.801537   0.68979645 0.74187464 0.7318626 ]
 [0.69463205 0.63651997 0.72621167 0.67619765 0.7088342  0.5769968
  0.73362505 0.70087147 0.66549563 0.712149  ]
 [0.7578424  0.664253

In [10]:
# Chinese American Names
# Pleasant Words
similarities_CHvP = cosine_similarity(CH_Embeddings, Pleasant_Embeddings)
# Unpleasant Words
similarities_CHvU = cosine_similarity(CH_Embeddings, Unpleasant_Embeddings)

print("Cosine Similarity Matrix: Chinese American Names vs Pleasant Words")
print(similarities_CHvP)
print("Cosine Similarity Matrix: Chinese American Names vs Unpleasant Words")
print(similarities_CHvU)

Cosine Similarity Matrix: Chinese American Names vs Pleasant Words
[[0.63730204 0.59303343 0.62599146 0.4838103  0.6702348  0.47450155
  0.66320425 0.6464698  0.6142039  0.6324545 ]
 [0.7320734  0.61709934 0.7530494  0.5638628  0.80033046 0.49204218
  0.7844047  0.7026711  0.65014386 0.74747777]
 [0.6926079  0.60817856 0.7311553  0.6185907  0.74415433 0.4834689
  0.75052863 0.70138234 0.6329361  0.7092832 ]
 [0.67784655 0.6191275  0.73205787 0.53167075 0.7343601  0.46922243
  0.7602862  0.65020025 0.6511672  0.70011884]
 [0.67581415 0.60302657 0.69805115 0.583944   0.706868   0.43450767
  0.704077   0.683108   0.5921906  0.6683913 ]
 [0.6022024  0.5230173  0.6220128  0.5090271  0.6099464  0.38654137
  0.6215056  0.59886265 0.5181625  0.59255064]
 [0.67575645 0.61060905 0.6448298  0.48201728 0.6792851  0.50792384
  0.6894265  0.65291214 0.6227243  0.6739402 ]
 [0.702687   0.62196463 0.6887698  0.52675337 0.74233913 0.47162902
  0.7234256  0.6837928  0.6042081  0.70272595]
 [0.6672002  0

# TEST 2: Gender Biases for Favorability

In [11]:
# Male Names
# Pleasant Words
similarities_MvP = cosine_similarity(Male_Embeddings, Pleasant_Embeddings)
# Unpleasant Words
similarities_MvU = cosine_similarity(Male_Embeddings, Unpleasant_Embeddings)

print("Cosine Similarity Matrix: Male Names vs Pleasant Words")
print(similarities_MvP)
print("Cosine Similarity Matrix: Male Names vs Unpleasant Words")
print(similarities_MvU)

Cosine Similarity Matrix: Male Names vs Pleasant Words
[[0.74580085 0.6609976  0.74456453 0.6156603  0.8057695  0.5967151
  0.7959496  0.73821795 0.7111749  0.750686  ]
 [0.70138204 0.6602277  0.7152402  0.6560726  0.7546242  0.5786077
  0.7563137  0.72542334 0.6728202  0.7206048 ]
 [0.7362276  0.6616829  0.73306036 0.6239346  0.7922344  0.5771165
  0.7937003  0.7659415  0.68994963 0.7411611 ]
 [0.7780455  0.66512096 0.7579025  0.6219342  0.83038586 0.5914754
  0.81679666 0.7539427  0.7264643  0.7821056 ]
 [0.71586764 0.6706762  0.7363739  0.6188588  0.78880394 0.5843812
  0.7907635  0.7488339  0.66702306 0.74834764]
 [0.7484337  0.64090985 0.7217906  0.5965635  0.78893006 0.5764973
  0.7828473  0.727506   0.6986522  0.7405306 ]
 [0.7395806  0.67184633 0.71936154 0.5929353  0.78631806 0.5822444
  0.7869751  0.7497704  0.69610286 0.7300803 ]
 [0.7305021  0.6643358  0.7256195  0.6302404  0.7797779  0.58798885
  0.77405906 0.7413183  0.68346655 0.72389865]
 [0.71784854 0.6916313  0.729506

In [12]:
# Female Names
# Pleasant Words
similarities_FvP = cosine_similarity(Female_Embeddings, Pleasant_Embeddings)
# Unpleasant Words
similarities_FvU = cosine_similarity(Female_Embeddings, Unpleasant_Embeddings)

print("Cosine Similarity Matrix: Female Names vs Pleasant Words")
print(similarities_FvP)
print("Cosine Similarity Matrix: Female Names vs Unpleasant Words")
print(similarities_FvU)

Cosine Similarity Matrix: Female Names vs Pleasant Words
[[0.7614287  0.65481484 0.7485459  0.5978867  0.8185439  0.63829136
  0.8257448  0.7245882  0.73378265 0.7650913 ]
 [0.70366883 0.6498134  0.7146908  0.6291242  0.75018305 0.5841105
  0.756728   0.71326447 0.67092454 0.69976836]
 [0.74557704 0.6875371  0.76333904 0.6406418  0.8002698  0.60319656
  0.7906362  0.74387944 0.6914885  0.7555088 ]
 [0.7815652  0.6408837  0.7475347  0.5784642  0.8315696  0.59144473
  0.81413877 0.7086358  0.73216426 0.76533616]
 [0.7567066  0.6884959  0.74646705 0.63881534 0.80432916 0.58398366
  0.8037608  0.7301064  0.6930585  0.77203137]
 [0.7137082  0.65761435 0.71379447 0.63416326 0.7505009  0.57501817
  0.75892884 0.7047142  0.66941583 0.7237598 ]
 [0.6498726  0.65214896 0.67963356 0.643849   0.6775583  0.56529164
  0.7007655  0.67306894 0.6380273  0.65132153]
 [0.7744173  0.6617323  0.7549901  0.60286295 0.81701076 0.58784616
  0.80547357 0.7173873  0.7377645  0.761575  ]
 [0.72747874 0.654933   

# TEST 3: Gender Biases in Careers

In [13]:
# Male Names
# STEM Careers
similarities_MvS = cosine_similarity(Male_Embeddings, STEM_Embeddings)
# Non-STEM Careers
similarities_MvN = cosine_similarity(Male_Embeddings, Non_STEM_Embeddings)

print("Cosine Similarity Matrix: Male Names vs STEM Careers")
print(similarities_MvS)
print("Cosine Similarity Matrix: Male Names vs Non-STEM Careers")
print(similarities_MvN)

Cosine Similarity Matrix: Male Names vs STEM Careers
[[0.5994292  0.5648885  0.54191536 0.63186413 0.5648516  0.6516925
  0.579072   0.6439919  0.30639482 0.56600356]
 [0.5917505  0.5823375  0.55114424 0.6284311  0.55956054 0.64082
  0.56922996 0.650278   0.32794273 0.5658723 ]
 [0.6133235  0.56928694 0.53566605 0.63441014 0.5865805  0.6661085
  0.5915099  0.6619658  0.3239615  0.5937997 ]
 [0.6191546  0.5859114  0.5451805  0.65599793 0.59166837 0.6934093
  0.6094103  0.68234575 0.3215686  0.58802307]
 [0.58866274 0.5694508  0.52683365 0.62972844 0.5609401  0.65483826
  0.56716645 0.6621841  0.2950219  0.5651026 ]
 [0.5768868  0.54021263 0.5486374  0.6201743  0.54660547 0.6483563
  0.55987775 0.62289155 0.3267391  0.5430739 ]
 [0.6371297  0.59304243 0.58299315 0.6674816  0.5848471  0.67807865
  0.5973131  0.6568929  0.34070873 0.6067581 ]
 [0.58299494 0.5487101  0.5451921  0.61899626 0.5538453  0.6388665
  0.556994   0.64520335 0.33530638 0.55551326]
 [0.62424874 0.60642636 0.5416329  

In [14]:
# Female Names
# STEM Careers
similarities_FvS = cosine_similarity(Female_Embeddings, STEM_Embeddings)
# Non-STEM Careers
similarities_FvN = cosine_similarity(Female_Embeddings, Non_STEM_Embeddings)

print("Cosine Similarity Matrix: Female Names vs STEM Careers")
print(similarities_FvS)
print("Cosine Similarity Matrix: Female Names vs Non-STEM Careers")
print(similarities_FvN)

Cosine Similarity Matrix: Female Names vs STEM Careers
[[0.5687543  0.5632052  0.5277553  0.6265735  0.5396215  0.65340567
  0.56013995 0.64687395 0.28949058 0.5382247 ]
 [0.5779153  0.5946368  0.5331037  0.6286562  0.5662621  0.6196699
  0.5626484  0.63650525 0.3450579  0.5620878 ]
 [0.6236322  0.6292224  0.56450033 0.65105754 0.60521054 0.6695456
  0.620904   0.6720136  0.3678279  0.59278905]
 [0.5957081  0.54893494 0.55328    0.6275259  0.5549524  0.65914106
  0.57598215 0.6237037  0.30826992 0.5489621 ]
 [0.6237196  0.62922204 0.55326945 0.6638618  0.61956084 0.6770388
  0.60795915 0.6864457  0.35599393 0.61173236]
 [0.60983104 0.6262046  0.55868673 0.6441394  0.6106314  0.659641
  0.60426885 0.657905   0.37318796 0.60106736]
 [0.57210314 0.61707604 0.52237797 0.6100669  0.57019573 0.6081711
  0.5579047  0.61580515 0.36522424 0.5836196 ]
 [0.59804785 0.5641458  0.5312394  0.63207066 0.56138945 0.6591219
  0.58480644 0.6319491  0.3435758  0.55843467]
 [0.59292877 0.6105448  0.555291