In [32]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [33]:
pip install torch

Note: you may need to restart the kernel to use updated packages.


In [34]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [35]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


# Setup

In [36]:
import numpy as np
import pandas as pd
import torch
import os
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

# Tokenize words and get embeddings
def get_word_embeddings(word_list):
    tokens = tokenizer(word_list, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**tokens)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Average pooling over tokens
    return embeddings

# Word Sets and Embeddings
AF_Names = ["Reginald", "Kameron", "Kendrick", "Javon", "Tyrell", "Jamar", "Camron", "Tyree", "Jamari", "Reggie", "Jada", 
            "Latoya", "Jayla", "Tamika", "Latoyna", "Journey", "Tameka", "Journee", "Lawanda", "Janiya"]
AF_Embeddings = get_word_embeddings(AF_Names)

EU_Names = ["James", "John", "Robert", "Michael", "William", "David", "Joseph", "Richard", "Charles", "Thomas", "Mary", 
            "Elizabeth", "Patricia", "Jennifer", "Linda", "Barbara", "Margaret", "Susan", "Sarah", "Jessica"]
EU_Embeddings = get_word_embeddings(EU_Names)

LX_Names = ["Paul", "Vincent", "Victor", "Adrian", "Marcus", "Leo", "Miles", "Roman", "Sergio", "Felix", "Patricia", "Laura", 
            "Amanda", "Victoria", "Julia", "Gloria", "Diana", "Clara", "Paula", "Norma"]
LX_Embeddings = get_word_embeddings(LX_Names)

CH_Names = ["Lian", "Shan", "Lew", "Long", "Quan", "Jun", "Tou", "Jin", "Cai", "Chan", "Lue", "China", "Lu", "Maylee", 
            "Tennie", "Maylin", "Chynna", "Jia", "Mei", "Tylee"]
CH_Embeddings = get_word_embeddings(CH_Names)

Male_Names = ["James", "John", "Robert", "Michael", "William", "David", "Joseph", "Richard", "Charles", "Thomas", 
              "Christopher", "Daniel", "Matthew","George", "Anthony", "Donald", "Paul", "Mark", "Andrew", "Edward"]
Male_Embeddings = get_word_embeddings(Male_Names)

Female_Names = ["Mary", "Elizabeth", "Patricia", "Jennifer", "Linda", "Barbara", "Margaret", "Susan", "Dorothy", "Sarah", 
                "Jessica", "Helen", "Nancy", "Betty", "Karen", "Lisa", "Anna", "Sandra", "Emily", "Ashley"]
Female_Embeddings = get_word_embeddings(Female_Names)

Pleasant_Words = ["happy", "agreeable", "polite", "civil", "charming", "gracious", "gentle", "approachable", "love", "cool"]
Pleasant_Embeddings = get_word_embeddings(Pleasant_Words)

Unpleasant_Words = ["rude", "lazy", "disagreeable", "lousy", "sad", "hate", "violent", "bitter", "harsh", "angry"]
Unpleasant_Embeddings = get_word_embeddings(Unpleasant_Words)

STEM_Careers = ["Software Developer", "Nurse Practitioner", "Health Services Manager", "Physicians Assistant", 
                "Security Analyst", "IT Manager", "Web Developer", "Dentist", "Orthodontist", "Computer Systems Analyst"]
STEM_Embeddings = get_word_embeddings(STEM_Careers)

Non_STEM_Careers = ["Artist", "Marketing Manager", "Social Worker", "Attorney", "Journalist", "Musician", "Teacher", 
                    "Media Manager", "Graphic Designer", "Judge"]
Non_STEM_Embeddings = get_word_embeddings(Non_STEM_Careers)

# TEST 1: Racial Biases

In [37]:
# African American Names
# Pleasant Words
similarities_AFvP = cosine_similarity(AF_Embeddings, Pleasant_Embeddings)
# Unpleasant Words
similarities_AFvU = cosine_similarity(AF_Embeddings, Unpleasant_Embeddings)

similarities_AFvP = pd.DataFrame(similarities_AFvP, index = AF_Names, columns = Pleasant_Words)
similarities_AFvU = pd.DataFrame(similarities_AFvU, index = AF_Names, columns = Unpleasant_Words)

print("Cosine Similarity Matrix: African American Names vs Pleasant Words")
print(similarities_AFvP)
print("Cosine Similarity Matrix: African American Names vs Unpleasant Words")
print(similarities_AFvU)

similarities_AFvP.to_csv("BERT_base_cased_AFvP.csv", index = True)
similarities_AFvU.to_csv("BERT_base_cased_AFvU.csv", index = True)

Cosine Similarity Matrix: African American Names vs Pleasant Words
             happy  agreeable    polite     civil  charming  gracious  \
Reginald  0.738282   0.649877  0.777401  0.563782  0.838197  0.532421   
Kameron   0.604671   0.563004  0.624187  0.484266  0.665702  0.517958   
Kendrick  0.620416   0.559042  0.658218  0.557435  0.689142  0.470296   
Javon     0.568455   0.541189  0.564188  0.409593  0.643538  0.515993   
Tyrell    0.590594   0.526279  0.598666  0.526088  0.626096  0.494070   
Jamar     0.717938   0.623204  0.696628  0.555890  0.745716  0.518932   
Camron    0.657158   0.589707  0.662114  0.544168  0.687121  0.485016   
Tyree     0.642629   0.610588  0.636554  0.475978  0.709992  0.516491   
Jamari    0.700243   0.603155  0.667514  0.542030  0.721779  0.558601   
Reggie    0.767005   0.629360  0.763898  0.556539  0.816957  0.506053   
Jada      0.608463   0.537162  0.575162  0.446532  0.606839  0.485288   
Latoya    0.448219   0.507939  0.428840  0.360260  0.4555

In [38]:
# European American Names
# Pleasant Words
similarities_EUvP = cosine_similarity(EU_Embeddings, Pleasant_Embeddings)
# Unpleasant Words
similarities_EUvU = cosine_similarity(EU_Embeddings, Unpleasant_Embeddings)

similarities_EUvP = pd.DataFrame(similarities_EUvP, index = EU_Names, columns = Pleasant_Words)
similarities_EUvU = pd.DataFrame(similarities_EUvU, index = EU_Names, columns = Unpleasant_Words)

print("Cosine Similarity Matrix: European American Names vs Pleasant Words")
print(similarities_EUvP)
print("Cosine Similarity Matrix: European American Names vs Unpleasant Words")
print(similarities_EUvU)

similarities_EUvP.to_csv('BERT_base_cased_EUvP.csv', index = True)
similarities_EUvU.to_csv('BERT_base_cased_EUvU.csv', index = True)

Cosine Similarity Matrix: European American Names vs Pleasant Words
              happy  agreeable    polite     civil  charming  gracious  \
James      0.745801   0.660998  0.744565  0.615660  0.805770  0.596715   
John       0.701382   0.660228  0.715240  0.656073  0.754624  0.578608   
Robert     0.736228   0.661683  0.733060  0.623935  0.792234  0.577116   
Michael    0.778045   0.665121  0.757903  0.621934  0.830386  0.591475   
William    0.715868   0.670676  0.736374  0.618859  0.788804  0.584381   
David      0.748434   0.640910  0.721791  0.596564  0.788930  0.576497   
Joseph     0.739581   0.671846  0.719362  0.592935  0.786318  0.582244   
Richard    0.730502   0.664336  0.725619  0.630240  0.779778  0.587989   
Charles    0.717849   0.691631  0.729507  0.648721  0.775796  0.581864   
Thomas     0.741931   0.663853  0.741360  0.625551  0.804401  0.576369   
Mary       0.761429   0.654815  0.748546  0.597887  0.818544  0.638291   
Elizabeth  0.703669   0.649813  0.714691  0.

In [39]:
# Latin American Names
# Pleasant Words
similarities_LXvP = cosine_similarity(LX_Embeddings, Pleasant_Embeddings)
# Unpleasant Words
similarities_LXvU = cosine_similarity(LX_Embeddings, Unpleasant_Embeddings)

similarities_LXvP = pd.DataFrame(similarities_LXvP, index = LX_Names, columns = Pleasant_Words)
similarities_LXvU = pd.DataFrame(similarities_LXvU, index = LX_Names, columns = Unpleasant_Words)

print("Cosine Similarity Matrix: Latin American Names vs Pleasant Words")
print(similarities_LXvP)
print("Cosine Similarity Matrix: Latin American Names vs Unpleasant Words")
print(similarities_LXvU)

similarities_LXvP.to_csv('BERT_base_cased_LXvP.csv', index = True)
similarities_LXvU.to_csv('BERT_base_cased_LXvU.csv', index = True)

Cosine Similarity Matrix: Latin American Names vs Pleasant Words
             happy  agreeable    polite     civil  charming  gracious  \
Paul      0.748556   0.670137  0.743730  0.614285  0.808490  0.579526   
Vincent   0.763479   0.678517  0.741270  0.614821  0.799806  0.579912   
Victor    0.772577   0.669797  0.733050  0.614408  0.821394  0.604414   
Adrian    0.754618   0.661475  0.744550  0.586766  0.826797  0.582458   
Marcus    0.753257   0.642392  0.719776  0.567663  0.805800  0.583674   
Leo       0.764588   0.680264  0.738487  0.615969  0.806728  0.592111   
Miles     0.752611   0.645188  0.719920  0.552470  0.811447  0.580947   
Roman     0.694632   0.636520  0.726212  0.676198  0.708834  0.576997   
Sergio    0.757842   0.664254  0.719615  0.588998  0.782531  0.570943   
Felix     0.785083   0.664355  0.750477  0.592634  0.846166  0.603942   
Patricia  0.745577   0.687537  0.763339  0.640642  0.800270  0.603197   
Laura     0.788696   0.659055  0.758021  0.595309  0.831445

In [40]:
# Chinese American Names
# Pleasant Words
similarities_CHvP = cosine_similarity(CH_Embeddings, Pleasant_Embeddings)
# Unpleasant Words
similarities_CHvU = cosine_similarity(CH_Embeddings, Unpleasant_Embeddings)

similarities_CHvP = pd.DataFrame(similarities_CHvP, index = CH_Names, columns = Pleasant_Words)
similarities_CHvU = pd.DataFrame(similarities_CHvU, index = CH_Names, columns = Unpleasant_Words)

print("Cosine Similarity Matrix: Chinese American Names vs Pleasant Words")
print(similarities_CHvP)
print("Cosine Similarity Matrix: Chinese American Names vs Unpleasant Words")
print(similarities_CHvU)

similarities_CHvP.to_csv('BERT_base_cased_CHvP.csv', index = True)
similarities_CHvU.to_csv('BERT_base_cased_CHvU.csv', index = True)

Cosine Similarity Matrix: Chinese American Names vs Pleasant Words
           happy  agreeable    polite     civil  charming  gracious    gentle  \
Lian    0.637302   0.593033  0.625991  0.483810  0.670235  0.474502  0.663204   
Shan    0.732073   0.617099  0.753049  0.563863  0.800330  0.492042  0.784405   
Lew     0.692608   0.608179  0.731155  0.618591  0.744154  0.483469  0.750529   
Long    0.677847   0.619128  0.732058  0.531671  0.734360  0.469222  0.760286   
Quan    0.675814   0.603027  0.698051  0.583944  0.706868  0.434508  0.704077   
Jun     0.602202   0.523017  0.622013  0.509027  0.609946  0.386541  0.621506   
Tou     0.675756   0.610609  0.644830  0.482017  0.679285  0.507924  0.689426   
Jin     0.702687   0.621965  0.688770  0.526753  0.742339  0.471629  0.723426   
Cai     0.667200   0.608120  0.703372  0.600657  0.704861  0.461391  0.701819   
Chan    0.730271   0.607904  0.729389  0.542921  0.795286  0.477988  0.777493   
Lue     0.624381   0.602226  0.606551  0.4

# TEST 2: Gender Biases for Favorability

In [41]:
# Male Names
# Pleasant Words
similarities_MvP = cosine_similarity(Male_Embeddings, Pleasant_Embeddings)
# Unpleasant Words
similarities_MvU = cosine_similarity(Male_Embeddings, Unpleasant_Embeddings)

similarities_MvP = pd.DataFrame(similarities_MvP, index = Male_Names, columns = Pleasant_Words)
similarities_MvU = pd.DataFrame(similarities_MvU, index = Male_Names, columns = Unpleasant_Words)

print("Cosine Similarity Matrix: Male Names vs Pleasant Words")
print(similarities_MvP)
print("Cosine Similarity Matrix: Male Names vs Unpleasant Words")
print(similarities_MvU)

similarities_MvP.to_csv('BERT_base_cased_MvP.csv', index = True)
similarities_MvU.to_csv('BERT_base_cased_MvU.csv', index = True)

Cosine Similarity Matrix: Male Names vs Pleasant Words
                happy  agreeable    polite     civil  charming  gracious  \
James        0.745801   0.660998  0.744565  0.615660  0.805770  0.596715   
John         0.701382   0.660228  0.715240  0.656073  0.754624  0.578608   
Robert       0.736228   0.661683  0.733060  0.623935  0.792234  0.577116   
Michael      0.778045   0.665121  0.757903  0.621934  0.830386  0.591475   
William      0.715868   0.670676  0.736374  0.618859  0.788804  0.584381   
David        0.748434   0.640910  0.721791  0.596564  0.788930  0.576497   
Joseph       0.739581   0.671846  0.719362  0.592935  0.786318  0.582244   
Richard      0.730502   0.664336  0.725619  0.630240  0.779778  0.587989   
Charles      0.717849   0.691631  0.729507  0.648721  0.775796  0.581864   
Thomas       0.741931   0.663853  0.741360  0.625551  0.804401  0.576369   
Christopher  0.756968   0.651688  0.733535  0.589201  0.806338  0.591051   
Daniel       0.776290   0.650030 

In [42]:
# Female Names
# Pleasant Words
similarities_FvP = cosine_similarity(Female_Embeddings, Pleasant_Embeddings)
# Unpleasant Words
similarities_FvU = cosine_similarity(Female_Embeddings, Unpleasant_Embeddings)

similarities_FvP = pd.DataFrame(similarities_FvP, index = Female_Names, columns = Pleasant_Words)
similarities_FvU = pd.DataFrame(similarities_FvU, index = Female_Names, columns = Unpleasant_Words)

print("Cosine Similarity Matrix: Female Names vs Pleasant Words")
print(similarities_FvP)
print("Cosine Similarity Matrix: Female Names vs Unpleasant Words")
print(similarities_FvU)

similarities_FvP.to_csv('BERT_base_cased_FvP.csv', index = True)
similarities_FvU.to_csv('BERT_base_cased_FvU.csv', index = True)

Cosine Similarity Matrix: Female Names vs Pleasant Words
              happy  agreeable    polite     civil  charming  gracious  \
Mary       0.761429   0.654815  0.748546  0.597887  0.818544  0.638291   
Elizabeth  0.703669   0.649813  0.714691  0.629124  0.750183  0.584110   
Patricia   0.745577   0.687537  0.763339  0.640642  0.800270  0.603197   
Jennifer   0.781565   0.640884  0.747535  0.578464  0.831570  0.591445   
Linda      0.756707   0.688496  0.746467  0.638815  0.804329  0.583984   
Barbara    0.713708   0.657614  0.713794  0.634163  0.750501  0.575018   
Margaret   0.649873   0.652149  0.679634  0.643849  0.677558  0.565292   
Susan      0.774417   0.661732  0.754990  0.602863  0.817011  0.587846   
Dorothy    0.727479   0.654933  0.717493  0.646615  0.759421  0.569782   
Sarah      0.791057   0.652335  0.747057  0.596670  0.834395  0.620116   
Jessica    0.788570   0.633518  0.735988  0.565356  0.842707  0.585603   
Helen      0.778523   0.653649  0.740755  0.587707  0.8

# TEST 3: Gender Biases in Careers

In [43]:
# Male Names
# STEM Careers
similarities_MvS = cosine_similarity(Male_Embeddings, STEM_Embeddings)
# Non-STEM Careers
similarities_MvN = cosine_similarity(Male_Embeddings, Non_STEM_Embeddings)

similarities_MvS = pd.DataFrame(similarities_MvS, index = Male_Names, columns = STEM_Careers)
similarities_MvN = pd.DataFrame(similarities_MvN, index = Male_Names, columns = Non_STEM_Careers)

print("Cosine Similarity Matrix: Male Names vs STEM Careers")
print(similarities_MvS)
print("Cosine Similarity Matrix: Male Names vs Non-STEM Careers")
print(similarities_MvN)

similarities_MvS.to_csv('BERT_base_cased_MvS.csv', index = True)
similarities_MvN.to_csv('BERT_base_cased_MvN.csv', index = True)

Cosine Similarity Matrix: Male Names vs STEM Careers
             Software Developer  Nurse Practitioner  Health Services Manager  \
James                  0.599429            0.564888                 0.541915   
John                   0.591751            0.582337                 0.551144   
Robert                 0.613324            0.569287                 0.535666   
Michael                0.619155            0.585911                 0.545180   
William                0.588663            0.569451                 0.526834   
David                  0.576887            0.540213                 0.548637   
Joseph                 0.637130            0.593042                 0.582993   
Richard                0.582995            0.548710                 0.545192   
Charles                0.624249            0.606426                 0.541633   
Thomas                 0.610949            0.581891                 0.546687   
Christopher            0.617820            0.571468                

In [44]:
# Female Names
# STEM Careers
similarities_FvS = cosine_similarity(Female_Embeddings, STEM_Embeddings)
# Non-STEM Careers
similarities_FvN = cosine_similarity(Female_Embeddings, Non_STEM_Embeddings)

similarities_FvS = pd.DataFrame(similarities_FvS, index = Female_Names, columns = STEM_Careers)
similarities_FvN = pd.DataFrame(similarities_FvN, index = Female_Names, columns = Non_STEM_Careers)

print("Cosine Similarity Matrix: Female Names vs STEM Careers")
print(similarities_FvS)
print("Cosine Similarity Matrix: Female Names vs Non-STEM Careers")
print(similarities_FvN)

similarities_FvS.to_csv('BERT_base_cased_FvS.csv', index = True)
similarities_FvN.to_csv('BERT_base_cased_FvN.csv', index = True)

Cosine Similarity Matrix: Female Names vs STEM Careers
           Software Developer  Nurse Practitioner  Health Services Manager  \
Mary                 0.568754            0.563205                 0.527755   
Elizabeth            0.577915            0.594637                 0.533104   
Patricia             0.623632            0.629222                 0.564500   
Jennifer             0.595708            0.548935                 0.553280   
Linda                0.623720            0.629222                 0.553269   
Barbara              0.609831            0.626205                 0.558687   
Margaret             0.572103            0.617076                 0.522378   
Susan                0.598048            0.564146                 0.531239   
Dorothy              0.592929            0.610545                 0.555291   
Sarah                0.587027            0.561252                 0.525294   
Jessica              0.566091            0.518740                 0.535720   
Helen    