## Part 1: Simple Wu-Palmer Comparison

In [144]:
import json
import nltk
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
from nltk.corpus import wordnet as wn
from IPython.display import display, HTML

!pip install -U spacy
import spacy
!python -m spacy download en_core_web_md 
sp = spacy.load('en_core_web_md')

Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.8/42.8 MB 28.4 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


In [145]:
# Read the JSON file whose keys are split on "_" by the next cell
# (word, ethnicity, gender) and keep the parsed mapping in `emb_dict`.
emb_dict = {}
with open("embeddings_final/word_embeddings_skip_25.txt") as emb_file:
    emb_dict = json.load(emb_file)

In [147]:
# One bucket per (ethnicity, gender) pair; each maps a word to its emb_dict entry.
white_fem, white_mal = {}, {}
black_fem, black_mal = {}, {}
mixed_fem, mixed_mal = {}, {}
asian_fem, asian_mal = {}, {}

# ethnicity tag -> (bucket for "female", bucket for every other gender tag)
buckets_by_ethnicity = {
    "white": (white_fem, white_mal),
    "black": (black_fem, black_mal),
    "mixed": (mixed_fem, mixed_mal),
    "asian": (asian_fem, asian_mal),
}

for key, entry in emb_dict.items():
    # Only the first three "_"-separated fields of the key are read.
    word_info = key.split("_")
    word, ethnic, gender = word_info[0], word_info[1], word_info[2]
    if ethnic not in buckets_by_ethnicity:
        # Entries with any other ethnicity tag are left out of every bucket.
        continue
    fem_bucket, mal_bucket = buckets_by_ethnicity[ethnic]
    if gender == "female":
        fem_bucket[word] = entry
    else:
        mal_bucket[word] = entry

In [184]:
def _exact_synset(token, candidates):
    """Return the first synset whose name prefix (before the first ".") equals `token`, else None."""
    for candidate in candidates:
        if candidate.name().split(".")[0] == token:
            return candidate
    return None


def wu_parker_calcs(dict1, dict2, sim_dict, dict_name, file_name):
    """Average pairwise WordNet `wup_similarity` (Wu-Palmer) per word and append the results to a file.

    For every word in `dict1` with a non-empty list, each string in
    `dict1[word]` is paired with each string in `dict2[word]`. A pair is scored
    with `wup_similarity` when both strings have at least one WordNet synset:
    if both strings have a synset named exactly after them, those two synsets
    are compared; otherwise the first synset of each string is compared. The
    scores are summed and divided by ``len(dict1[word]) * len(dict2[word])``,
    so pairs without synsets count as 0 in the average.

    Args:
        dict1: mapping word -> list of strings looked up in WordNet.
        dict2: mapping word -> list of strings looked up in WordNet. A word
            that is missing here is treated like an empty list (score 0).
        sim_dict: dict that receives ``word -> average similarity``; it is
            updated in place.
        dict_name: label written in front of every output line.
        file_name: base name (without ".txt") of the output file under
            ``embedding_similarities/skip_gram_wup/``.

    Side effects:
        Appends one line per entry of `sim_dict`, sorted by ascending score,
        to the output file. The file is opened in append mode, so calling the
        function twice with the same `file_name` duplicates the lines.
        Unlike earlier versions, the input lists are not sorted in place.
    """
    # token -> (synsets, exact-name synset or None); avoids repeated lookups.
    resolved = {}

    def resolve(token):
        if token not in resolved:
            candidates = wn.synsets(token)
            resolved[token] = (candidates, _exact_synset(token, candidates))
        return resolved[token]

    for word, tokens1 in dict1.items():
        if len(tokens1) == 0:
            continue
        tokens2 = dict2.get(word, [])
        sim_dict[word] = 0
        if len(tokens2) == 0:
            continue

        # Sorted copies keep the summation order stable without mutating the inputs.
        ordered1 = sorted(tokens1)
        ordered2 = sorted(tokens2)
        for embed1 in ordered1:
            syn1, exact1 = resolve(embed1)
            if len(syn1) == 0:
                continue
            for embed2 in ordered2:
                syn2, exact2 = resolve(embed2)
                if len(syn2) == 0:
                    continue
                if exact1 is not None and exact2 is not None:
                    # Compare the two exact matches with each other (the old
                    # code compared the first one with itself, always giving 1.0).
                    score = exact1.wup_similarity(exact2)
                else:
                    score = syn1[0].wup_similarity(syn2[0])
                # wup_similarity may return None when no common ancestor exists.
                if score is not None:
                    sim_dict[word] += score
        sim_dict[word] /= len(tokens1) * len(tokens2)

    sorted_avgs = dict(sorted(sim_dict.items(), key=lambda item: item[1]))
    with open(f"embedding_similarities/skip_gram_wup/{file_name}.txt", "a") as avg_file:
        for word, average in sorted_avgs.items():
            avg_file.write(f"{dict_name}[{word}] = {average}\n")

In [185]:
# White vs. Black buckets, female then male.
white_black_fem_avg_sims = {}
wu_parker_calcs(
    white_fem,
    black_fem,
    white_black_fem_avg_sims,
    dict_name="white_black_fem_avg_sims",
    file_name="white_black_f_avg_sim_skipgram",
)

white_black_mal_avg_sims = {}
wu_parker_calcs(
    white_mal,
    black_mal,
    white_black_mal_avg_sims,
    dict_name="white_black_mal_avg_sims",
    file_name="white_black_m_avg_sim_skipgram",
)

In [186]:
# White vs. Asian buckets, female then male.
white_asian_fem_avg_sims = {}
wu_parker_calcs(
    white_fem,
    asian_fem,
    white_asian_fem_avg_sims,
    dict_name="white_asian_fem_avg_sims",
    file_name="white_asian_f_avg_sim_skipgram",
)

white_asian_mal_avg_sims = {}
wu_parker_calcs(
    white_mal,
    asian_mal,
    white_asian_mal_avg_sims,
    dict_name="white_asian_mal_avg_sims",
    file_name="white_asian_m_avg_sim_skipgram",
)

In [187]:
# White vs. Mixed buckets, female then male.
# The output files used to be named "white_male_*", which does not describe a
# white/mixed comparison; they now follow the "<group>_<group>_<f|m>" pattern
# used by the other comparison cells.
# NOTE(review): update any consumer that still reads the old "white_male_*" names.
white_mixed_fem_avg_sims = {}
wu_parker_calcs(white_fem, mixed_fem, white_mixed_fem_avg_sims, "white_mixed_fem_avg_sims", "white_mixed_f_avg_sim_skipgram")

white_mixed_mal_avg_sims = {}
wu_parker_calcs(white_mal, mixed_mal, white_mixed_mal_avg_sims, "white_mixed_mal_avg_sims", "white_mixed_m_avg_sim_skipgram")

In [188]:
# Mixed vs. Black buckets, female then male.
mixed_black_fem_avg_sims = {}
wu_parker_calcs(
    mixed_fem,
    black_fem,
    mixed_black_fem_avg_sims,
    dict_name="mixed_black_fem_avg_sims",
    file_name="mixed_black_f_avg_sim_skipgram",
)

mixed_black_mal_avg_sims = {}
wu_parker_calcs(
    mixed_mal,
    black_mal,
    mixed_black_mal_avg_sims,
    dict_name="mixed_black_mal_avg_sims",
    file_name="mixed_black_m_avg_sim_skipgram",
)

In [189]:
# Asian vs. Black buckets, female then male.
asian_black_fem_avg_sims = {}
wu_parker_calcs(
    asian_fem,
    black_fem,
    asian_black_fem_avg_sims,
    dict_name="asian_black_fem_avg_sims",
    file_name="asian_black_f_avg_sim_skipgram",
)

asian_black_mal_avg_sims = {}
wu_parker_calcs(
    asian_mal,
    black_mal,
    asian_black_mal_avg_sims,
    dict_name="asian_black_mal_avg_sims",
    file_name="asian_black_m_avg_sim_skipgram",
)

In [190]:
# Female vs. male buckets within the Black group.
black_genders_avg_sims = {}
wu_parker_calcs(
    black_fem,
    black_mal,
    black_genders_avg_sims,
    dict_name="black_genders_avg_sims",
    file_name="black_genders_avg_sim_skipgram",
)

In [191]:
# Female vs. male buckets within the White group.
white_genders_avg_sims = {}
wu_parker_calcs(
    white_fem,
    white_mal,
    white_genders_avg_sims,
    dict_name="white_genders_avg_sims",
    file_name="white_genders_avg_sim_skipgram",
)

In [192]:
# Female vs. male buckets within the Mixed group.
mixed_genders_avg_sims = {}
wu_parker_calcs(
    mixed_fem,
    mixed_mal,
    mixed_genders_avg_sims,
    dict_name="mixed_genders_avg_sims",
    file_name="mixed_genders_avg_sim_skipgram",
)

In [193]:
# Female vs. male buckets within the Asian group.
asian_genders_avg_sims = {}
wu_parker_calcs(
    asian_fem,
    asian_mal,
    asian_genders_avg_sims,
    dict_name="asian_genders_avg_sims",
    file_name="asian_genders_avg_sim_skipgram",
)

In [194]:
# Pack every .txt file under embedding_similarities/skip_gram_wup/ into one gzipped tarball.
!tar czf comparisons_embeddings.tar.gz embedding_similarities/skip_gram_wup/*.txt

## Part 2: Cosine Similarity

In [None]:
from gensim.models import Word2Vec
from scipy import spatial

In [None]:
def print_vocab(model, top_n = None):
    """Print the first `top_n` entries of `model.wv.index_to_key`.

    Each line shows the entry's position, the vocabulary size and the word.

    Args:
        model: object exposing `wv.index_to_key` (a sequence of words).
        top_n: how many entries to print. With the default `None` (or a
            value <= 0) nothing is printed.
    """
    if top_n is None:
        return
    vocab = model.wv.index_to_key
    vocab_size = len(vocab)
    for index, word in enumerate(vocab):
        # Stop as soon as exactly `top_n` words were printed (the previous
        # comparison printed one word too few and kept scanning the vocabulary).
        if index >= top_n:
            break
        print(f"WORD #{index}/{vocab_size} IS: {word}")

In [None]:
def intersection_align_gensim(m1, m2, words=None):
    """Trim two models in place to their shared vocabulary, in one common order.

    The shared vocabulary is the intersection of both `wv.index_to_key` lists,
    further restricted to `words` when given. It is ordered by the summed
    "count" attribute of both models (descending), ties broken alphabetically.
    For each model the vector matrix, `key_to_index`, `index_to_key` and any
    per-key attribute arrays in `wv.expandos` are rebuilt in that order, and
    cached norms are dropped so they get recomputed from the new matrix.

    Args:
        m1, m2: models exposing gensim 4 style `wv` attributes
            (`index_to_key`, `key_to_index`, `vectors`, `get_vecattr`).
        words: optional iterable restricting the shared vocabulary.

    Returns:
        The tuple ``(m1, m2)``; both objects are modified in place unless
        their vocabularies already equal the shared vocabulary.
    """
    vocab_m1 = set(m1.wv.index_to_key)
    vocab_m2 = set(m2.wv.index_to_key)
    common_vocab = vocab_m1 & vocab_m2
    if words:
        common_vocab &= set(words)

    # Nothing to trim when neither model has a word outside the shared set.
    if not vocab_m1 - common_vocab and not vocab_m2 - common_vocab:
        return (m1, m2)

    # Alphabetical pre-sort makes the order of equal-count words deterministic
    # (set iteration order is not); the count sort below is stable.
    common_vocab = sorted(common_vocab)
    common_vocab.sort(key=lambda w: m1.wv.get_vecattr(w, "count") + m2.wv.get_vecattr(w, "count"), reverse=True)

    for m in [m1, m2]:
        # Old row index of every kept word, in the new order.
        indices = [m.wv.key_to_index[w] for w in common_vocab]
        old_arr = m.wv.vectors
        m.wv.vectors = np.array([old_arr[index] for index in indices])

        # Per-key attributes (e.g. "count") are stored by row index, so they
        # must be reordered too; otherwise get_vecattr reads another word's value.
        expandos = getattr(m.wv, "expandos", None)
        if expandos:
            keep = np.asarray(indices, dtype=np.intp)
            for attr in list(expandos):
                expandos[attr] = np.asarray(expandos[attr])[keep]

        # Norms cached for the old matrix no longer match the trimmed one.
        if getattr(m.wv, "norms", None) is not None:
            m.wv.norms = None

        m.wv.key_to_index = {key: new_index for new_index, key in enumerate(common_vocab)}
        m.wv.index_to_key = list(common_vocab)

        print(len(m.wv.key_to_index), len(m.wv.vectors))

    return (m1, m2)

In [None]:
def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    """Rotate `other_embed` onto `base_embed` with an orthogonal Procrustes fit.

    Both models are first trimmed to their shared vocabulary by
    `intersection_align_gensim` (this modifies both in place). The rotation is
    obtained from the SVD of ``other_normed.T @ base_normed`` and applied to
    the raw vectors of `other_embed`.

    Args:
        base_embed: model that defines the target space; its vectors are not rotated.
        other_embed: model whose `wv.vectors` are replaced by the rotated vectors.
        words: optional iterable restricting the vocabulary used for the fit.

    Returns:
        `other_embed`, after its vectors were rotated in place.
    """
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed, words=words)

    # Trimming replaced the vector matrices, so norms cached before that point
    # would be stale; force a recompute before asking for normed vectors.
    in_base_embed.wv.fill_norms(force=True)
    in_other_embed.wv.fill_norms(force=True)

    base_vecs = in_base_embed.wv.get_normed_vectors()
    other_vecs = in_other_embed.wv.get_normed_vectors()

    # Orthogonal Procrustes: with M = other^T . base = U S V^T, the best rotation is U V^T.
    m = other_vecs.T.dot(base_vecs)
    u, _, v = np.linalg.svd(m)
    ortho = u.dot(v)

    # An orthogonal map preserves vector lengths, so the norms filled above stay valid.
    other_embed.wv.vectors = (other_embed.wv.vectors).dot(ortho)

    return other_embed

In [None]:
# Load one saved Word2Vec model per (ethnicity, gender) group.
# NOTE(review): the file names look like training settings (window, min count,
# negative sampling) and differ between groups — confirm the intended models.
white_fem_mod = Word2Vec.load("white_models/cbow_w10_f1_1000_ns_half_neg2")
white_mal_mod = Word2Vec.load("white_models/cbow_w3_f1_1000")

black_fem_mod = Word2Vec.load("black_models/cbow_w10_f1_1000_ns_half_neg3")
black_mal_mod = Word2Vec.load("black_models/cbow_w10_f1_1000_mc2")

# NOTE(review): the female and male mixed models load the same file — verify
# this is intentional and not a copy-paste slip.
mixed_fem_mod = Word2Vec.load("mixed_models/cbow_w10_f1_1000_mc2")
mixed_mal_mod = Word2Vec.load("mixed_models/cbow_w10_f1_1000_mc2")

asian_fem_mod = Word2Vec.load("asian_models/cbow_w10_f1_1000")
asian_mal_mod = Word2Vec.load("asian_models/cbow_w10_f1_1000_mc2")

In [None]:
def cosine_similarity(model1, model2, word):
    """Return the cosine similarity between both models' vectors for `word`.

    The value is ``1 - scipy.spatial.distance.cosine(a, b)`` where ``a`` and
    ``b`` are ``model1.wv[word]`` and ``model2.wv[word]``.
    """
    vec_a = model1.wv[word]
    vec_b = model2.wv[word]
    return 1 - spatial.distance.cosine(vec_a, vec_b)

In [None]:
def _aligned_cosine_frame(base_mod, other_mod):
    """Align `other_mod` to `base_mod` and return per-word cosine similarities.

    Both models are modified in place by the alignment (vocabulary trimmed,
    `other_mod` rotated), so callers reload them afterwards.

    Returns:
        DataFrame with columns 'word', 'similarity', 'frequency_t1' (count in
        `base_mod`) and 'frequency_t2' (count in `other_mod`), one row per
        word of the shared vocabulary.
    """
    # Read the counts before aligning: the alignment rewrites key_to_index /
    # index_to_key, so a count looked up afterwards may belong to another word.
    base_counts = {w: base_mod.wv.get_vecattr(w, "count") for w in base_mod.wv.index_to_key}
    other_counts = {w: other_mod.wv.get_vecattr(w, "count") for w in other_mod.wv.index_to_key}

    smart_procrustes_align_gensim(base_mod, other_mod, words=None)

    rows = [
        [w, cosine_similarity(base_mod, other_mod, w), base_counts[w], other_counts[w]]
        for w in base_mod.wv.index_to_key
    ]
    return pd.DataFrame(rows, columns=('word', 'similarity', "frequency_t1", "frequency_t2"))


cosine_sim_wbf = _aligned_cosine_frame(white_fem_mod, black_fem_mod)
print("Created dataframe for white-black female comparison")

# Reload pristine copies: the alignment above trimmed/rotated the models.
white_fem_mod = Word2Vec.load("white_models/cbow_w10_f1_1000_ns_half_neg2")
black_fem_mod = Word2Vec.load("black_models/cbow_w10_f1_1000_ns_half_neg3")

cosine_sim_waf = _aligned_cosine_frame(white_fem_mod, asian_fem_mod)
print("Created dataframe for white-asian female comparison")

white_fem_mod = Word2Vec.load("white_models/cbow_w10_f1_1000_ns_half_neg2")
asian_fem_mod = Word2Vec.load("asian_models/cbow_w10_f1_1000")

# Stored under its own name; it previously overwrote cosine_sim_waf, so the
# white-asian frame was lost before it could be used.
cosine_sim_baf = _aligned_cosine_frame(black_fem_mod, asian_fem_mod)
print("Created dataframe for black-asian female comparison")
asian_fem_mod = Word2Vec.load("asian_models/cbow_w10_f1_1000")
black_fem_mod = Word2Vec.load("black_models/cbow_w10_f1_1000_ns_half_neg3")


In [None]:
# Print a heading followed by the top of each model's vocabulary.
for heading, vocab_model in (
    ("White female model top vocab: ", white_fem_mod),
    ("Black female model top vocab: ", black_fem_mod),
    # ("Mixed female model top vocab: ", mixed_fem_mod),
    # ("Asian female model top vocab: ", asian_fem_mod),
):
    print(heading)
    print_vocab(vocab_model, 15)

In [None]:
# Words to look up: those scored in the white/black female Wu-Palmer run.
# (The name used here before, `white_black_avg_sims`, is never defined.)
word_list = list(white_black_fem_avg_sims.keys())
defaults = ["please", "service", "time", "contact", "Team", "Dr", "mental"]

# Similarities of the fixed default words, ascending, saved as JSON.
# sort_values returns a new frame, so the filtered slice is not mutated in place.
wb_fem_default = cosine_sim_wbf[cosine_sim_wbf['word'].isin(defaults)].sort_values(by=['similarity'])
default_words_cosine = dict(zip(wb_fem_default['word'].tolist(), wb_fem_default['similarity'].tolist()))
with open('embedding_similarities/wbf_defaults_cos_sim.txt', 'w') as default_file:
    json.dump(default_words_cosine, default_file, indent=2)

# Similarities of the Wu-Palmer words, ascending (kept in memory only).
wb_fem_comparisons = cosine_sim_wbf[cosine_sim_wbf['word'].isin(word_list)].sort_values(by=['similarity'])
white_black_fem_cosine = dict(zip(wb_fem_comparisons['word'].tolist(), wb_fem_comparisons['similarity'].tolist()))
# with open('embedding_similarities/wbf_cos_sim.txt', 'w') as wbf_file:
#     json.dump(white_black_fem_cosine, wbf_file, indent=2)

In [None]:
# Words to look up: those scored in the white/asian female Wu-Palmer run.
# (The name used here before, `white_asian_avg_sims`, is never defined.)
word_list = list(white_asian_fem_avg_sims.keys())

# sort_values returns a new frame, so the filtered slice is not mutated in place.
wa_fem_comparisons = cosine_sim_waf[cosine_sim_waf['word'].isin(word_list)].sort_values(by=['similarity'])
display(wa_fem_comparisons)

# word -> cosine similarity, ascending, saved as JSON.
white_asian_fem_cosine = dict(zip(wa_fem_comparisons['word'].tolist(), wa_fem_comparisons['similarity'].tolist()))
with open('embedding_similarities/waf_cos_sim.txt', 'w') as waf_file:
    json.dump(white_asian_fem_cosine, waf_file, indent=2)

## Part 3: NDCG

In [None]:
import sklearn
from sklearn.metrics import ndcg_score


def _paired_score_rows(reference_sims, candidate_sims):
    """Return two (1, n) float arrays holding both dicts' values for their shared words.

    The words are taken in sorted order so that column i of both arrays refers
    to the same word, which is what `ndcg_score(y_true, y_score)` requires.
    """
    shared_words = sorted(set(reference_sims) & set(candidate_sims))
    reference_row = np.asarray([[reference_sims[w] for w in shared_words]], dtype=float)
    candidate_row = np.asarray([[candidate_sims[w] for w in shared_words]], dtype=float)
    return reference_row, candidate_row


# Reference ranking: white/black female similarities. The previous name,
# `white_black_avg_sims`, is never defined, and wrapping dict.values() in a
# list produced an object array instead of a numeric one.
score_wbf_black = ndcg_score(*_paired_score_rows(white_black_fem_avg_sims, black_genders_avg_sims))
score_wbf_white = ndcg_score(*_paired_score_rows(white_black_fem_avg_sims, white_genders_avg_sims))

print(f"BLACK M/F VS WHITE/BLACK F: {score_wbf_black}")
print(f"WHITE M/F VS WHITE/BLACK F: {score_wbf_white}")