## Part 1: Wu-Palmer Comparison

In [316]:
import json
import nltk
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
from nltk.corpus import wordnet as wn
from IPython.display import display, HTML

!pip install -U spacy
import spacy
!python -m spacy download en_core_web_md 
sp = spacy.load('en_core_web_md')

!pip install -U enchant
import enchant
from enchant.utils import levenshtein

!pip install -U textblob
import textblob
from textblob import TextBlob, Word

Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [317]:
# Load the embedding dictionary from its JSON file. The encoding is given
# explicitly so the file is decoded the same way on every platform (JSON text
# is UTF-8); without it, open() falls back to the locale's default encoding.
with open("embeddings_final/word_embeddings_25.txt", encoding="utf-8") as embeddings:
    emb_dict = json.load(embeddings)

In [318]:
# One dict per (ethnicity, gender) group, each mapping word -> emb_dict value.
white_fem, white_mal = {}, {}
black_fem, black_mal = {}, {}
mixed_fem, mixed_mal = {}, {}
asian_fem, asian_mal = {}, {}

# ethnicity label -> (female dict, male dict)
_GROUPS_BY_ETHNICITY = {
    "white": (white_fem, white_mal),
    "black": (black_fem, black_mal),
    "mixed": (mixed_fem, mixed_mal),
    "asian": (asian_fem, asian_mal),
}

for key, embedding in emb_dict.items():
    # NOTE(review): keys are presumably "<word>_<ethnicity>_<gender>"; a word
    # that itself contains "_" would be split in the wrong place -- confirm.
    parts = key.split("_")
    word, ethnic, gender = parts[0], parts[1], parts[2]
    group_pair = _GROUPS_BY_ETHNICITY.get(ethnic)
    if group_pair is None:
        # Keys with any other ethnicity label are skipped.
        continue
    female_group, male_group = group_pair
    # Every gender label other than "female" lands in the male dict.
    target = female_group if gender == "female" else male_group
    target[word] = embedding

In [345]:
def wu_palmer_calcs(dict1, dict2, sim_dict, dict_name, file_name):
    """Average a hybrid spaCy / Wu-Palmer similarity per word and append it to a file.

    For every word in ``dict1`` with a non-empty list, each entry of
    ``dict1[word]`` is paired with each entry of ``dict2[word]``. A pair is
    scored with the spaCy token similarity of the two strings. When WordNet
    has, for both strings, a synset whose name stem equals the string, the
    pair's score is the larger of the spaCy score and the Wu-Palmer similarity
    of those two synsets. The mean over all pairs is stored in
    ``sim_dict[word]``; it stays 0 when ``dict2`` has no entries for the word.

    Args:
        dict1: Mapping word -> list of strings (first group).
        dict2: Mapping word -> list of strings (second group). Words missing
            here are treated like words with an empty list.
        sim_dict: Dict updated in place with word -> average similarity.
        dict_name: Label written in front of every output line.
        file_name: Stem of the output file inside
            ``embedding_similarities/skip_gram_wup/``.

    Returns:
        None. Results are delivered through ``sim_dict`` and the output file.
    """
    import os  # local import: the notebook's import cell does not provide it

    def _exact_synset(term):
        """Return the first synset whose name stem equals ``term``, or None."""
        for candidate in wn.synsets(term):
            if candidate.name().split(".")[0] == term:
                return candidate
        return None

    for word, embeds1 in dict1.items():
        if len(embeds1) == 0:
            continue
        sim_dict[word] = 0
        # .get() instead of indexing: a word absent from dict2 no longer raises.
        embeds2 = dict2.get(word) or []
        if len(embeds2) == 0:
            continue
        total = 0
        for embed1 in embeds1:
            # The WordNet lookup for embed1 does not depend on embed2.
            synset1 = _exact_synset(embed1)
            for embed2 in embeds2:
                # NOTE(review): tokens[0]/tokens[1] are only the two words if
                # spaCy tokenizes each string as a single token -- confirm.
                tokens = sp(f"{embed1} {embed2}")
                score = tokens[0].similarity(tokens[1])
                synset2 = _exact_synset(embed2) if synset1 is not None else None
                if synset1 is not None and synset2 is not None:
                    wup = synset1.wup_similarity(synset2)
                    # wup_similarity may return None when no common ancestor
                    # is found; keep the spaCy score in that case.
                    if wup is not None:
                        score = max(score, wup)
                # Every pair contributes a score (previously pairs without any
                # WordNet synsets added nothing but still counted in the mean).
                total += score
        sim_dict[word] = total / (len(embeds1) * len(embeds2))

    sorted_avgs = dict(sorted(sim_dict.items(), key=lambda item: item[1]))
    out_dir = "embedding_similarities/skip_gram_wup"
    os.makedirs(out_dir, exist_ok=True)
    # NOTE(review): append mode means re-running a cell adds the same lines
    # again to the existing file -- confirm this is wanted.
    with open(f"{out_dir}/{file_name}.txt", "a") as avg_file:
        for word, average in sorted_avgs.items():
            avg_file.write(f"{dict_name}[{word}] = {average}\n")

In [346]:
# Driver cells for Part 1. Each call fills the freshly created dict with
# per-word averages and appends the sorted values to
# embedding_similarities/skip_gram_wup/<file_name>.txt.
# White vs. black, female.
white_black_fem_avg_sims = {}
wu_palmer_calcs(white_fem, black_fem, white_black_fem_avg_sims, "white_black_fem_avg_sims", "white_black_f_avg_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [347]:
# White vs. black, male.
white_black_mal_avg_sims = {}
wu_palmer_calcs(white_mal, black_mal, white_black_mal_avg_sims, "white_black_mal_avg_sims", "white_black_m_avg_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [348]:
# White vs. asian, female.
white_asian_fem_avg_sims = {}
wu_palmer_calcs(white_fem, asian_fem, white_asian_fem_avg_sims, "white_asian_fem_avg_sims", "white_asian_f_avg_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [349]:
# White vs. asian, male.
white_asian_mal_avg_sims = {}
wu_palmer_calcs(white_mal, asian_mal, white_asian_mal_avg_sims, "white_asian_mal_avg_sims", "white_asian_m_avg_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [350]:
# White vs. mixed, female then male.
white_mixed_fem_avg_sims = {}
wu_palmer_calcs(white_fem, mixed_fem, white_mixed_fem_avg_sims, "white_mixed_fem_avg_sims", "white_mixed_f_avg_sim_skipgram")

white_mixed_mal_avg_sims = {}
wu_palmer_calcs(white_mal, mixed_mal, white_mixed_mal_avg_sims, "white_mixed_mal_avg_sims", "white_mixed_m_avg_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [351]:
# Mixed vs. black, female then male.
mixed_black_fem_avg_sims = {}
wu_palmer_calcs(mixed_fem, black_fem, mixed_black_fem_avg_sims, "mixed_black_fem_avg_sims", "mixed_black_f_avg_sim_skipgram")

mixed_black_mal_avg_sims = {}
wu_palmer_calcs(mixed_mal, black_mal, mixed_black_mal_avg_sims, "mixed_black_mal_avg_sims", "mixed_black_m_avg_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [352]:
# Asian vs. black, female then male.
asian_black_fem_avg_sims = {}
wu_palmer_calcs(asian_fem, black_fem, asian_black_fem_avg_sims, "asian_black_fem_avg_sims", "asian_black_f_avg_sim_skipgram")

asian_black_mal_avg_sims = {}
wu_palmer_calcs(asian_mal, black_mal, asian_black_mal_avg_sims, "asian_black_mal_avg_sims", "asian_black_m_avg_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [353]:
# Asian vs. mixed, female then male.
asian_mixed_fem_avg_sims = {}
wu_palmer_calcs(asian_fem, mixed_fem, asian_mixed_fem_avg_sims, "asian_mixed_fem_avg_sims", "asian_mixed_f_avg_sim_skipgram")

asian_mixed_mal_avg_sims = {}
wu_palmer_calcs(asian_mal, mixed_mal, asian_mixed_mal_avg_sims, "asian_mixed_mal_avg_sims", "asian_mixed_m_avg_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [354]:
# Black, female vs. male.
black_genders_avg_sims = {}
wu_palmer_calcs(black_fem, black_mal, black_genders_avg_sims, "black_genders_avg_sims", "black_genders_avg_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [355]:
# White, female vs. male.
white_genders_avg_sims = {}
wu_palmer_calcs(white_fem, white_mal, white_genders_avg_sims, "white_genders_avg_sims", "white_genders_avg_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [356]:
# Mixed, female vs. male.
mixed_genders_avg_sims = {}
wu_palmer_calcs(mixed_fem, mixed_mal, mixed_genders_avg_sims, "mixed_genders_avg_sims", "mixed_genders_avg_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [357]:
# Asian, female vs. male.
asian_genders_avg_sims = {}
wu_palmer_calcs(asian_fem, asian_mal, asian_genders_avg_sims, "asian_genders_avg_sims", "asian_genders_avg_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [358]:
# Bundle every .txt result file of this part into one gzip-compressed archive.
!tar czf wu_palmer_comparisons.tar.gz embedding_similarities/skip_gram_wup/*.txt

## Part 2: Cosine Similarity

In [320]:
from gensim.models import Word2Vec
from scipy import spatial

In [321]:
def print_vocab(model, top_n = None):
    """Print vocabulary entries of ``model`` together with their index.

    Args:
        model: Object exposing ``model.wv.index_to_key`` (the vocabulary list).
        top_n: Number of leading entries to print. ``None`` (the default)
            prints the whole vocabulary; zero or a negative value prints
            nothing.

    Returns:
        None. One line per word is written to stdout.
    """
    vocab = model.wv.index_to_key
    total = len(vocab)
    # Slice first so exactly ``top_n`` words are printed (the previous counter
    # check stopped one word early) and the rest of the vocabulary is skipped.
    shown = vocab if top_n is None else vocab[:max(top_n, 0)]
    for index, word in enumerate(shown):
        print(f"WORD #{index}/{total} IS: {word}")

In [322]:
def intersection_align_gensim(m1, m2, words=None):
    """Restrict two models, in place, to their shared vocabulary in a shared order.

    After the call both ``m1.wv`` and ``m2.wv`` contain exactly the common
    words (optionally further restricted to ``words``), ordered by descending
    summed "count" attribute, so row ``i`` of both vector matrices refers to
    the same word. If both vocabularies already equal the common vocabulary,
    the models are returned untouched and nothing is printed.

    Args:
        m1: First model; ``m1.wv`` must expose ``index_to_key``,
            ``key_to_index``, ``vectors`` and ``get_vecattr``.
        m2: Second model, same requirements.
        words: Optional iterable; when truthy, only these words are kept.

    Returns:
        The tuple ``(m1, m2)`` (the same objects that were passed in).
    """
    vocab_m1 = set(m1.wv.index_to_key)
    vocab_m2 = set(m2.wv.index_to_key)
    common_vocab = vocab_m1 & vocab_m2
    if words:
        common_vocab &= set(words)
    # Nothing to do when neither model has a word outside the common set.
    if not vocab_m1 - common_vocab and not vocab_m2 - common_vocab:
        return (m1, m2)

    # Most frequent words first; the word itself breaks ties so the order does
    # not depend on set iteration order.
    common_vocab = sorted(
        common_vocab,
        key=lambda w: (
            -(int(m1.wv.get_vecattr(w, "count")) + int(m2.wv.get_vecattr(w, "count"))),
            w,
        ),
    )
    for m in [m1, m2]:
        indices = np.asarray([m.wv.key_to_index[w] for w in common_vocab], dtype=int)
        m.wv.vectors = m.wv.vectors[indices]

        # Per-key attribute arrays (e.g. "count") are indexed like the vectors;
        # reorder them too, otherwise get_vecattr() would read another word's
        # entry once key_to_index is replaced below.
        expandos = getattr(m.wv, "expandos", None)
        if expandos:
            for attr, values in list(expandos.items()):
                expandos[attr] = np.asarray(values)[indices]
        # Cached norms belong to the old matrix; drop them so they are
        # recomputed on demand.
        m.wv.norms = None

        m.wv.key_to_index = {key: new_index for new_index, key in enumerate(common_vocab)}
        m.wv.index_to_key = list(common_vocab)

        print(len(m.wv.key_to_index), len(m.wv.vectors))

    return (m1, m2)

In [323]:
def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    """Rotate ``other_embed`` onto ``base_embed`` with orthogonal Procrustes.

    Both models are first cut down, in place, to their common vocabulary by
    ``intersection_align_gensim``. The rotation is taken from the SVD of
    ``other_normed.T @ base_normed`` and applied to ``other_embed.wv.vectors``.

    Args:
        base_embed: Reference model; its vocabulary may be trimmed, its
            vectors are not rotated.
        other_embed: Model whose vectors are replaced by the rotated ones.
        words: Optional word restriction forwarded to the vocabulary alignment.

    Returns:
        ``other_embed`` (the same object, modified in place).
    """
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed, words=words)

    # The vector matrices may just have been replaced; force the norms to be
    # recomputed so get_normed_vectors() cannot use values cached for the old
    # matrices.
    in_base_embed.wv.fill_norms(force=True)
    in_other_embed.wv.fill_norms(force=True)
    base_vecs = in_base_embed.wv.get_normed_vectors()
    other_vecs = in_other_embed.wv.get_normed_vectors()

    # Product of the two normed matrices, then its SVD; np.linalg.svd returns
    # the right factor already transposed, so u @ vt is the rotation.
    m = other_vecs.T.dot(base_vecs)
    u, _, vt = np.linalg.svd(m)
    ortho = u.dot(vt)
    other_embed.wv.vectors = (other_embed.wv.vectors).dot(ortho)

    return other_embed

In [324]:
# Load one saved Word2Vec model per ethnicity/gender group. The alignment
# cells further down modify these objects in place and reload them from the
# same paths whenever a fresh copy is needed.
white_fem_mod = Word2Vec.load("white_models/cbow_w10_f1_1000_ns_half_neg2")
white_mal_mod = Word2Vec.load("white_models/cbow_w3_f1_1000")

black_fem_mod = Word2Vec.load("black_models/cbow_w10_f1_1000_ns_half_neg3")
black_mal_mod = Word2Vec.load("black_models/cbow_w10_f1_1000_mc2")

# NOTE(review): the female and male "mixed" models are loaded from the very
# same file, so every mixed female/male comparison compares a model with a
# copy of itself -- confirm whether one of these paths is wrong.
mixed_fem_mod = Word2Vec.load("mixed_models/cbow_w10_f1_1000_mc2")
mixed_mal_mod = Word2Vec.load("mixed_models/cbow_w10_f1_1000_mc2")

asian_fem_mod = Word2Vec.load("asian_models/cbow_w10_f1_1000")
asian_mal_mod = Word2Vec.load("asian_models/cbow_w10_f1_1000_mc2")

In [325]:
def cosine_similarity(model1, model2, word):
    """Return one minus the cosine distance between ``word``'s two vectors.

    The vectors are looked up as ``model1.wv[word]`` and ``model2.wv[word]``.
    """
    first_vector = model1.wv[word]
    second_vector = model2.wv[word]
    distance = spatial.distance.cosine(first_vector, second_vector)
    return 1 - distance

In [326]:
def _aligned_cosine_frame(base_mod, other_mod, label):
    """Align ``other_mod`` to ``base_mod`` and tabulate per-word cosine similarity.

    Both models are modified in place by ``smart_procrustes_align_gensim``.
    The returned DataFrame has one row per word of the aligned base vocabulary
    with the columns ``word``, ``similarity``, ``frequency_t1`` (count in
    ``base_mod``) and ``frequency_t2`` (count in ``other_mod``).
    """
    # Counts are captured before alignment so they do not depend on how the
    # alignment step re-indexes the vocabulary.
    base_counts = {w: base_mod.wv.get_vecattr(w, "count") for w in base_mod.wv.index_to_key}
    other_counts = {w: other_mod.wv.get_vecattr(w, "count") for w in other_mod.wv.index_to_key}

    smart_procrustes_align_gensim(base_mod, other_mod, words=None)
    rows = (
        [w, cosine_similarity(base_mod, other_mod, w), base_counts[w], other_counts[w]]
        for w in base_mod.wv.index_to_key
    )
    frame = pd.DataFrame(rows, columns = ('word', 'similarity', "frequency_t1", "frequency_t2"))
    print(f"Created dataframe for {label} comparison")
    return frame


# Alignment trims and rotates the models in place, so the models involved are
# reloaded from disk before they are used in another pairing.

# black and white fem / male
cosine_sim_wbf = _aligned_cosine_frame(white_fem_mod, black_fem_mod, "white-black female")
cosine_sim_wbm = _aligned_cosine_frame(white_mal_mod, black_mal_mod, "white-black male")

# reset wf, bf, wm, bm
white_fem_mod = Word2Vec.load("white_models/cbow_w10_f1_1000_ns_half_neg2")
black_fem_mod = Word2Vec.load("black_models/cbow_w10_f1_1000_ns_half_neg3")
white_mal_mod = Word2Vec.load("white_models/cbow_w3_f1_1000")
black_mal_mod = Word2Vec.load("black_models/cbow_w10_f1_1000_mc2")

# white and asian fem / male
cosine_sim_waf = _aligned_cosine_frame(white_fem_mod, asian_fem_mod, "white-asian female")
cosine_sim_wam = _aligned_cosine_frame(white_mal_mod, asian_mal_mod, "white-asian male")

# reset wf, af, wm, am
white_fem_mod = Word2Vec.load("white_models/cbow_w10_f1_1000_ns_half_neg2")
asian_fem_mod = Word2Vec.load("asian_models/cbow_w10_f1_1000")
white_mal_mod = Word2Vec.load("white_models/cbow_w3_f1_1000")
asian_mal_mod = Word2Vec.load("asian_models/cbow_w10_f1_1000_mc2")

# white and mixed fem / male
cosine_sim_wmf = _aligned_cosine_frame(white_fem_mod, mixed_fem_mod, "white-mixed female")
cosine_sim_wmm = _aligned_cosine_frame(white_mal_mod, mixed_mal_mod, "white-mixed male")

# reset wf, mf, wm, mm
white_fem_mod = Word2Vec.load("white_models/cbow_w10_f1_1000_ns_half_neg2")
mixed_fem_mod = Word2Vec.load("mixed_models/cbow_w10_f1_1000_mc2")
white_mal_mod = Word2Vec.load("white_models/cbow_w3_f1_1000")
mixed_mal_mod = Word2Vec.load("mixed_models/cbow_w10_f1_1000_mc2")

# black and asian fem / male
cosine_sim_baf = _aligned_cosine_frame(black_fem_mod, asian_fem_mod, "black-asian female")
cosine_sim_bam = _aligned_cosine_frame(black_mal_mod, asian_mal_mod, "black-asian male")

# reset af, bf, am, bm
asian_fem_mod = Word2Vec.load("asian_models/cbow_w10_f1_1000")
black_fem_mod = Word2Vec.load("black_models/cbow_w10_f1_1000_ns_half_neg3")
asian_mal_mod = Word2Vec.load("asian_models/cbow_w10_f1_1000_mc2")
black_mal_mod = Word2Vec.load("black_models/cbow_w10_f1_1000_mc2")

# black and mixed fem / male (the male message used to say "black-asian")
cosine_sim_bmf = _aligned_cosine_frame(black_fem_mod, mixed_fem_mod, "black-mixed female")
cosine_sim_bmm = _aligned_cosine_frame(black_mal_mod, mixed_mal_mod, "black-mixed male")

# reset mf, bf, mm, bm
mixed_fem_mod = Word2Vec.load("mixed_models/cbow_w10_f1_1000_mc2")
black_fem_mod = Word2Vec.load("black_models/cbow_w10_f1_1000_ns_half_neg3")
mixed_mal_mod = Word2Vec.load("mixed_models/cbow_w10_f1_1000_mc2")
black_mal_mod = Word2Vec.load("black_models/cbow_w10_f1_1000_mc2")

# asian and mixed fem / male
cosine_sim_amf = _aligned_cosine_frame(asian_fem_mod, mixed_fem_mod, "asian-mixed female")
cosine_sim_amm = _aligned_cosine_frame(asian_mal_mod, mixed_mal_mod, "asian-mixed male")

# reset mf, af, mm, am
mixed_fem_mod = Word2Vec.load("mixed_models/cbow_w10_f1_1000_mc2")
asian_fem_mod = Word2Vec.load("asian_models/cbow_w10_f1_1000")
mixed_mal_mod = Word2Vec.load("mixed_models/cbow_w10_f1_1000_mc2")
asian_mal_mod = Word2Vec.load("asian_models/cbow_w10_f1_1000_mc2")

# female vs. male within each ethnicity
cosine_sim_w_gend = _aligned_cosine_frame(white_fem_mod, white_mal_mod, "white male & female")
cosine_sim_b_gend = _aligned_cosine_frame(black_fem_mod, black_mal_mod, "black male & female")
cosine_sim_m_gend = _aligned_cosine_frame(mixed_fem_mod, mixed_mal_mod, "mixed male & female")
cosine_sim_a_gend = _aligned_cosine_frame(asian_fem_mod, asian_mal_mod, "asian male & female")

5207 5207
5207 5207
Created dataframe for white-black female comparison
1843 1843
1843 1843
Created dataframe for white-black male comparison
2910 2910
2910 2910
Created dataframe for white-asian female comparison
1074 1074
1074 1074
Created dataframe for white-asian male comparison
1001 1001
1001 1001
Created dataframe for white-mixed female comparison
933 933
933 933
Created dataframe for white-mixed male comparison
2742 2742
2742 2742
Created dataframe for black-asian female comparison
864 864
864 864
Created dataframe for black-asian male comparison
988 988
988 988
Created dataframe for black-mixed female comparison
738 738
738 738
Created dataframe for black-asian male comparison
888 888
888 888
Created dataframe for asian-mixed female comparison
585 585
585 585
Created dataframe for asian-mixed male comparison
4855 4855
4855 4855
Created dataframe for white male & female comparison
1865 1865
1865 1865
Created dataframe for black male & female comparison
Created dataframe for mixe

In [327]:
def calc_cos_sim(eth_gen_dict, word_list, file_name, cos_sim_dict):
    """Collect the similarities of selected words and dump them as JSON.

    Args:
        eth_gen_dict: Dict updated in place with word -> similarity, inserted
            in ascending order of similarity.
        word_list: Iterable of words to keep (e.g. a ``dict.keys()`` view).
        file_name: Stem of the output file inside
            ``embedding_similarities/cosine_sim/``.
        cos_sim_dict: Despite the name, a DataFrame with at least the columns
            ``word`` and ``similarity``; it is not modified.

    Returns:
        None. The whole of ``eth_gen_dict`` is written to the output file,
        which is overwritten if it exists.
    """
    import os  # local import: the notebook's import cell does not provide it

    selected = cos_sim_dict[cos_sim_dict['word'].isin(list(word_list))]
    # Sort into a new frame rather than in place on a filtered slice.
    ordered = selected.sort_values(by=['similarity'])
    for word, similarity in zip(ordered['word'], ordered['similarity']):
        # Plain floats keep the dict JSON-serialisable whatever numpy float
        # type the column holds.
        eth_gen_dict[word] = float(similarity)

    out_dir = 'embedding_similarities/cosine_sim'
    os.makedirs(out_dir, exist_ok=True)
    with open(f'{out_dir}/{file_name}.txt', 'w') as curr_file:
        json.dump(eth_gen_dict, curr_file, indent=2)

In [328]:
# Driver cells for Part 2. Each call keeps the rows of the given similarity
# DataFrame whose word is a key of the matching Part 1 dict, stores them in the
# new dict and writes embedding_similarities/cosine_sim/<file_name>.txt.
# NOTE(review): these cells need the Part 1 *_avg_sims dicts to exist in the
# kernel; the saved execution counts of this part are lower than those of
# Part 1, so the cells were last run out of document order -- confirm with
# Restart & Run All.
# White vs. black, female.
white_black_fem_cosine = {}
calc_cos_sim(white_black_fem_cosine,  white_black_fem_avg_sims.keys(), "white_black_fem_cos_sim", cosine_sim_wbf)

In [329]:
# White vs. black, male.
white_black_mal_cosine = {}
calc_cos_sim(white_black_mal_cosine,  white_black_mal_avg_sims.keys(), "white_black_mal_cos_sim", cosine_sim_wbm)

In [330]:
# White vs. asian, female.
white_asian_fem_cosine = {}
calc_cos_sim(white_asian_fem_cosine,  white_asian_fem_avg_sims.keys(), "white_asian_fem_cos_sim", cosine_sim_waf)

In [331]:
# White vs. asian, male.
white_asian_mal_cosine = {}
calc_cos_sim(white_asian_mal_cosine,  white_asian_mal_avg_sims.keys(), "white_asian_mal_cos_sim", cosine_sim_wam)

In [332]:
# White vs. mixed, female.
white_mixed_fem_cosine = {}
calc_cos_sim(white_mixed_fem_cosine,  white_mixed_fem_avg_sims.keys(), "white_mixed_fem_cos_sim", cosine_sim_wmf)

In [333]:
# White vs. mixed, male.
white_mixed_mal_cosine = {}
calc_cos_sim(white_mixed_mal_cosine,  white_mixed_mal_avg_sims.keys(), "white_mixed_mal_cos_sim", cosine_sim_wmm)

In [334]:
# Black vs. asian, female (word list comes from asian_black_fem_avg_sims).
black_asian_fem_cosine = {}
calc_cos_sim(black_asian_fem_cosine,  asian_black_fem_avg_sims.keys(), "black_asian_fem_cos_sim", cosine_sim_baf)

In [335]:
# Black vs. asian, male (word list comes from asian_black_mal_avg_sims).
black_asian_mal_cosine = {}
calc_cos_sim(black_asian_mal_cosine,  asian_black_mal_avg_sims.keys(), "black_asian_mal_cos_sim", cosine_sim_bam)

In [336]:
# Black vs. mixed, female (word list comes from mixed_black_fem_avg_sims).
black_mixed_fem_cosine = {}
calc_cos_sim(black_mixed_fem_cosine,  mixed_black_fem_avg_sims.keys(), "black_mixed_fem_cos_sim", cosine_sim_bmf)

In [337]:
# Black vs. mixed, male (word list comes from mixed_black_mal_avg_sims).
black_mixed_mal_cosine = {}
calc_cos_sim(black_mixed_mal_cosine, mixed_black_mal_avg_sims.keys(), "black_mixed_mal_cos_sim", cosine_sim_bmm)

In [338]:
# Asian vs. mixed, female.
asian_mixed_fem_cosine = {}
calc_cos_sim(asian_mixed_fem_cosine,  asian_mixed_fem_avg_sims.keys(), "asian_mixed_fem_cos_sim", cosine_sim_amf)

In [339]:
# Asian vs. mixed, male.
asian_mixed_mal_cosine = {}
calc_cos_sim(asian_mixed_mal_cosine,  asian_mixed_mal_avg_sims.keys(), "asian_mixed_mal_cos_sim", cosine_sim_amm)

In [340]:
# White, female vs. male.
white_genders_cosine = {}
calc_cos_sim(white_genders_cosine,  white_genders_avg_sims.keys(), "white_genders_cos_sim", cosine_sim_w_gend)

In [341]:
# Black, female vs. male.
black_genders_cosine = {}
calc_cos_sim(black_genders_cosine,  black_genders_avg_sims.keys(), "black_genders_cos_sim", cosine_sim_b_gend)

In [342]:
# Mixed, female vs. male.
mixed_genders_cosine = {}
calc_cos_sim(mixed_genders_cosine,  mixed_genders_avg_sims.keys(), "mixed_genders_cos_sim", cosine_sim_m_gend)

In [343]:
# Asian, female vs. male.
asian_genders_cosine = {}
calc_cos_sim(asian_genders_cosine,  asian_genders_avg_sims.keys(), "asian_genders_cos_sim", cosine_sim_a_gend)

In [344]:
# Bundle every .txt result file of this part into one gzip-compressed archive.
!tar czf comparisons_embeddings_cosine_sim.tar.gz embedding_similarities/cosine_sim/*.txt

## Part 3: Other Similarity Measures (Leacock-Chodorow)

In [420]:
def leacock_chodorow_calcs(dict1, dict2, sim_dict, dict_name, file_name):
    """Average a hybrid spaCy / Leacock-Chodorow similarity per word and append it to a file.

    For every word in ``dict1`` with a non-empty list, each entry of
    ``dict1[word]`` is paired with each entry of ``dict2[word]``. A pair is
    scored with the spaCy token similarity of the two strings. When WordNet
    has a synset whose name stem equals the first string, and a synset of the
    same part of speech whose name stem equals the second string, the pair's
    score is the larger of the spaCy score and the Leacock-Chodorow similarity
    of those synsets. The mean over all pairs is stored in ``sim_dict[word]``;
    it stays 0 when ``dict2`` has no entries for the word.

    Args:
        dict1: Mapping word -> list of strings (first group).
        dict2: Mapping word -> list of strings (second group). Words missing
            here are treated like words with an empty list.
        sim_dict: Dict updated in place with word -> average similarity.
        dict_name: Label written in front of every output line.
        file_name: Stem of the output file inside
            ``embedding_similarities/skip_gram_lch/``.

    Returns:
        None. Results are delivered through ``sim_dict`` and the output file.
    """
    import os  # local import: the notebook's import cell does not provide it

    def _exact_synset(term, pos=None):
        """Return the first synset named after ``term`` (and of ``pos`` if given), or None."""
        for candidate in wn.synsets(term):
            if candidate.name().split(".")[0] != term:
                continue
            if pos is None or candidate.pos() == pos:
                return candidate
        return None

    for word, embeds1 in dict1.items():
        if len(embeds1) == 0:
            continue
        sim_dict[word] = 0
        # .get() instead of indexing: a word absent from dict2 no longer raises.
        embeds2 = dict2.get(word) or []
        if len(embeds2) == 0:
            continue
        total = 0
        for embed1 in embeds1:
            # The WordNet lookup for embed1 does not depend on embed2.
            synset1 = _exact_synset(embed1)
            for embed2 in embeds2:
                # NOTE(review): tokens[0]/tokens[1] are only the two words if
                # spaCy tokenizes each string as a single token -- confirm.
                tokens = sp(f"{embed1} {embed2}")
                score = tokens[0].similarity(tokens[1])
                # lch_similarity requires both synsets to share a part of
                # speech, hence the pos filter on the second lookup.
                synset2 = _exact_synset(embed2, synset1.pos()) if synset1 is not None else None
                if synset1 is not None and synset2 is not None:
                    lch = synset1.lch_similarity(synset2)
                    # lch_similarity may return None (no path / no taxonomy
                    # depth); keep the spaCy score in that case.
                    # NOTE(review): LCH values are not limited to [0, 1], so
                    # max() mixes two differently scaled scores -- confirm.
                    if lch is not None:
                        score = max(score, lch)
                # Every pair contributes a score (previously pairs without any
                # WordNet synsets added nothing but still counted in the mean).
                total += score
        sim_dict[word] = total / (len(embeds1) * len(embeds2))

    sorted_avgs = dict(sorted(sim_dict.items(), key=lambda item: item[1]))
    out_dir = "embedding_similarities/skip_gram_lch"
    os.makedirs(out_dir, exist_ok=True)
    # NOTE(review): append mode means re-running a cell adds the same lines
    # again to the existing file -- confirm this is wanted.
    with open(f"{out_dir}/{file_name}.txt", "a") as avg_file:
        for word, average in sorted_avgs.items():
            avg_file.write(f"{dict_name}[{word}] = {average}\n")

In [421]:
# Driver cells for Part 3. Each call fills the freshly created dict with
# per-word averages and appends the sorted values to
# embedding_similarities/skip_gram_lch/<file_name>.txt.
# White vs. black, female.
white_black_fem_lch_sims = {}
leacock_chodorow_calcs(white_fem, black_fem, white_black_fem_lch_sims, "white_black_fem_lch_sims", "white_black_f_lch_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [422]:
# White vs. black, male.
white_black_mal_lch_sims = {}
leacock_chodorow_calcs(white_mal, black_mal, white_black_mal_lch_sims, "white_black_mal_lch_sims", "white_black_m_lch_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [423]:
# White vs. asian, female.
white_asian_fem_lch_sims = {}
leacock_chodorow_calcs(white_fem, asian_fem, white_asian_fem_lch_sims, "white_asian_fem_lch_sims", "white_asian_f_lch_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [424]:
# White vs. asian, male.
white_asian_mal_lch_sims = {}
leacock_chodorow_calcs(white_mal, asian_mal, white_asian_mal_lch_sims, "white_asian_mal_lch_sims", "white_asian_m_lch_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [425]:
# White vs. mixed, female.
white_mixed_fem_lch_sims = {}
leacock_chodorow_calcs(white_fem, mixed_fem, white_mixed_fem_lch_sims, "white_mixed_fem_lch_sims", "white_mixed_f_lch_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [426]:
# White vs. mixed, male.
white_mixed_mal_lch_sims = {}
leacock_chodorow_calcs(white_mal, mixed_mal, white_mixed_mal_lch_sims, "white_mixed_mal_lch_sims", "white_mixed_m_lch_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [427]:
# Black vs. mixed, female.
black_mixed_fem_lch_sims = {}
leacock_chodorow_calcs(black_fem, mixed_fem, black_mixed_fem_lch_sims, "black_mixed_fem_lch_sims", "black_mixed_f_lch_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [428]:
# Black vs. mixed, male.
black_mixed_mal_lch_sims = {}
leacock_chodorow_calcs(black_mal, mixed_mal, black_mixed_mal_lch_sims, "black_mixed_mal_lch_sims", "black_mixed_m_lch_sim_skipgram")

In [429]:
# Black vs. asian, female.
black_asian_fem_lch_sims = {}
leacock_chodorow_calcs(black_fem, asian_fem, black_asian_fem_lch_sims, "black_asian_fem_lch_sims", "black_asian_f_lch_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [430]:
# Black vs. asian, male.
black_asian_mal_lch_sims = {}
leacock_chodorow_calcs(black_mal, asian_mal, black_asian_mal_lch_sims, "black_asian_mal_lch_sims", "black_asian_m_lch_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [431]:
# Mixed vs. asian, female.
mixed_asian_fem_lch_sims = {}
leacock_chodorow_calcs(mixed_fem, asian_fem, mixed_asian_fem_lch_sims, "mixed_asian_fem_lch_sims", "mixed_asian_f_lch_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [432]:
# Mixed vs. asian, male.
mixed_asian_mal_lch_sims = {}
leacock_chodorow_calcs(mixed_mal, asian_mal, mixed_asian_mal_lch_sims, "mixed_asian_mal_lch_sims", "mixed_asian_m_lch_sim_skipgram")

In [434]:
# White, female vs. male.
white_genders_lch_sims = {}
leacock_chodorow_calcs(white_fem, white_mal, white_genders_lch_sims, "white_genders_lch_sims", "white_genders_lch_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [435]:
# Black, female vs. male.
black_genders_lch_sims = {}
leacock_chodorow_calcs(black_fem, black_mal, black_genders_lch_sims, "black_genders_lch_sims", "black_genders_lch_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [436]:
# Mixed, female vs. male.
mixed_genders_lch_sims = {}
leacock_chodorow_calcs(mixed_fem, mixed_mal, mixed_genders_lch_sims, "mixed_genders_lch_sims", "mixed_genders_lch_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [437]:
# Asian, female vs. male.
asian_genders_lch_sims = {}
leacock_chodorow_calcs(asian_fem, asian_mal, asian_genders_lch_sims, "asian_genders_lch_sims", "asian_genders_lch_sim_skipgram")

  sim_score = tokens[0].similarity(tokens[1])


In [438]:
# Bundle every .txt result file of this part into one gzip-compressed archive.
!tar czf leacock_chodorow_comparisons.tar.gz embedding_similarities/skip_gram_lch/*.txt