1. Visualize word embeddings (NLP specialization; t-SNE, Week 2 of Sequence Models)
2. De-bias word embeddings (Week 2 of Sequence Models)
3. Neural Machine Translation (Week 3 of Sequence Models) - problem: supervised!

Bleu Score for validating hypothesis?

# Playground for the visualization of word embeddings

#### word2vec

#### GloVe

In [1]:

# https://towardsdatascience.com/visualizing-word-embedding-with-pca-and-t-sne-961a692509f5


#### BERT

In [None]:
# simplerepresentations
# spacy
# torch
# transformers
# gensim

# Operations on Word Vectors

In [3]:
import numpy as np
from w2v_utils import read_glove_vecs

In [4]:
# Load the 50-dimensional GloVe word vectors
words, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

In [5]:
# TODO: replace with commodity library function (spacy, gensim)

def cosine_similarity(u, v):
    """
    Cosine similarity reflects the degree of similarity between u and v
        
    Arguments:
        u -- a word vector of shape (n,)          
        v -- a word vector of shape (n,)

    Returns:
        cosine_similarity -- the cosine similarity between u and v defined by the formula above.
    """
    
    # Special case. Consider the case u = [0, 0], v=[0, 0]
    if np.all(u == v):
        return 1
    
    # Compute the dot product between u and v (≈1 line)
    dot = np.dot(u, v)
    # Compute the L2 norm of u (≈1 line)
    norm_u = np.sqrt(np.sum(u**2))
    
    # Compute the L2 norm of v (≈1 line)
    norm_v = np.sqrt(np.sum(v**2))
    
    # Avoid division by 0
    if np.isclose(norm_u * norm_v, 0, atol=1e-32):
        return 0
    
    # Compute the cosine similarity defined by formula (1) (≈1 line)
    cosine_similarity = dot / (norm_u * norm_v)
    
    return cosine_similarity

In [6]:
# Load word vectors
man = word_to_vec_map["man"]
doctor = word_to_vec_map["doctor"]
woman = word_to_vec_map["woman"]
nurse = word_to_vec_map["nurse"]

# Demonstrate gender bias
man_to_doctor_as_woman_to_nurse = cosine_similarity(man, doctor) - cosine_similarity(woman, nurse)
man_to_nurse_as_woman_to_doctor = cosine_similarity(man, nurse) - cosine_similarity(woman, doctor)

# Print results
print(round(man_to_doctor_as_woman_to_nurse, 2))
print(round(man_to_nurse_as_woman_to_doctor, 2))
print(f"man-doctor:\t{round(cosine_similarity(man, doctor), 2)}")
print(f"woman-nurse:\t{round(cosine_similarity(woman, nurse), 2)}")
print(f"man-woman:\t{round(cosine_similarity(man, woman), 2)}")
print(f"doctor-nurse:\t{round(cosine_similarity(doctor, nurse), 2)}")
print(f"man-nurse:\t{round(cosine_similarity(man, nurse), 2)}")
print(f"woman-doctor:\t{round(cosine_similarity(woman, doctor), 2)}")

-0.0
-0.15
man-doctor:	0.71
woman-nurse:	0.72
man-woman:	0.89
doctor-nurse:	0.8
man-nurse:	0.57
woman-doctor:	0.73


In [7]:
def complete_analogy(word_a, word_b, word_c, word_to_vec_map):
    """
    Performs the word analogy task: a is to b as c is to ____. 
    
    Arguments:
    word_a -- a word, string
    word_b -- a word, string
    word_c -- a word, string
    word_to_vec_map -- dictionary that maps words to their corresponding vectors. 
    
    Returns:
    best_word --  the word such that v_b - v_a is close to v_best_word - v_c, as measured by cosine similarity
    """
    
    # Convert words to lowercase
    word_a, word_b, word_c = word_a.lower(), word_b.lower(), word_c.lower()
    
    # Get the word embeddings e_a, e_b and e_c (≈1-3 lines)
    e_a, e_b, e_c = word_to_vec_map[word_a], word_to_vec_map[word_b], word_to_vec_map[word_c]
    
    words = word_to_vec_map.keys()
    max_cosine_sim = -100              # Initialize max_cosine_sim to a large negative number
    best_word = None                   # Initialize best_word with None, it will help keep track of the word to output
    
    # Loop over the whole word vector set
    for w in words:   
        # To avoid best_word being one of the input words, skip the input word_c
        # Skip word_c from query
        if w == word_c:
            continue
        
        # Compute cosine similarity between the vector (e_b - e_a) and the vector ((w's vector representation) - e_c)  (≈1 line)
        cosine_sim = cosine_similarity(e_b - e_a, word_to_vec_map[w] - e_c)
        
        # If the cosine_sim is more than the max_cosine_sim seen so far,
            # Then: set the new max_cosine_sim to the current cosine_sim and the best_word to the current word (≈3 lines)
        if cosine_sim > max_cosine_sim:
            max_cosine_sim = cosine_sim
            best_word = w
        
    return best_word

In [8]:
# Define triads to try analogies
triads = [('italy', 'italian', 'spain'), ('india', 'delhi', 'japan'), ('man', 'woman', 'boy'), ('small', 'smaller', 'large')]

# Print results
for triad in triads:
    print ('{} -> {} :: {} -> {}'.format( *triad, complete_analogy(*triad, word_to_vec_map)))

italy -> italian :: spain -> spanish
india -> delhi :: japan -> tokyo
man -> woman :: boy -> girl
small -> smaller :: large -> smaller


# Debiasing Word Vectors

In [15]:
# Calculate distance between gender pair woman-man
woman_man = word_to_vec_map['woman'] - word_to_vec_map['man']

# Calculate distance between gender pair mother-father
mother_father = word_to_vec_map['mother'] - word_to_vec_map['father']

# Calculate distance between gender pair girl-boy
girl_boy = word_to_vec_map['girl'] - word_to_vec_map['boy']

# Average over the gender pairs to get a simple representation of gender
gender = np.average([woman_man, mother_father, girl_boy], axis=0)

# Print gender vector
print(gender)

[ 0.07656667  0.34967667 -0.40057667 -0.03130333  0.0088      0.72586333
  0.10256     0.14906333  0.4780662  -0.22850987  0.05957667 -0.68663
  0.62210033  0.10395     0.17747667  0.09556867 -0.49258333 -0.17066233
  0.46930033  0.02196333  0.28145667  0.50513333  0.17144733  0.40154767
  0.24039333  0.1646     -0.17984667  0.24042667  0.05689333 -0.31423
 -0.10933333  0.26355967  0.06100667 -0.01156405 -0.12236333 -0.188245
 -0.13215057 -0.068186    0.05624667 -0.29555567 -0.09669533 -0.29559667
  0.62465867 -0.40130167  0.03330667 -0.24831667  0.26381667 -0.28738333
  0.03020433  0.054106  ]
