# Playing around with GloVe 6B 50-d word vectors


- **Things to try with the data**
 - Finding the cosine similarity between word vectors
 - Analogy game (man -> woman, king -> ?)
 - Debiasing word vectors
 

In [20]:
import numpy as np
import time

> Read the data. 


In [2]:

def read_glove_vecs(glove_file):
    """
    Load word vectors from a whitespace-separated text file.

    Every non-empty line is expected to hold a word followed by the
    components of its vector.

    Arguments:
        glove_file -- path to the text file with the word vectors

    Returns:
        words -- set of all words found in the file
        word_to_vec_map -- dictionary mapping each word to its vector,
                           stored as a numpy array of dtype float64
    """
    words = set()
    word_to_vec_map = {}

    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            if not parts:
                # Skip blank lines instead of failing on parts[0].
                continue
            curr_word = parts[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(parts[1:], dtype=np.float64)

    return words, word_to_vec_map

In [3]:
# Load the vocabulary and the word -> vector lookup used by every cell below.
glove_path = './data/glove.6B.50d.txt'
words, word_to_vec_map = read_glove_vecs(glove_path)

## Cosine Similarity

In [4]:
def cosine_similarity(u,v):
    """
    Cosine of the angle between two vectors.

    Arguments:
        u -- first vector (assumed 1-D numpy array; not checked)
        v -- second vector (assumed same length as u; not checked)

    Returns:
        the dot product of u and v divided by the product of their L2 norms
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return np.dot(u, v) / norm_product



In [5]:
# Look up the two embeddings to compare.
name, title = word_to_vec_map['name'], word_to_vec_map['title']

# Cosine similarity is symmetric (a.b == b.a), so swapping the arguments
# should print the same value twice.
sim_name_title = cosine_similarity(name, title)
sim_title_name = cosine_similarity(title, name)

print('Cosine sim of name , title', sim_name_title)
print('Cosine sim of title , name', sim_title_name)

Cosine sim of name , title 0.6283471929719754
Cosine sim of title , name 0.6283471929719754


## Analogy game 

In [6]:
def analogy_game(word_a, word_b, word_c, word_to_vec_map):
    """
    Complete the analogy "word_a is to word_b as word_c is to ?".

    Scans every word w in word_to_vec_map except the three inputs and keeps
    the one whose offset (e_c - e_w) has the highest cosine similarity with
    the offset (e_a - e_b).

    Arguments:
        word_a, word_b, word_c -- strings; lower-cased before lookup
        word_to_vec_map -- dictionary mapping words to their vectors

    Returns:
        best_word -- the best-scoring candidate, or None if no candidate
                     scored above the initial sentinel
        time_taken -- seconds spent in the search loop

    Raises:
        KeyError -- if an input word (after lower-casing) is missing from
                    word_to_vec_map
    """
    word_a, word_b, word_c = word_a.lower(), word_b.lower(), word_c.lower()
    e_a, e_b, e_c = word_to_vec_map[word_a], word_to_vec_map[word_b], word_to_vec_map[word_c]

    # Both values are loop-invariant, so compute them once up front.
    target_offset = e_a - e_b
    excluded = {word_a, word_b, word_c}

    max_cosine_sim = -100  # sentinel below any real cosine similarity
    best_word = None       # unknown until a candidate beats the sentinel

    # perf_counter is the clock intended for measuring short intervals.
    tick = time.perf_counter()
    for w, e_w in word_to_vec_map.items():
        # The input words themselves are not valid answers.
        if w in excluded:
            continue

        cosine_sim = cosine_similarity(target_offset, e_c - e_w)

        if cosine_sim > max_cosine_sim:
            max_cosine_sim = cosine_sim
            best_word = w
    time_taken = time.perf_counter() - tick

    return best_word, time_taken


In [7]:
# Each triad (a, b, c) asks: "a is to b as c is to ?".
# Add more triads to this list to experiment, e.g. name -> title, beer -> ?
triads = [
    ('woman', 'sexy', 'man'),
    ('man', 'hot', 'woman'),
    ('boy', 'young', 'girl'),
    ('name', 'title', 'beer'),
    ('pain', 'sorrow', 'laugh'),
    ('lion', 'deer', 'zebra'),
]

for triad in triads:
    word_a, word_b, word_c = triad
    best_word, time_taken = analogy_game(word_a, word_b, word_c, word_to_vec_map)
    # Printed as a->b,c->answer,seconds
    print('{}->{},{}->{},{}'.format(word_a, word_b, word_c, best_word, time_taken))

woman->sexy,man->retro,4.4205710887908936
man->hot,woman->chill,4.115301132202148
boy->young,girl->fellow,4.053421258926392
name->title,beer->100m,4.028311729431152
pain->sorrow,laugh->jodhaa,4.085860967636108
lion->deer,zebra->white-tailed,4.098408937454224


## Debiasing Word Vectors for gender



In [9]:
# Estimate a gender direction g as the mean of three
# female-minus-male difference vectors.
gender_pairs = [('woman', 'man'), ('mother', 'father'), ('girl', 'boy')]
g1, g2, g3 = (
    word_to_vec_map[female] - word_to_vec_map[male]
    for female, male in gender_pairs
)
g = (g1 + g2 + g3) / 3

print(g)

[ 0.07656667  0.34967667 -0.40057667 -0.03130333  0.0088      0.72586333
  0.10256     0.14906333  0.4780662  -0.22850987  0.05957667 -0.68663
  0.62210033  0.10395     0.17747667  0.09556867 -0.49258333 -0.17066233
  0.46930033  0.02196333  0.28145667  0.50513333  0.17144733  0.40154767
  0.24039333  0.1646     -0.17984667  0.24042667  0.05689333 -0.31423
 -0.10933333  0.26355967  0.06100667 -0.01156405 -0.12236333 -0.188245
 -0.13215057 -0.068186    0.05624667 -0.29555567 -0.09669533 -0.29559667
  0.62465867 -0.40130167  0.03330667 -0.24831667  0.26381667 -0.28738333
  0.03020433  0.054106  ]


**Check where different words fall along the bias axis `g`**

In [10]:
print('List of names and their similarities with constructed vector:')

# A mix of girls' and boys' first names.
name_list = [
    'john', 'marie', 'sophie', 'ronaldo', 'priya',
    'rahul', 'danielle', 'reza', 'katy', 'yasmin',
]

for first_name in name_list:
    similarity = cosine_similarity(word_to_vec_map[first_name], g)
    print(first_name, similarity)

List of names and their similarities with constructed vector:
john -0.30873091089769916
marie 0.34257515107827113
sophie 0.4116200252265307
ronaldo -0.290839785117324
priya 0.19646793448600458
rahul -0.19492147638633417
danielle 0.2923957653171286
reza -0.1679382162425299
katy 0.31132430605664335
yasmin 0.19658379893678699


In [11]:
print('Other words and their similarities:')

# Common nouns whose position along g is compared below.
word_list = [
    'lipstick', 'guns', 'science', 'arts', 'literature', 'warrior',
    'doctor', 'tree', 'receptionist', 'technology', 'fashion', 'teacher',
    'engineer', 'pilot', 'computer', 'singer',
]

for candidate in word_list:
    similarity = cosine_similarity(word_to_vec_map[candidate], g)
    print(candidate, similarity)

Other words and their similarities:
lipstick 0.41366815126252465
guns -0.08755154639507805
science -0.05837425964384823
arts 0.011760468125783753
literature 0.02316946789375169
warrior -0.16564638100307952
doctor 0.0772141272665667
tree 0.03538042107098228
receptionist 0.30167259871100655
technology -0.16192108462558172
fashion 0.1416547219136271
teacher 0.10545901736578715
engineer -0.2263994415742677
pilot -0.03699357317847414
computer -0.16821031921735138
singer 0.2009300079322625


<img src="images/neutralize.png" style="width:800px;height:400px;"> 

<img src="images/neutral.png" style="width:800px;height:400px;"> 

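From the similarities above, these words lean noticeably towards one gender along `g`:
 - receptionist
 - technology
 - fashion
 - teacher
 - engineer
 - computer
 - singer

 Since `g` is built from female-minus-male differences (e.g. woman - man), a positive similarity means the word leans towards the female end of the axis and a negative one towards the male end.
 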
 
 Equations : 
 
 
$$e^{bias\_component} = \frac{e \cdot g}{||g||_2^2} * g\tag{2}$$
$$e^{debiased} = e - e^{bias\_component}\tag{3}$$

In [13]:
def neutralize(word, g, word_to_vec_map):
    """
    Removes the bias of "word" by projecting it on the space orthogonal to the bias axis.
    This function ensures that gender neutral words are zero in the gender subspace.

    Arguments:
        word -- string indicating the word to debias
        g -- numpy array corresponding to the bias axis (such as gender);
             must have the same length as the word vectors
        word_to_vec_map -- dictionary mapping words to their corresponding vectors.

    Returns:
        e_debiased -- neutralized word vector representation of the input "word"

    Raises:
        KeyError -- if "word" is not in word_to_vec_map
    """
    # Vector representation of the word.
    e = word_to_vec_map[word]

    # Component of e along the bias axis: (e . g / ||g||^2) * g
    e_biascomponent = (np.dot(e, g) / np.square(np.linalg.norm(g))) * g

    # Subtracting that component leaves the projection orthogonal to g.
    # The result must be bound to e_debiased; the earlier version assigned
    # it to `word`, leaving the returned name undefined.
    e_debiased = e - e_biascomponent

    return e_debiased

In [19]:
# NOTE: this rebinds the global `words`, which previously held the
# vocabulary set returned by read_glove_vecs.
words = ['receptionist', 'technology', 'fashion', 'teacher','engineer', 'computer','singer']

for word in words:
    # Similarity with the bias axis before removing the bias component.
    similarity_before = cosine_similarity(word_to_vec_map[word], g)
    print("cosine similarity between " + word + " and g, before neutralizing: ", similarity_before)

    # Similarity after neutralizing; expected to be ~0 up to rounding error.
    word_debiased = neutralize(word, g, word_to_vec_map)
    similarity_after = cosine_similarity(word_debiased, g)
    print("cosine similarity between " + word + " and g, after neutralizing: ", similarity_after)
    print('\n')

cosine similarity between receptionist and g, before neutralizing:  0.30167259871100655
cosine similarity between receptionist and g, after neutralizing:  -2.6739863413957255e-17


cosine similarity between technology and g, before neutralizing:  -0.16192108462558172
cosine similarity between technology and g, after neutralizing:  -2.853148170277109e-17


cosine similarity between fashion and g, before neutralizing:  0.1416547219136271
cosine similarity between fashion and g, after neutralizing:  6.950184658989011e-17


cosine similarity between teacher and g, before neutralizing:  0.10545901736578715
cosine similarity between teacher and g, after neutralizing:  -3.4264800465992284e-17


cosine similarity between engineer and g, before neutralizing:  -0.2263994415742677
cosine similarity between engineer and g, after neutralizing:  7.258856232914164e-17


cosine similarity between computer and g, before neutralizing:  -0.16821031921735138
cosine similarity between computer and g, after

## Equalising by gender - Gender equality

<img src="images/equalize10.png" style="width:800px;height:400px;"> 

**Equations:**

The derivation of the linear algebra to do this is a bit more complex. (See Bolukbasi et al., 2016 for details.) But the key equations are: 

$$ \mu = \frac{e_{w1} + e_{w2}}{2}\tag{4}$$ 

$$ \mu_{B} = \frac {\mu \cdot \text{bias_axis}}{||\text{bias_axis}||_2^2} *\text{bias_axis}
\tag{5}$$ 

$$\mu_{\perp} = \mu - \mu_{B} \tag{6}$$

$$ e_{w1B} = \frac {e_{w1} \cdot \text{bias_axis}}{||\text{bias_axis}||_2^2} *\text{bias_axis}
\tag{7}$$ 
$$ e_{w2B} = \frac {e_{w2} \cdot \text{bias_axis}}{||\text{bias_axis}||_2^2} *\text{bias_axis}
\tag{8}$$


$$e_{w1B}^{corrected} = \sqrt{ |{1 - ||\mu_{\perp} ||^2_2} |} * \frac{e_{\text{w1B}} - \mu_B} {||(e_{w1} - \mu_{\perp}) - \mu_B||} \tag{9}$$


$$e_{w2B}^{corrected} = \sqrt{ |{1 - ||\mu_{\perp} ||^2_2} |} * \frac{e_{\text{w2B}} - \mu_B} {||(e_{w2} - \mu_{\perp}) - \mu_B||} \tag{10}$$

$$e_1 = e_{w1B}^{corrected} + \mu_{\perp} \tag{11}$$
$$e_2 = e_{w2B}^{corrected} + \mu_{\perp} \tag{12}$$


In [22]:
def equalize(pair, bias_axis, word_to_vec_map):
    """
    Debias a pair of gender-specific words with the equalization method
    (Bolukbasi et al., 2016).

    Both words keep the same component orthogonal to the bias axis (that of
    their mean) and receive corrected components along the bias axis.

    Arguments:
        pair -- pair of strings (w1, w2) to equalize, e.g. ("girl", "boy")
        bias_axis -- numpy array for the bias axis; must have the same
                     length as the word vectors
        word_to_vec_map -- dictionary mapping words to their corresponding vectors

    Returns:
        e1 -- equalized vector for the first word
        e2 -- equalized vector for the second word

    Raises:
        KeyError -- if either word is not in word_to_vec_map
    """
    # Step 1: Select the word vectors of the pair.
    w1, w2 = pair
    e_w1, e_w2 = word_to_vec_map[w1], word_to_vec_map[w2]

    # ||bias_axis||^2 is shared by every projection below; compute it once.
    bias_sq_norm = np.square(np.linalg.norm(bias_axis))

    def project_on_bias(vec):
        # Component of vec along bias_axis: (vec . b / ||b||^2) * b
        return (np.dot(vec, bias_axis) / bias_sq_norm) * bias_axis

    # Step 2: Mean of the two vectors (equation 4).
    mu = (e_w1 + e_w2) / 2

    # Step 3: Split mu into its bias-axis and orthogonal parts (equations 5, 6).
    mu_B = project_on_bias(mu)
    mu_orth = mu - mu_B

    # Step 4: Bias-axis components of each word (equations 7, 8).
    e_w1B = project_on_bias(e_w1)
    e_w2B = project_on_bias(e_w2)

    # Step 5: Corrected bias components (equations 9, 10).
    # |1 - ||mu_orth||^2| is a scalar absolute value, so use np.abs rather
    # than a vector norm.
    scale = np.sqrt(np.abs(1 - np.square(np.linalg.norm(mu_orth))))
    corrected_e_w1B = scale * (e_w1B - mu_B) / np.linalg.norm((e_w1 - mu_orth) - mu_B)
    corrected_e_w2B = scale * (e_w2B - mu_B) / np.linalg.norm((e_w2 - mu_orth) - mu_B)

    # Step 6: Recombine with the shared orthogonal part (equations 11, 12).
    e1 = corrected_e_w1B + mu_orth
    e2 = corrected_e_w2B + mu_orth

    return e1, e2

In [24]:
# Similarity of each word with the gender axis before equalizing.
print("cosine similarities before equalizing:")
sim_girl_before = cosine_similarity(word_to_vec_map["girl"], g)
print('cosine_similarity(word_to_vec_map["girl"], gender) = ', sim_girl_before)
sim_boy_before = cosine_similarity(word_to_vec_map["boy"], g)
print('cosine_similarity(word_to_vec_map["boy"], gender) = ', sim_boy_before)

# Equalize the pair, then repeat the comparison on the new vectors.
e1, e2 = equalize(("girl", "boy"), g, word_to_vec_map)
print("cosine similarities after equalizing:")
sim_e1_after = cosine_similarity(e1, g)
print('cosine_similarity(e1, gender) = ', sim_e1_after)
sim_e2_after = cosine_similarity(e2, g)
print('cosine_similarity(e2, gender) = ', sim_e2_after)

cosine similarities before equalizing:
cosine_similarity(word_to_vec_map["girl"], gender) =  0.447588899901815
cosine_similarity(word_to_vec_map["boy"], gender) =  0.15516909802751455
cosine similarities after equalizing:
cosine_similarity(e1, gender) =  0.6370186893403987
cosine_similarity(e2, gender) =  -0.6370186893403987
