In [1]:
import gensim
import numpy as np
import gzip

In [2]:
DEBUG = 0
# Load word2vec data from a file handle
def load_word2vec_format(f, max_num_words=None):
    """Loads word2vec data from a file handle.

    Similar to gensim.models.keyedvectors.KeyedVectors.load_word2vec_format
    but takes a file handle as input rather than a filename. This lets us use
    GFile. Also only accepts binary files.

    Args:
        f: file handle
        max_num_words: number of words to load. If None, load all.

    Returns:
        Word2vec data as keyedvectors.EuclideanKeyedVectors.
    """
    # Print Vocab Size and Vector Size
    header = f.readline()
    vocab_size, vector_size = (int(x) for x in header.rstrip().split())  # throws for invalid file format
    print("vocab_size = {}, vector_size =  {}".format(vocab_size, vector_size))

    # Instantiate gensim model
    result = gensim.models.keyedvectors.EuclideanKeyedVectors()
    result.vector_size = vector_size
    result.syn0 = np.zeros( (vocab_size, vector_size), dtype=np.float32)
     
    num_words = 0

    # Function: Add word
    def add_word(word, weights):
        word_id = len(result.vocab)
        if word in result.vocab:
            print("duplicate word '{}', ignoring all but first".format(word))
            return
        result.vocab[word] = gensim.models.keyedvectors.Vocab(index=word_id, count=vocab_size - word_id)
        result.syn0[word_id] = weights
        result.index2word.append(word)

    # Print how many words are being loaded
    if max_num_words and max_num_words < vocab_size:
        num_embeddings = max_num_words
    else:
        num_embeddings = vocab_size
    print("Loading {} embeddings".format(num_embeddings))
    
    binary_len = np.dtype(np.float32).itemsize * vector_size
    for _ in range(vocab_size):
        
        if (DEBUG):
            if (num_words % 200000 == 0):
                print(num_words)

        # mixed text and binary: read text first, then binary
        word = []
        while True:
            ch = f.read(1)
            if ch == b' ':
                break
            if ch == b'':
                raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
            if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                word.append(ch)

        word = gensim.utils.to_unicode(b''.join(word), encoding='utf-8', errors='strict')
        weights = np.frombuffer(f.read(binary_len), dtype=np.float32)
        add_word(word, weights)
        num_words = num_words + 1
        if max_num_words and num_words == max_num_words:
            break

    if result.syn0.shape[0] != len(result.vocab):
        print("duplicate words detected, shrinking matrix size from {} to {}".format(result.syn0.shape[0], len(result.vocab)))
    result.syn0 = np.ascontiguousarray(result.syn0[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.syn0.shape

    print("loaded {} matrix".format(result.syn0.shape))
    return result

In [3]:
# Print the candidate words for A - B + C
def print_knn(client, v, k):
    print("{} closest neighbors to C - A + B".format(k))
    for neighbor, score in client.similar_by_vector(v.flatten().astype(float), topn=k):
        print("{} : score={}".format(neighbor, score))

In [4]:
# Load analogies from a file
def load_analogies(filename):
    """
    Returns a list containing the analogies
    """
    analogies = []
    with open(filename, 'r') as fast_file:
        for line in fast_file:
            line = line.strip()

            # Skip comment lines (i.e. lines the start with ':')
            if (line[0] == ':'):
                continue
            
            # Ignore lines without exactly 4 words
            words = line.split()
            if (len(words) != 4):
                print("Invalid line -> {}".format(line))
                continue

            analogies.append(words)

    print("Loaded {} analogies".format(len(analogies)))
    return analogies

# Main Training Loop

In [5]:
# Load word embeddings
WORD2VEC_FILE = "GoogleNews-vectors-negative300.bin.gz"
MAX_NUM_WORDS = None
with gzip.GzipFile(fileobj = open(WORD2VEC_FILE, 'rb')) as f:
    client = load_word2vec_format(f, MAX_NUM_WORDS)

vocab_size = 3000000, vector_size =  300
Loading 3000000 embeddings
loaded (3000000, 300) matrix


In [6]:
# Word analogy example: A : B :: C : D
#  ->  D = C - A + B
NUM_ANALOGIES = 10

A = "woman"
B = "man"
C = "programmer"
A_vec = client.word_vec(A)
B_vec = client.word_vec(B)
C_vec = client.word_vec(C)
D_vec = C_vec - A_vec + B_vec

print_knn(client, D_vec, NUM_ANALOGIES)

10 closest neighbors to C - A + B
programmer : score=0.8918285965919495
programmers : score=0.5779235363006592
programer : score=0.5624995827674866
Programmer : score=0.5415105819702148
sysadmin : score=0.5366033911705017
Jon_Shiring : score=0.5260592699050903
coder : score=0.5256212949752808
modder : score=0.4957827031612396
animator : score=0.4935148358345032
engineer : score=0.4899101257324219


In [7]:
# Word analogy example: A : B :: C : D
#  ->  D = C - A + B
A = "man"
B = "woman"
C = "programmer"
A_vec = client.word_vec(A)
B_vec = client.word_vec(B)
C_vec = client.word_vec(C)
D_vec = C_vec - A_vec + B_vec

print_knn(client, D_vec, NUM_ANALOGIES)

10 closest neighbors to C - A + B
programmer : score=0.885962188243866
programmers : score=0.6040860414505005
computer_programmer : score=0.5623368620872498
coder : score=0.5616979598999023
Programmer : score=0.5576066374778748
programer : score=0.5161396861076355
graphic_designer : score=0.5139066576957703
coders : score=0.4876540005207062
designer : score=0.4822674095630646
librarian : score=0.4649229645729065


In [8]:
# Word analogy example: A : B :: C : D
#  ->  D = C - A + B
A = "man"
B = "woman"
C = "doctor"
A_vec = client.word_vec(A)
B_vec = client.word_vec(B)
C_vec = client.word_vec(C)
D_vec = C_vec - A_vec + B_vec

print_knn(client, D_vec, NUM_ANALOGIES)

10 closest neighbors to C - A + B
doctor : score=0.883492112159729
gynecologist : score=0.7276507019996643
nurse : score=0.6698512434959412
physician : score=0.6674121022224426
doctors : score=0.6649492979049683
pediatrician : score=0.6398377418518066
nurse_practitioner : score=0.6237459778785706
obstetrician : score=0.6188926696777344
midwife : score=0.6041982769966125
dentist : score=0.5999662280082703


In [9]:
# Word analogy example: A : B :: C : D
#  ->  D = C - A + B
A = "woman"
B = "man"
C = "doctor"
A_vec = client.word_vec(A)
B_vec = client.word_vec(B)
C_vec = client.word_vec(C)
D_vec = C_vec - A_vec + B_vec

print_knn(client, D_vec, NUM_ANALOGIES)

10 closest neighbors to C - A + B
doctor : score=0.8413019180297852
physician : score=0.6823903918266296
doctors : score=0.6239282488822937
surgeon : score=0.5908077955245972
dentist : score=0.570309042930603
cardiologist : score=0.5666105151176453
neurologist : score=0.5558009743690491
neurosurgeon : score=0.5432174801826477
internist : score=0.5405333042144775
urologist : score=0.5398820042610168


In [10]:
# Word analogy example: A : B :: C : D
#  ->  D = C - A + B
A = "man"
B = "programmer"
C = "woman"
A_vec = client.word_vec(A)
B_vec = client.word_vec(B)
C_vec = client.word_vec(C)
D_vec = C_vec - A_vec + B_vec

print_knn(client, D_vec, NUM_ANALOGIES)

10 closest neighbors to C - A + B
programmer : score=0.885962188243866
programmers : score=0.6040860414505005
computer_programmer : score=0.5623368620872498
coder : score=0.5616979598999023
Programmer : score=0.5576066374778748
programer : score=0.5161396861076355
graphic_designer : score=0.5139066576957703
coders : score=0.4876540005207062
designer : score=0.4822674095630646
librarian : score=0.4649229645729065


In [11]:
# Load analogies
ANALOGIES_FILE = "questions-words.txt"
analogies = load_analogies(ANALOGIES_FILE)

Loaded 19544 analogies


In [12]:
# Print some analogies
for x in analogies[:5]:
    print("{} is to {} as {} is to {}".format(*x))

Athens is to Greece as Baghdad is to Iraq
Athens is to Greece as Bangkok is to Thailand
Athens is to Greece as Beijing is to China
Athens is to Greece as Berlin is to Germany
Athens is to Greece as Bern is to Switzerland


In [13]:
# Return the input vector normalized
def normalize_vector(v):
    return v / np.linalg.norm(v)

# Load and return normalized vectors
def load_embeddings(client, analogies):

    # Get and count the unique words in analogies
    # i.e. update = Union operation
    words_unfiltered = set()
    for i in range(len(analogies)):
        words_unfiltered.update(analogies[i])

    print("Found {} unique words".format(len(words_unfiltered)))

    # Normalize word vectors
    words = []
    vect = []
    index_map = {}
    for word in words_unfiltered:
        try:
            vect.append(normalize_vector(client.word_vec(word)))
            index_map[word] = len(words)
            words.append(word)
        except KeyError:
            print("Word not found: {}".format(word))

    print("Number of words not filtered out: {}/{}".format(len(words), len(words_unfiltered)))

    return np.array(vect), index_map, words

In [14]:
embeddings, index_map, word_list = load_embeddings(client, analogies)

Found 905 unique words
Number of words not filtered out: 905/905


In [15]:
def find_gender_direction(embed,
                          indices):
  """Finds and returns a 'gender direction'."""
  pairs = [
      ("woman", "man"),
      ("her", "his"),
      ("she", "he"),
      ("aunt", "uncle"),
      ("niece", "nephew"),
      ("daughters", "sons"),
      ("mother", "father"),
      ("daughter", "son"),
      ("granddaughter", "grandson"),
      ("girl", "boy"),
      ("stepdaughter", "stepson"),
      ("mom", "dad"),
  ]
  m = []
  for wf, wm in pairs:
    m.append(embed[indices[wf]] - embed[indices[wm]])
  m = np.array(m)

  # the next three lines are just a PCA.
  m = np.cov(np.array(m).T)
  evals, evecs = np.linalg.eig(m)
  return normalize_vector(np.real(evecs[:, np.argmax(evals)]))

In [16]:
gender_direction = find_gender_direction(embeddings, index_map)
#print("gender direction: %s" % str(gender_direction.flatten()))
print(gender_direction.shape)

(300,)


In [17]:
WORD = "master"

word_vec = client.word_vec(WORD)
print(word_vec.dot(gender_direction))

-0.30643988058688776


In [18]:
import pandas as pd

words = set()
for a in analogies:
  words.update(a)

df = pd.DataFrame(data={"word": list(words)})
df["gender_score"] = df["word"].map(
    lambda w: client.word_vec(w).dot(gender_direction))
df.sort_values(by="gender_score", inplace=True)
print(df.head(10))

             word  gender_score
864           his     -0.660903
792            he     -0.584860
442  unimpressive     -0.544877
684       Anaheim     -0.503144
629         Libya     -0.486664
428       playing     -0.472259
670          play     -0.459254
450      sharpest     -0.455905
734       Detroit     -0.454509
576        calmly     -0.448576


In [19]:
print(df.tail(10))

            word  gender_score
201        queen      0.682354
162          her      0.683995
417      stepson      0.686814
409      sisters      0.699692
680          she      0.706431
350     princess      0.719736
76           mom      0.732802
344        women      0.758530
702  policewoman      0.816518
489      husband      0.950914


In [20]:
df.sort_values(by="gender_score", inplace=True, ascending=False)
print(df.head(10))

            word  gender_score
489      husband      0.950914
702  policewoman      0.816518
344        women      0.758530
76           mom      0.732802
350     princess      0.719736
680          she      0.706431
409      sisters      0.699692
417      stepson      0.686814
162          her      0.683995
201        queen      0.682354


In [21]:
analogies[:5]

[['Athens', 'Greece', 'Baghdad', 'Iraq'],
 ['Athens', 'Greece', 'Bangkok', 'Thailand'],
 ['Athens', 'Greece', 'Beijing', 'China'],
 ['Athens', 'Greece', 'Berlin', 'Germany'],
 ['Athens', 'Greece', 'Bern', 'Switzerland']]

In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

import gensim

class Predictor(nn.Module):
    def __init__(self, embedding_size):
        super(Predictor, self).__init__()

        self.fc1 = nn.Linear(embedding_size, embedding_size)

    def forward(self, x):
        identity = x
        bias = self.fc1(x)
        out = identity - bias
        return out

class Adversary(nn.Module):
    def __init__(self, embedding_size):
        super(Adversary, self).__init__()

        self.fc1 = nn.Linear(embedding_size, 1)

    def forward(self, x):
        x = self.fc1(x)
        return x

class AdversarialModel(nn.Module):
    def __init__(self, embedding_size, gender_direction, v, y, z, word2vec, device):
        super(AdversarialModel, self).__init__()

        # Global Settings and Hyperparameters
        self.num_epochs = 10000
        self.device = device
        self.adversary_lr = 0.01
        self.predictor_lr = 0.01

        # Dataset
        self.v = v.to(device)
        self.y = v.to(device)
        self.z = z.to(device)

        # Variables
        self.gender_direction = gender_direction.to(device)

        # Layers
        self.predictor = Predictor(embedding_size).to(device)
        self.adversary = Adversary(embedding_size).to(device)

        # Loss Criterion
        self.adversary_criterion = nn.MSELoss().to(device)
        self.predictor_criterion = nn.MSELoss().to(device)

        # Optimizers
        self.adversary_optim = optim.Adam(self.adversary.parameters(), lr=1e-1)
        self.predictor_optim = optim.Adam(self.predictor.parameters(), lr=1.5e-5)

        # WORD2VEC Model
        self.word2vec = word2vec

    def adversary_forward(self, x):
        x = self.predictor(x) # Predict the 4th word
        x = self.adversary(x) # Predict the gender direction based on the 4th word
        return x

    def predictor_forward(self, x):
        x = self.predictor(x)
        return x

    def adversary_loss(self, predicted_gender, gender_direction):
        return self.adversary_criterion(predicted_gender, gender_direction)

    def predictor_loss(self, predicted_d, d):
        return self.predictor_criterion(predicted_d, d)

    def train(self):
        # Always turn on train mode
        self.predictor.train()
        self.adversary.train()

        # Prepare Dataset
        v = self.v
        y = self.y
        z = self.z.float()

        for i in range(self.num_epochs):
            # Adversary Training
            self.adversary_optim.zero_grad()        # Zero-out gradients
            z_hat = self.adversary_forward(v)       # Adversary Forward Pass
            La = self.adversary_loss(z_hat, z)      # Adversary Loss
            La.backward()                           # Backprop
            self.adversary_optim.step()             # Gradient Descent

            # Predictor Training
            self.predictor_optim.zero_grad()        # Zero-out gradients
            y_hat = self.predictor_forward(v)       # Predictor Forward Pass
            Lp = self.predictor_loss(y_hat, y)      # Predictor Loss
            
            z_hat_p = self.adversary_forward(y_hat) # Predictor -> Adversary Forward Pass
            Lp_a = self.adversary_loss(z_hat_p, z)  # Adversary Loss
            Lp_total = 300 * Lp - Lp_a                     # Total Predictor Loss
            Lp_total.backward()                     # Backprop
            self.predictor_optim.step()             # Gradient Descent

            # Print Losses
            if ((i == 0) or ((i+1) % 50 == 0)):
                print("La: {} - Lp_p: {} - Lp_a: {} - Lp_total: {}".format(La.item(), Lp.item() * 300, Lp_a.item(), Lp_total.item()))
    
    def analogy(self, A, B, C):
        # Load the embeddings of each word
        A_vec = self.word2vec.word_vec(A)
        B_vec = self.word2vec.word_vec(B)
        C_vec = self.word2vec.word_vec(C)
        
        v = C_vec - A_vec + B_vec
        v = torch.from_numpy(v).to(self.device)

        self.predictor.eval()
        with torch.no_grad():
            pred_d = self.predictor(v)

        pred_d = pred_d.cpu().numpy()

        def print_knn(embeddings, v, k):
            print("{} closest neighbors to C - A + B".format(k))
            for neighbor, score in embeddings.similar_by_vector(v.flatten().astype(float), topn=k):
                print("{} : score={}".format(neighbor, score))

        print_knn(self.word2vec, pred_d, k=10)
        return

In [23]:
# Word analogy -> A : B :: C : D
# Input to the Adversarial Training is predicted D = C - A + B
# Label is D
def process_sample(analogy, embed, index_map, gender_direction):
    A, B, C, D = analogy
    A = embed[index_map[A]]
    B = embed[index_map[B]]
    C = embed[index_map[C]]
    D = embed[index_map[D]]
    gender_proj = np.dot(D, gender_direction)
    return  C-A+B, D, gender_proj

# Add analogy entries
v_train = []
y = []
z = []
gender_direction_normed = normalize_vector(gender_direction)
for analogy in analogies:
    v_sample, y_sample, z_sample = process_sample(analogy, embeddings, index_map, gender_direction_normed)
    v_train.append(v_sample)
    y.append(y_sample)
    z.append(z_sample)

In [24]:
# Dataset
v_train = torch.from_numpy(np.array(v_train))
y = torch.from_numpy(np.array(y))
z = torch.from_numpy(np.array(z)).view(-1, 1)

In [25]:
print(v_train.shape)
print(y.shape)
print(z.shape)

torch.Size([19544, 300])
torch.Size([19544, 300])
torch.Size([19544, 1])


In [26]:
# Instantiate Adversarial Model
device = ("cuda" if torch.cuda.is_available() else "cpu")
embedding_size = client.vector_size
gender_torch = torch.from_numpy(gender_direction)
model = AdversarialModel(embedding_size, gender_torch, v_train, y, z, client, device)

In [None]:
model.train()

La: 0.006924546789377928 - Lp_p: 0.8360947715118527 - Lp_a: 1.493820071220398 - Lp_total: -0.6577252745628357
La: 0.0036067701876163483 - Lp_p: 0.8076941594481468 - Lp_a: 0.0027467445470392704 - Lp_total: 0.8049473762512207
La: 0.0013885240769013762 - Lp_p: 0.7583159487694502 - Lp_a: 0.002327305730432272 - Lp_total: 0.755988597869873
La: 0.0013095267349854112 - Lp_p: 0.7097869645804167 - Lp_a: 0.002500282134860754 - Lp_total: 0.7072866559028625
La: 0.0012843497097492218 - Lp_p: 0.6644613342359662 - Lp_a: 0.0027361956890672445 - Lp_total: 0.6617251038551331
La: 0.001273082452826202 - Lp_p: 0.6229062331840396 - Lp_a: 0.0029474797192960978 - Lp_total: 0.6199586987495422
La: 0.0012677587801590562 - Lp_p: 0.585155573207885 - Lp_a: 0.0031176512129604816 - Lp_total: 0.5820378661155701
La: 0.0012651649303734303 - Lp_p: 0.5510137067176402 - Lp_a: 0.0032546138390898705 - Lp_total: 0.5477591156959534
La: 0.0012638868065550923 - Lp_p: 0.5201873718760908 - Lp_a: 0.0033698228653520346 - Lp_total: 0.

La: 0.001262545003555715 - Lp_p: 0.06922089960426092 - Lp_a: 0.008241433650255203 - Lp_total: 0.06097946688532829
La: 0.001262545119971037 - Lp_p: 0.06707058346364647 - Lp_a: 0.008069878444075584 - Lp_total: 0.05900070071220398
La: 0.0017082135891541839 - Lp_p: 0.0648549321340397 - Lp_a: 0.007538123521953821 - Lp_total: 0.05731680989265442
La: 0.0012764051789417863 - Lp_p: 0.06280556990532205 - Lp_a: 0.007547618355602026 - Lp_total: 0.05525795370340347
La: 0.0012625723611563444 - Lp_p: 0.060914154164493084 - Lp_a: 0.007522429805248976 - Lp_total: 0.053391724824905396
La: 0.001262545003555715 - Lp_p: 0.05908080056542531 - Lp_a: 0.007416368927806616 - Lp_total: 0.051664434373378754
La: 0.001262545003555715 - Lp_p: 0.05730918928747997 - Lp_a: 0.007318942341953516 - Lp_total: 0.04999024420976639
La: 0.001262545003555715 - Lp_p: 0.05559974815696478 - Lp_a: 0.007230608724057674 - Lp_total: 0.048369139432907104
La: 0.001262545003555715 - Lp_p: 0.053951206791680306 - Lp_a: 0.007151687517762184

# Word Analogy with Word2vec (reduced bias)

In [None]:
model.analogy("man", "woman", "friend")

# Word Analogy with Word2vec (original)

In [None]:
A = "man"
B = "woman"
C = "friend"
A_vec = client.word_vec(A)
B_vec = client.word_vec(B)
C_vec = client.word_vec(C)
D_vec = C_vec - A_vec + B_vec

print_knn(client, D_vec, NUM_ANALOGIES)