In [0]:
# Switch the working directory to the pretrained_data folder
# (the /content/drive prefix suggests a Colab-mounted Drive -- confirm before running elsewhere).
import os;os.chdir('/content/drive/My Drive/Colab Notebooks/Lazy courses/NLP2/pretrained_data')

In [0]:
import json
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import expit as sigmoid
from sklearn.utils import shuffle
from datetime import datetime

from scipy.spatial.distance import cosine as cos_dist
from sklearn.metrics.pairwise import pairwise_distances

from glob import glob

import os
import sys
import string
import pdb

# import os; os.chdir('/content/drive/My Drive/Colab Notebooks/Lazy courses/NLP2')
# from brown import get_sentences_with_word2idx_limit_vocab as get_brown
# Directory holding word2idx.json and weights.npz: the main cell passes it to
# train_model (which writes both files) and load_model reads them back from it.
savedir = '/content/drive/My Drive/Colab Notebooks/Lazy courses/NLP2/pretrained_data'

In [0]:
def remove_punctuation(s):
    """Return `s` with every character listed in string.punctuation deleted."""
    # Mapping a character to None in a translation table removes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return s.translate(deletion_table)

In [0]:
def _iter_wiki_sentences(paths):
    """Yield the lower-cased, punctuation-free token list of every usable line in `paths`."""
    # Raw string: holds exactly the same characters as the original literal,
    # but without the invalid "\{" / "\}" escape sequences.
    skipped_first_chars = r'[*-|=\{\}'
    for path in paths:
        # `with` closes each file even if reading fails part-way through.
        with open(path) as handle:
            for line in handle:
                if line and line[0] not in skipped_first_chars:
                    tokens = remove_punctuation(line).lower().split()
                    # A lone word has no neighbours, hence no context to train on.
                    if len(tokens) > 1:
                        yield tokens


def get_wiki(V=20000,
             wiki_data_location='/content/drive/My Drive/Colab Notebooks/Lazy courses/NLP2/wiki_data'):
    """Load the wiki text files and encode them with a frequency-limited vocabulary.

    Every file in `wiki_data_location` whose name starts with 'enwiki' and ends
    with 'txt' is read twice: once to count words, once to encode sentences.
    Files are addressed by full path, so the process working directory is left
    untouched.

    Parameters
    ----------
    V : int
        Maximum vocabulary size, including the '<UNK>' token.
    wiki_data_location : str
        Directory containing the enwiki*txt files.

    Returns
    -------
    sents : list of list of int
        One list of word indices per kept line (lines with fewer than two
        tokens, or starting with one of the skipped characters, are dropped).
    word2idx : dict
        Maps the V-1 most frequent words to 0..V-2 and '<UNK>' to the last index.
    """
    # Sorted so the file order (and therefore tie-breaking between equally
    # frequent words) does not depend on the directory listing order.
    files = sorted(
        os.path.join(wiki_data_location, f)
        for f in os.listdir(wiki_data_location)
        if f.startswith('enwiki') and f.endswith('txt')
    )

    # First pass: word frequencies.
    all_word_counts = {}
    for tokens in _iter_wiki_sentences(files):
        for word in tokens:
            all_word_counts[word] = all_word_counts.get(word, 0) + 1
    print("Finished counting..")

    V = min(V, len(all_word_counts))
    ranked = sorted(all_word_counts.items(), key=lambda item: item[1], reverse=True)

    # Reserve the last slot for the unknown-word token.
    top_words = [w for w, count in ranked[:V-1]] + ['<UNK>']
    word2idx = {w: i for i, w in enumerate(top_words)}
    unk = word2idx['<UNK>']

    # Second pass: encode, sending out-of-vocabulary words to '<UNK>'.
    sents = [
        [word2idx.get(w, unk) for w in tokens]
        for tokens in _iter_wiki_sentences(files)
    ]

    return sents, word2idx

In [0]:
def get_negative_sampling_distribution(sentences, vocab_size):
    """Build the distribution Pn(w) used to draw negative samples.

    Words that occur more often are sampled more often, but the raw counts are
    raised to the power 0.75 before normalising, which flattens the
    distribution relative to plain frequencies.

    Parameters
    ----------
    sentences : iterable of iterable of int
        Corpus encoded as word indices in [0, vocab_size).
    vocab_size : int
        Number of distinct word indices.

    Returns
    -------
    numpy.ndarray of shape (vocab_size,) summing to 1, every entry > 0.

    Raises
    ------
    AssertionError
        If some index below `vocab_size` never occurs in `sentences`.
    """
    word_freq = np.zeros(vocab_size)
    for sentence in sentences:
        for word in sentence:
            word_freq[word] += 1

    # smooth
    p_neg = word_freq ** 0.75

    # normalize
    p_neg = p_neg / p_neg.sum()

    # Callers divide by p_neg, so a zero entry must not slip through.
    assert np.all(p_neg > 0), "every word index below vocab_size must occur at least once"
    return p_neg

In [0]:
def get_context(pos, sentence, window_size):
    """Return the words within `window_size` positions of `pos`, excluding `pos`.

    For a sentence of the form  x x x x c c c pos c c c x x x x  (window_size=3)
    the result is the six `c` entries, left to right. The window is clipped at
    both ends of the sentence.

    Parameters
    ----------
    pos : int
        Index of the centre word in `sentence`.
    sentence : sequence
        The encoded sentence.
    window_size : int
        Maximum number of context words taken on each side.

    Returns
    -------
    list of the context entries of `sentence`.
    """
    start = max(0, pos - window_size)
    # Slice ends are exclusive, so +1 is needed to include the word that sits
    # exactly `window_size` positions to the right of `pos`.
    end_ = min(len(sentence), pos + window_size + 1)

    # Skip the centre word itself: it must not be its own target.
    return [
        ctx_word_idx
        for ctx_pos, ctx_word_idx in enumerate(sentence[start:end_], start=start)
        if ctx_pos != pos
    ]

In [0]:
def sgd(input_, targets, label, learning_rate, W, V):
    """Apply one gradient step for `input_` against `targets`, updating W and V in place.

    Parameters
    ----------
    input_ : int
        Row of W (the input word) to update.
    targets : array of int, length N
        Columns of V (the target words). May contain repeated indices.
    label : 0 or 1
        1 for a true (word, context) pair, 0 for a negative sample.
    learning_rate : float
        Step size.
    W : ndarray, shape (vocab_size, D)
        Input-to-hidden weights; row `input_` is modified in place.
    V : ndarray, shape (D, vocab_size)
        Hidden-to-output weights; columns `targets` are modified in place.

    Returns
    -------
    float
        Sum over targets of label*log(p) + (1-label)*log(1-p). This is the
        log-likelihood, i.e. the negative of the binary cross-entropy, so it is
        <= 0 and values closer to 0 indicate a better fit.
    """
    # W[input_] shape: D
    # V[:, targets] shape: DxN
    # activation shape: N
    activation = W[input_].dot(V[:, targets])
    prob = sigmoid(activation)

    # Both gradients are computed from the pre-update weights.
    gV = np.outer(W[input_], prob - label)  # shape --> DxN
    gW = np.sum((prob - label) * V[:, targets], axis=1)  # shape --> D

    # `V[:, targets] -= ...` keeps only one update per repeated index, whereas
    # gW above sums over every occurrence. ufunc.at accumulates all of them.
    np.subtract.at(V, (slice(None), targets), learning_rate * gV)  # shape --> DxN
    W[input_] -= learning_rate * gW  # shape --> D

    # 1e-10 guards against log(0)
    cost = label * np.log(prob + 1e-10) + (1 - label) * np.log(1 - prob + 1e-10)
    return cost.sum()

In [0]:
def train_model(savedir):
    """Train word embeddings on the wiki corpus with negative sampling and save them.

    Parameters
    ----------
    savedir : str
        Directory that receives word2idx.json and weights.npz (created if missing).

    Returns
    -------
    word2idx : dict
        Word -> index mapping returned by get_wiki.
    W : ndarray, shape (vocab_size, D)
        Input-to-hidden weights.
    V : ndarray, shape (D, vocab_size)
        Hidden-to-output weights.
    """
    # get the data
    sentences, word2idx = get_wiki()

    # number of unique words
    vocab_size = len(word2idx)

    # config
    window_size = 5
    learning_rate = 0.025
    final_learning_rate = 0.0001
    # Negative input words drawn per position. The previous code declared 5
    # here but only ever drew one; the value now reflects (and controls) what
    # the loop below actually does.
    num_negatives = 1
    epochs = 20
    D = 50  # word embedding size

    # linear learning rate decay: reaches final_learning_rate after `epochs` steps
    learning_rate_delta = (learning_rate - final_learning_rate) / epochs

    # params
    W = np.random.randn(vocab_size, D)  # input layer to hidden layer
    V = np.random.randn(D, vocab_size)  # hidden to output layer

    # distribution for drawing negative samples
    p_neg = get_negative_sampling_distribution(sentences, vocab_size)

    # one accumulated cost per epoch, for plotting
    costs = []

    # number of total words in corpus
    total_words = sum(len(sentence) for sentence in sentences)
    print("Total number of words in corpus:", total_words)

    # for subsampling each sentence
    threshold = 1e-5
    p_drop = 1 - np.sqrt(threshold / p_neg)
    # Hoisted out of the per-word loop below.
    p_keep = 1 - p_drop

    # train the model
    for epoch in range(epochs):

        # shuffle the sentence order
        np.random.shuffle(sentences)

        # accumulate the cost
        cost = 0
        t0 = datetime.now()

        # `counter` counts every sentence visited (including ones skipped
        # after subsampling) so the progress line can reach len(sentences).
        for counter, sentence in enumerate(sentences, start=1):
            # keep each word with probability p_keep[word]
            kept = [w for w in sentence if np.random.random() < p_keep[w]]

            if len(kept) >= 2:
                # randomly order words so we don't always
                # see samples in the same order
                randomly_ordered_positions = np.random.choice(
                    len(kept), size=len(kept), replace=False,
                )

                for pos in randomly_ordered_positions:
                    # the middle word
                    word = kept[pos]

                    # positive context words for this position
                    context_words = get_context(pos, kept, window_size)
                    if not context_words:
                        # an empty index array cannot be used to index V
                        continue
                    targets = np.array(context_words)

                    # true pair: label 1
                    cost += sgd(word, targets, 1, learning_rate, W, V)

                    # negative samples: a random input word paired with the
                    # same targets, label 0
                    for neg_word in np.random.choice(vocab_size, size=num_negatives, p=p_neg):
                        cost += sgd(neg_word, targets, 0, learning_rate, W, V)

            if counter % 5000 == 0:
                sys.stdout.write(f"Processed {counter} / {len(sentences)} \n")
                sys.stdout.flush()

        dt = datetime.now() - t0
        print('epoch complete:', epoch, "cost:", cost, "dt:", dt)

        # save the cost
        costs.append(cost)

        # update the learning rate
        learning_rate -= learning_rate_delta

    # Save the model before plotting, so a blocking or failing plt.show()
    # cannot lose the trained weights.
    os.makedirs(savedir, exist_ok=True)
    with open(f'{savedir}/word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    np.savez(f"{savedir}/weights.npz", W, V)

    # plot the cost per epoch
    plt.plot(costs)
    plt.xlabel('Epochs')
    plt.ylabel('Costs')
    plt.show()

    # return the model
    return word2idx, W, V

In [0]:
def load_model(model_dir=None):
    """Load a model previously written by train_model.

    Parameters
    ----------
    model_dir : str, optional
        Directory containing word2idx.json and weights.npz. Defaults to the
        module-level `savedir`.

    Returns
    -------
    word2idx : dict
        Word -> index mapping read from word2idx.json.
    W, V : ndarray
        The arrays stored as 'arr_0' and 'arr_1' in weights.npz.
    """
    if model_dir is None:
        model_dir = savedir

    with open(f"{model_dir}/word2idx.json") as f:
        word2idx = json.load(f)

    # np.load on an .npz keeps the archive open until closed; the context
    # manager releases the file handle once both arrays have been read.
    with np.load(f'{model_dir}/weights.npz') as npz:
        W = npz['arr_0']
        V = npz['arr_1']
    return word2idx, W, V

In [0]:
def analogy(pos1, neg1, pos2, neg2, word2idx, idx2word, W):
    """Print how well the embeddings solve  pos1 - neg1 = ? - neg2.

    The query vector is W[pos1] - W[neg1] + W[neg2]. The ten rows of W closest
    to it by cosine distance are printed, and the closest one that is not
    pos1, neg1 or neg2 is reported as the answer. `pos2` is the expected
    answer: it is not used to build the query, only to report its distance.

    Parameters
    ----------
    pos1, neg1, pos2, neg2 : str
        The four words of the analogy.
    word2idx : dict
        Word -> row index of W.
    idx2word : dict
        Row index of W -> word.
    W : ndarray, shape (vocabulary size, embedding size)
        Embedding matrix, one row per word.

    Returns
    -------
    None. Results are printed. If any of the four words is missing from
    `word2idx` a message is printed and nothing else happens.
    """
    V, D = W.shape

    # pos2 not used in calculation, just printing the expected value
    print(f"testing {pos1} - {neg1} = {pos2} - {neg2}")
    for w in (pos1, neg1, pos2, neg2):
        if w not in word2idx:
            print(f"Sorry {w} is not word2idx")
            return

    p1 = W[word2idx[pos1]]
    n1 = W[word2idx[neg1]]
    p2 = W[word2idx[pos2]]
    n2 = W[word2idx[neg2]]

    vec = p1 - n1 + n2

    distances = pairwise_distances(vec.reshape(1, D), W, metric='cosine').reshape(V)
    idx = distances.argsort()[:10]

    # pick the closest word that's not p1, n1, or n2
    # (the lookup table is `word2idx`; the previous `wordidx` was undefined)
    keep_out = {word2idx[w] for w in (pos1, neg1, neg2)}
    best_idx = next((i for i in idx if i not in keep_out), None)

    if best_idx is None:
        # Only possible when the vocabulary is no larger than the query itself.
        print(f"got: no candidate other than {pos1}, {neg1}, {neg2}")
    else:
        print(f"got: {pos1} - {neg1} = {idx2word[best_idx]} - {neg2}")

    print("closest 10:")
    for i in idx:
        print(idx2word[i], distances[i], '\n')

    print(f"Distance to {pos2}: {cos_dist(p2, vec)}")



In [0]:
def test_model(word2idx, W, V):
    #there are multiple ways to get the "final" word embedding
    #We = (W + V.T) / 2
    #We = W
    idx2word = {i:w for w, i in word2idx.items()}

    for We in (W, (W + V.T) / 2):
        print("*" * 30)
        
        analogy('king', 'man', 'queen', 'woman', word2idx, idx2word, We)
        analogy('king', 'prince', 'queen', 'princess', word2idx, idx2word, We)
        analogy('miami', 'florida', 'dallas', 'texas', word2idx, idx2word, We)
        analogy('einstein', 'scientist', 'picasso', 'painter', word2idx, idx2word, We)
        analogy('japan', 'sushi', 'germany', 'bratwurst', word2idx, idx2word, We)
        analogy('man', 'woman', 'he', 'she', word2idx, idx2word, We)
        analogy('man', 'woman', 'uncle', 'aunt', word2idx, idx2word, We)
        analogy('man', 'woman', 'brother', 'sister', word2idx, idx2word, We)
        analogy('man', 'woman', 'husband', 'wife', word2idx, idx2word, We)
        analogy('man', 'woman', 'actor', 'actress', word2idx, idx2word, We)
        analogy('man', 'woman', 'father', 'mother', word2idx, idx2word, We)
        analogy('heir', 'heiress', 'prince', 'princess', word2idx, idx2word, We)
        analogy('nephew', 'niece', 'uncle', 'aunt', word2idx, idx2word, We)
        analogy('france', 'paris', 'japan', 'tokyo', word2idx, idx2word, We)
        analogy('france', 'paris', 'china', 'beijing', word2idx, idx2word, We)
        analogy('february', 'january', 'december', 'november', word2idx, idx2word, We)
        analogy('france', 'paris', 'germany', 'berlin', word2idx, idx2word, We)
        analogy('week', 'day', 'year', 'month', word2idx, idx2word, We)
        analogy('week', 'day', 'hour', 'minute', word2idx, idx2word, We)
        analogy('france', 'paris', 'italy', 'rome', word2idx, idx2word, We)
        analogy('paris', 'france', 'rome', 'italy', word2idx, idx2word, We)
        analogy('france', 'french', 'england', 'english', word2idx, idx2word, We)
        analogy('japan', 'japanese', 'china', 'chinese', word2idx, idx2word, We)
        analogy('china', 'chinese', 'america', 'american', word2idx, idx2word, We)
        analogy('japan', 'japanese', 'italy', 'italian', word2idx, idx2word, We)
        analogy('japan', 'japanese', 'australia', 'australian', word2idx, idx2word, We)
        analogy('walk', 'walking', 'swim', 'swimming', word2idx, idx2word, We)

In [0]:
# Entry point: train a model (train_model also writes it under `savedir`),
# then print the analogy results for the freshly trained weights.
if __name__ == '__main__':
    # word2idx, W and V remain bound at module level after this cell runs.
    word2idx, W, V = train_model(savedir)
    test_model(word2idx, W, V)

Finished counting..
Total number of words in corpus: 86478677
Processed 5000 / 1271558 
Processed 10000 / 1271558 
Processed 15000 / 1271558 
Processed 20000 / 1271558 
Processed 25000 / 1271558 
Processed 30000 / 1271558 
Processed 35000 / 1271558 
Processed 40000 / 1271558 
Processed 45000 / 1271558 
Processed 50000 / 1271558 
Processed 55000 / 1271558 
Processed 60000 / 1271558 
Processed 65000 / 1271558 
Processed 70000 / 1271558 
Processed 75000 / 1271558 
Processed 80000 / 1271558 
Processed 85000 / 1271558 
Processed 90000 / 1271558 
Processed 95000 / 1271558 
Processed 100000 / 1271558 
Processed 105000 / 1271558 
Processed 110000 / 1271558 
Processed 115000 / 1271558 
Processed 120000 / 1271558 
Processed 125000 / 1271558 
Processed 130000 / 1271558 
Processed 135000 / 1271558 
Processed 140000 / 1271558 
Processed 145000 / 1271558 
Processed 150000 / 1271558 
Processed 155000 / 1271558 
Processed 160000 / 1271558 
Processed 165000 / 1271558 
Processed 170000 / 1271558 
Proces