In [31]:
import torch
from torch import nn,optim
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
import os 
main_dir = os.path.join("..","input")

In [3]:
with open(os.path.join(main_dir,'text8')) as f:
    data=f.read()

In [4]:
from collections import Counter
def preprocess(text):
    text=text.lower()
    text=text.replace('.','<PERIOD>')
    text=text.replace('""','<QUOTATION_MARK>')
    text=text.replace(',','<COMMA>')
    text=text.replace(';','<SEMICOLON>')
    text=text.replace('!','<EXCLAMATION_MARK>')
    text=text.replace('?','<QUESTION_MARK>')
    text=text.replace('(','<LEFT_PAREN>')
    text=text.replace(')','<RIGHT_PAREN>')
    text=text.replace('-','<HYPHEN>')
    text=text.replace(':','<COLON>')
    words=text.split()
    
    word_count=Counter(words)
    trimmed_words=[word for word in words if word_count[word]>5]
    return trimmed_words

In [5]:
words=preprocess(data)

In [6]:
def create_lookup_tables(words):
    word_counts=Counter(words)
    
    sorted_vocab=sorted(word_counts, key=word_counts.get, reverse=True)
    
    int2vocab={i:word for word,i in enumerate(sorted_vocab)}
    
    vocab2int={word:i for i,word in int2vocab.items()}
    
    return int2vocab, vocab2int

In [7]:
vocab_to_int , int_to_vocab = create_lookup_tables(words)

In [8]:
int_words = [vocab_to_int[word] for word in words]
print(int_words[:30])

[5233, 3080, 11, 5, 194, 1, 3133, 45, 58, 155, 127, 741, 476, 10571, 133, 0, 27349, 1, 0, 102, 854, 2, 0, 15067, 58112, 1, 0, 150, 854, 3580]


SubSampling

In [9]:
import random
threshold=1e-5
total_count = len(int_words)
print(total_count)
word_counts=Counter(int_words)
frequency = {word:count/total_count for word,count in word_counts.items()}
p_drop = {word:1-np.sqrt(threshold/frequency[word]) for word in word_counts}
train_words = [word for word in int_words if (1-p_drop[word])>0.1]

16680599


In [10]:
len(train_words)

9032964

In [11]:
print(train_words[:30])

[5233, 3080, 194, 3133, 155, 127, 741, 476, 10571, 133, 27349, 102, 854, 15067, 58112, 150, 854, 3580, 194, 190, 10712, 214, 1324, 104, 454, 2731, 362, 3672, 708, 371]


Get Batches

In [18]:
def get_target(words,idx, window_size):
    R=np.random.randint(1,window_size+1)
    start=idx-R if idx-R>0 else 0 
    stop=idx+R
    target=words[start:idx]+words[idx+1:stop+1]
    return target
get_target(train_words,5,5)

[5233, 3080, 194, 3133, 155, 741, 476, 10571, 133, 27349]

In [30]:
def get_batches(words, batch_size, window_size):
    n_batches = len(words)//batch_size
    
    words=words[:n_batches*batch_size]
    for idx in range(0,len(words),batch_size):
            x=[]
            y=[]
            batch=words[idx:idx+batch_size]
            for i in range(len(batch)):
                batch_x=batch[i]
                batch_y=get_target(batch,i,window_size)
                x.extend([batch_x]*len(batch_y))
                y.extend(batch_y)
            yield x,y
            
                
                
x,y=next(get_batches(train_words, 4, 5))
print(x)
print(y)

[5233, 3080, 3080, 3080, 194, 194, 194, 3133]
[3080, 5233, 194, 3133, 5233, 3080, 3133, 194]


In [42]:
def cosine_similarity(embedding, valid_size=16, valid_window=100, device='cpu'):
    """ Returns the cosine similarity of validation words with words in the embedding matrix.
        Here, embedding should be a PyTorch embedding module.
    """
    
    # Here we're calculating the cosine similarity between some random words and 
    # our embedding vectors. With the similarities, we can look at what words are
    # close to our random words.
    
    # sim = (a . b) / |a||b|
    
    embed_vectors = embedding.weight
    
    # magnitude of embedding vectors, |b|
    magnitudes = embed_vectors.pow(2).sum(dim=1).sqrt().unsqueeze(0)
    
    # pick N words from our ranges (0,window) and (1000,1000+window). lower id implies more frequent 
    valid_examples = np.array(random.sample(range(valid_window), valid_size//2))
    valid_examples = np.append(valid_examples,
                               random.sample(range(1000,1000+valid_window), valid_size//2))
    valid_examples = torch.LongTensor(valid_examples).to(device)
    
    valid_vectors = embedding(valid_examples)
    similarities = torch.mm(valid_vectors, embed_vectors.t())/magnitudes
        
    return valid_examples, similarities

In [34]:
class SkipGram(nn.Module):
    def __init__(self,vocab,embed):
        super().__init__()
        self.embed=nn.Embedding(vocab,embed)
        self.out=nn.Linear(embed,vocab)
        self.log_soft=nn.LogSoftmax(dim=1)
    def forward(self,x):
        x=self.embed(x)
        x=self.out(x)
        x=self.log_soft(x)
        return x

In [38]:
model=SkipGram(len(vocab_to_int),300)
criterion=nn.NLLLoss()
optimizer=optim.Adam(model.parameters(),lr=0.003)
model.cuda()

SkipGram(
  (embed): Embedding(63641, 300)
  (out): Linear(in_features=300, out_features=63641, bias=True)
  (log_soft): LogSoftmax()
)

In [None]:
epochs=5
print_every=500
steps=0
for e in range(epochs):
    for inputs,targets in get_batches(train_words,512,5):
        steps+=1
        inputs , targets = torch.LongTensor(inputs) , torch.LongTensor(targets)
        inputs , targets = inputs.cuda() , targets.cuda()
        logits = model(inputs)
        loss = criterion(logits , targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if steps%print_every==0:
            valid_examples, valid_similarities = cosine_similarity(model.embed, device='cuda')
            _, closest_idxs = valid_similarities.topk(6) # topk highest similarities
            
            valid_examples, closest_idxs = valid_examples.to('cpu'), closest_idxs.to('cpu')
            for ii, valid_idx in enumerate(valid_examples):
                closest_words = [int_to_vocab[idx.item()] for idx in closest_idxs[ii]][1:]
                print(int_to_vocab[valid_idx.item()] + " | " + ', '.join(closest_words))
            print("...")

will | towering, ewald, gelder, wanderings, pete
after | gemayel, nida, corriere, rabbits, parisian
time | geometrically, codons, downstream, assembl, bends
new | curium, sense, vacant, blight, groton
of | seclusion, targum, kanu, monosodium, detonates
state | sepp, nyc, medline, anjiro, viennese
between | forint, arrhenius, clem, perfumery, weddell
many | acc, informatics, becket, jefferson, doctor
writers | pricking, galahad, weather, joshi, takers
taking | streaming, yonkers, turpin, teshubah, fostered
shown | receivers, despair, falcone, yuen, polytheism
question | ballets, rantissi, ningen, mouthwash, bembo
governor | dashes, eph, cdc, catalog, clairvaux
gold | neonatal, takeo, sarti, nano, obelix
san | votive, accompanied, schemes, unscrupulous, traps
except | mink, kablooie, transducer, talked, bonuses
...
that | crewmembers, multihulls, chico, technical, broadsheet
one | auditioning, ouster, approx, dignity, blumenfeld
called | medications, viennese, ioannes, theorized, atomist

time | codons, geometrically, ibrd, assembl, downstream
that | crewmembers, multihulls, technical, bight, broadsheet
first | activision, solemn, hacker, amiable, active
a | oued, minardi, lilitu, averaged, forte
than | diverted, boac, geckos, francisco, excretion
state | sepp, anjiro, mdi, nyc, canneries
use | antibiotic, dominatrices, mobilization, willoughby, inducing
at | recoilless, eriksson, fabricated, bankhead, transformational
http | planks, subservience, issued, htm, ripper
pre | torts, censor, ems, heckel, fates
cost | vehicles, naturally, knapsack, sixpence, marjoram
magazine | stephen, tonkin, flanders, udeac, youth
prince | brugge, saracens, manpower, john, feinstein
award | repertoire, nerva, alger, efta, times
grand | middle, sativa, mistletoe, keywords, cimmeria
applications | batch, lukewarm, aids, levelled, guides
...
system | baffled, cameroons, offset, plexus, usefully
have | prior, disturbances, sundarbans, ddd, phenol
and | desiderius, shatz, wieser, cara, ufa
som

who | mutation, belgaum, morphogenesis, decontamination, euratom
were | philologie, glimpsed, tissues, janis, kinetochores
there | elstree, contacting, increments, curly, controls
can | nicaea, due, balmaceda, prolog, coeur
of | seclusion, targum, kanu, taylor, up
used | intersexual, circulating, saxophone, hypergiant, stolen
which | filipino, pickwick, trouble, replicate, tcl
also | salla, ourselves, hernandez, directions, motilal
pope | lateran, jj, menezes, bhagavad, council
egypt | laden, dogs, wentworth, venetian, peace
primarily | selection, epilepsy, upper, islamiyah, hateful
brother | jo, spend, confess, alai, salvius
behind | ineffectual, narration, rationalistic, kanda, reversed
additional | uncompressed, constructor, dimensional, muscovy, convened
engine | powerplant, spacesuit, dumping, repeaters, formats
articles | romance, performa, fao, vcs, dreamworks
...
about | approximately, roughly, chicanos, comical, around
which | replicate, pickwick, trouble, filipino, tcl
also |

which | replicate, trouble, filipino, pickwick, tcl
there | elstree, contacting, controls, increments, bragging
nine | aime, nevsky, triassic, oeis, usages
that | crewmembers, multihulls, chico, bight, collaborations
d | fairness, visuals, ucd, tempel, raincoat
an | obe, geddes, creator, aleksandrovich, altos
three | sphingomyelin, nutshell, brink, viswanathan, angers
called | referred, known, named, based, considered
something | what, you, questions, gemeinde, really
animals | species, humans, plants, human, diseases
scale | number, leafy, tdma, rosemont, demeaning
paris | france, switzerland, kitchener, hypatia, de
ice | hollow, goeldi, headmaster, varna, ribs
derived | latin, name, called, abbreviation, emptiness
proposed | notified, nowell, muscovy, rediscovery, caress
square | area, km, walls, arbitron, higham
...
american | samantha, tavola, illustrators, approximating, trivium
often | sometimes, usually, generally, frequently, simply
new | curium, realms, groton, neuromancer, jo

states | nations, kingdom, u, countries, united
who | storing, morphogenesis, subgroups, pva, mutation
or | emphasis, marder, exiguus, capitalization, alcs
are | incisors, width, proffered, anh, hagar
people | deaths, births, americans, natives, things
over | under, nearly, every, off, between
is | enjoyable, authenticate, interphase, yearning, sure
was | vaud, shalom, cupbearer, antiparticle, czapski
accepted | regarded, considered, understandings, although, historians
event | events, obese, slams, celebrations, peltier
additional | uncompressed, triple, adequate, muscovy, galatasaray
derived | latin, word, means, etymology, root
san | francisco, los, california, de, di
shows | show, series, movies, description, pdl
running | run, narrow, runs, drilled, subjection
road | roads, rail, routes, mimicking, touma
...
one | auditioning, ouster, dignity, approx, gland
its | neq, films, irritated, naka, yarmuk
this | hydrate, klima, insurance, szabo, emirates
during | end, early, before, thro