In [5]:
import torch
import torch.nn as nn
import torch.utils.data as tud
from collections import Counter
import numpy as np
import random
from torch.utils.tensorboard import SummaryWriter
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import urllib.request
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import word_tokenize
import sklearn
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

torch.manual_seed(1)

<torch._C.Generator at 0x106daff70>

In [3]:
def read_data(file_path):
    tokenizer = RegexpTokenizer(r'\w+')
    data = urllib.request.urlopen(file_path)
    data = data.read().decode('utf8')
    tokenized_data = word_tokenize(data)
    stop_words = set(stopwords.words('english'))
    stop_words.update(['.',',',':',';','(',')','#','--','...','"'])
    cleaned_words = [ i for i in tokenized_data if i not in stop_words ]
    return(cleaned_words)

In [28]:
def get_words(words):

    words_set = list(set(words))

    words_freq_num = Counter(words).most_common(len(words_set))
    words_freq_p = [freq for word, freq in words_freq_num]
    words_freq_p = np.array(words_freq_p)
    words_freq_p = words_freq_p ** (3. / 4.)
    words_freq_p = words_freq_p / np.sum(words_freq_p)

    words2id = {w: i for i, w in enumerate(words_set)}
    id2words = {i: w for i, w in enumerate(words_set)}

    return words2id, id2words, words_set, words, np.array(words_freq_num), np.array(words_freq_p)

In [6]:
test_sentence = read_data('https://www.gutenberg.org/files/57884/57884-0.txt')

In [35]:
def get_skip_pairs(words, context_size):
    skip_grams = []
    for i, w in enumerate(words):
        context_words = words[max(i - context_size, 0):max(i, 0)] + words[i + 1:i + context_size + 1]
        skip_grams.append((w, context_words))
    return skip_grams

In [8]:
test_sentence[:10]

['\ufeffThe',
 'Project',
 'Gutenberg',
 'EBook',
 'The',
 'Little',
 'Moment',
 'Happiness',
 'Clarence',
 'Budington']

In [9]:
class word_embedding_dataset(tud.Dataset):
    def __init__(self, skip_grams):
        super(word_embedding_dataset, self).__init__()
        self.skip_grams = skip_grams

    def __len__(self):
        return len(self.skip_grams)

    def __getitem__(self, idx):
        return self.skip_grams[idx]

In [11]:
def find_nearest_k(word, k):
    wid = words2id[word]
    w_vec = wordvec[wid]

    similarity = wordvec @ w_vec.T
    sort = np.sort(similarity)[::-1]
    sort_arg = np.argsort(similarity)[::-1]

    result = []
    for i in sort_arg:
        result.append(id2words[i])

    return result[:k]

In [50]:
class embedding_model(nn.Module):
    def __init__(self, voc_size, emb_size):
        super(embedding_model, self).__init__()
        self.voc_size = voc_size
        self.emb_size = emb_size

        init_range = 0.5 / self.emb_size
        self.in_embed = nn.Embedding(num_embeddings=self.voc_size, embedding_dim=emb_size)
        self.in_embed.weight.data.uniform_(-init_range, init_range)
        self.out_embed = nn.Embedding(num_embeddings=self.voc_size, embedding_dim=emb_size)
        self.out_embed.weight.data.uniform_(-init_range, init_range)
        
    
    def forward(self, input_labels, pos_labels, neg_labels):
        
        # shape (1,embedding_size)
        input_embedding = self.in_embed(input_labels)
        # shape (context_size,embedding_size)
        
        pos_embedding = self.out_embed(pos_labels)
        neg_embedding = self.out_embed(neg_labels)

        input_embedding = input_embedding.unsqueeze(2)
        pos_embedding = pos_embedding.unsqueeze(0)
        neg_embedding = neg_embedding.unsqueeze(0)

        pos_dot = torch.bmm(pos_embedding, input_embedding)
        neg_dot = torch.bmm(neg_embedding, -input_embedding)

        # 正样本
        log_pos = torch.sigmoid(pos_dot).sum(1)
        # 负样本
        log_neg = torch.sigmoid(neg_dot).sum(1)

        loss = (-log_pos - log_neg).squeeze()
        return loss
        
        
        

In [51]:
def words2id_func(words):
    return np.array([words2id[w] for w in words])

# Main function 

In [52]:
k = 5
embedding_size = 8
context_size = 2
lr = 1e-2
num_epoch = 800

In [53]:
model = embedding_model(voc_size=voc_size, emb_size=embedding_size)

In [29]:
words2id, id2words, words_set, words, words_freq_num, words_freq_p = get_words(test_sentence)

In [30]:
voc_size = len(words_set)

In [33]:
words_freq_p

array([1.13954053e-02, 1.13954053e-02, 9.54439976e-03, ...,
       2.97322554e-05, 2.97322554e-05, 2.97322554e-05])

In [36]:
skip_grams = get_skip_pairs(test_sentence, context_size=2)

In [39]:
dataset = word_embedding_dataset(skip_grams=skip_grams)

In [41]:
words_freq_p

array([1.13954053e-02, 1.13954053e-02, 9.54439976e-03, ...,
       2.97322554e-05, 2.97322554e-05, 2.97322554e-05])

In [45]:
np.random.multinomial(20, [1/6.]*6, size=1)

array([[2, 5, 3, 5, 1, 4]])

In [46]:
indexs = torch.multinomial(torch.Tensor(words_freq_p), k, replacement=True)

In [54]:
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

In [57]:
word_freq_indexs = {}
for i, (word, freq) in enumerate(words_freq_num):
    word_freq_indexs[word] = i

In [109]:
index = 0

In [113]:
for i, skip_gram in enumerate(dataset):
    center_word = words2id[skip_gram[0]]
    context_words_id = [words2id[word] for word in skip_gram[1]]
    context_words = skip_gram[1]
    p = words_freq_p.copy()
    
    for context in context_words:
        p[word_freq_indexs[context]] = 0
    
    #sampling from negative samples after removing the context words
    neg_words_sample = torch.multinomial(torch.Tensor(p), k, replacement=True)
    
    
    neg_words = words2id_func(words_freq_num[np.asarray(neg_words_sample.data.tolist())][:, 0])
    #print(context_words)
    context_words = words2id_func(context_words)
    #print(neg_words)
    #print(context_words)
    
    
    optimizer.zero_grad()
    
    
    loss = model(torch.LongTensor([center_word]), torch.LongTensor(context_words), torch.LongTensor(neg_words))
    break
    if index % 10000 == 0:
        print(loss.item())
    loss.backward()
    optimizer.step()
    index += 1

In [114]:
loss

tensor(-3.4995, grad_fn=<SqueezeBackward0>)

In [115]:
torch.LongTensor([center_word])

tensor([8092])

In [116]:
torch.LongTensor(context_words)

tensor([9460, 3448])

In [117]:
torch.LongTensor(neg_words)

tensor([11171,  8656,  4494,  6622,  2301])