## Word embedding

使用skip-gram模型训练词向量

In [84]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data

from collections import Counter
import numpy as np
import random

import pandas as pd
import scipy
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

import os

USE_CUDA = torch.cuda.is_available()

random.seed(1)
np.random.seed(1)
torch.manual_seed(1)

if USE_CUDA:
    torch.cuda.manual_seed(1)

In [85]:
# params
C = 3 # context window
K = 100 # num of negative samples
NUM_EPOCHS = 2
MAX_VOCAB_SIZE = 10000
BATCH_SIZE = 32
LEARNING_RATE = 0.02
EMBEDDING_SIZE = 100

SyntaxError: invalid syntax (<ipython-input-85-2a4fed5f9180>, line 8)

In [86]:
!pwd

/Users/xuming06/Codes/python-tutorial


In [87]:
with open('./data/nietzsche.txt', 'r') as f:
    text = f.read()
text = text.split()
print(len(text))
vocab = dict(Counter(text).most_common(MAX_VOCAB_SIZE - 1))
vocab["<unk>"] = len(text) - np.sum(list(vocab.values()))

idx_to_word = {idx:word for idx,word in enumerate(vocab.keys())}
word_to_idx = {word:i for i,word in idx_to_word.items()}

99111


In [88]:
list(idx_to_word.items())[:10]

[(0, 'the'),
 (1, 'of'),
 (2, 'and'),
 (3, 'to'),
 (4, 'in'),
 (5, 'a'),
 (6, 'is'),
 (7, 'that'),
 (8, 'as'),
 (9, 'it')]

In [89]:
list(word_to_idx.items())[:10]

[('the', 0),
 ('of', 1),
 ('and', 2),
 ('to', 3),
 ('in', 4),
 ('a', 5),
 ('is', 6),
 ('that', 7),
 ('as', 8),
 ('it', 9)]

In [102]:
word_counts = np.array([count for count in vocab.values()],dtype=np.float32)
word_freqs = word_counts / np.sum(word_counts)
word_freqs = word_freqs ** (3./4.)
word_freqs = word_freqs / np.sum(word_freqs)
VOCAB_SIZE = len(idx_to_word)
VOCAB_SIZE

10000

## 使用Dataset创建一个自定义数据集

In [103]:
class WordEmbeddingDataset(data.Dataset):
    def __init__(self, text, word_to_idx, idx_to_word, word_freqs, word_counts):
        super(WordEmbeddingDataset, self).__init__()
        self.text_encoded = [word_to_idx.get(word, word_to_idx["<unk>"]) for word in text]
        self.text_encoded = torch.LongTensor(self.text_encoded)
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_freqs = torch.Tensor(word_freqs)
        self.word_counts = torch.Tensor(word_counts)

    def __len__(self):
        return len(self.text_encoded)
    
    def __getitem__(self, index):
        center_word = self.text_encoded[index]
        pos_indices = list(range(index - C)) + list(range(index + 1, index + C + 1))
        pos_indices = [i % len(self.text_encoded) for i in pos_indices]

        pos_words = self.text_encoded[pos_indices]  # 周围单词
        neg_wrods = torch.multinomial(self.word_freqs, pos_words.shape[0], True)

        return center_word, pos_words, neg_wrods

In [104]:
dataset = WordEmbeddingDataset(text, word_to_idx, idx_to_word, word_freqs, word_counts)

dataloader = data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)

In [105]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x12a754e48>

## 定义pytorch模型

In [None]:
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(EmbeddingModel, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size)
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size)
        
    def forward(self, input_labels, pos_labels, neg_labels):
        # input_label: [batch_size]
        
        input_embedding = self.in_embed(input_labels)
        pos_embedding = self.in_embed(pos_labels)
        neg_embedding = self.in_embed(neg_labels)
        
        input_embedding = input_embedding.unsqueeze(2)
        pos_dot = torch.bmm(pos_embedding, input_embedding).squeeze()
        neg_dot = torch.bmm(neg_embedding, input_embedding).squeeze()
        
        log_pos = F.logsigmoid(pos_dot).sum(1)
        log_neg = F.logsigmoid(neg_dot).sum(1)
        
        loss = log_neg + log_pos
        return -loss
    
    def input_embeddings(self):
        return self.in_embed.weight.data.cpu().numpy()

In [100]:
model = EmbeddingModel(VOCAB_SIZE, EMBEDDING_SIZE)

In [101]:
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
for i in range(NUM_EPOCHS):
    for j,(input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        input_labels = input_labels.long()
        pos_labels = pos_labels.long()
        neg_labels = neg_labels.long()
        
        optimizer.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean()
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print('epoch', i, 'iter', j, loss.item())

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/Users/xuming06/Library/Python/3.6/lib/python/site-packages/torch/utils/data/_utils/worker.py", line 178, in _worker_loop
    data = fetcher.fetch(index)
  File "/Users/xuming06/Library/Python/3.6/lib/python/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/Users/xuming06/Library/Python/3.6/lib/python/site-packages/torch/utils/data/_utils/collate.py", line 79, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/Users/xuming06/Library/Python/3.6/lib/python/site-packages/torch/utils/data/_utils/collate.py", line 79, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "/Users/xuming06/Library/Python/3.6/lib/python/site-packages/torch/utils/data/_utils/collate.py", line 55, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 15324 and 98566 in dimension 1 at ../aten/src/TH/generic/THTensor.cpp:689
