In [1]:
import torch
import torch.nn as nn

import random
import pandas as pd
import numpy as np
random.seed(53113)
np.random.seed(53113)
torch.manual_seed(53113)
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    torch.cuda.manual_seed(53113)
    
C = 3
K = 100
MAX_VOCAB_SIZE = 30000
EMBEDDING_SIZE = 100  # 一般而言，2**EMBEDDIG_SIZE > MAX_VOCAB_SIZE

NUM_EPOCHS = 10
BATCH_SIZE = 64
LEARNING_RATE = 0.1

DEVICE = ("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

'cuda'

# BuildVocab

In [2]:
from collections import Counter

class BuildVocab(object):
    """build vocabilary based on text, 

    Args:
        :param: text: build vocabilary based on text
        :param: MAX_VOCAB_SIZE: the size of vocabilary 
        :param: EMBEDDIG_SIZE: embedding vocabilary to embedding_weights of EMBEDDIG_SIZE
        :return:
    """
    def __init__(self, MAX_VOCAB_SIZE,EMBEDDING_SIZE):
        self.VOCAB_SIZE = MAX_VOCAB_SIZE
        self.EMBEDDIG_SIZE = EMBEDDING_SIZE
        self.vocab = dict()
        self.idx_to_word = []
        self.word_to_idx = dict()

    def word_tokenize(self, text):
        return text.split()

    def build_vocab(self,text):
        text = [w for w in self.word_tokenize(text.lower())]
        self.vocab = dict(Counter(text).most_common(self.VOCAB_SIZE - 1)) # 统计常见词的词频
        self.vocab["<unk>"] = len(text) - np.sum(list(self.vocab.values()))  # 不常见的词都设为unk

        self.idx_to_word = [word for word in self.vocab.keys()]
        self.word_to_idx = {word:idx for idx, word in enumerate(self.idx_to_word)}

        self.VOCAB_SIZE = len(self.idx_to_word)

    def word_freqs(self):
        if len(self.vocab) == 0:
            raise ValueError("vocab no words")

        word_counts = np.array([count for count in self.vocab.values()])
        word_freqs = word_counts / np.sum(word_counts)
        word_freqs = word_freqs ** (3./4.)
        word_freqs = word_freqs / np.sum(word_freqs)
        return word_freqs

    

In [3]:
    train_file = "../data/text8/text8.train.txt"
    with open(train_file, "r") as fin:
        train_data = fin.read()

    vocab = BuildVocab(MAX_VOCAB_SIZE,EMBEDDING_SIZE)
    vocab.build_vocab(train_data) # 得到 vocab,word_to_idx,idx_to_word
    word_freqs = vocab.word_freqs()

    word_to_idx, idx_to_word = vocab.word_to_idx, vocab.idx_to_word
    VOCAB_SIZE = vocab.VOCAB_SIZE

# WordEmbeddingDataset

In [4]:
from torch.utils.data import Dataset, DataLoader
NUM_WORKERS = 2
class WordEmbeddingDataset(Dataset):
    """difine the dataset to training model 
    
    Arg:
        text: a list of words, all text from the training dataset
        word_to_idx: the dictionary from word to idx
        idx_to_word: idx to word mapping
        word_freqs: the frequency of each word
        word_counts: the word counts
        C: the window size of context form text
        K: numbers of multiple when sampling negative samples
        return: center_word, pos_words, neg_words
    """
    def __init__(self, text, VOCAB_SIZE, word_to_idx, idx_to_word, word_freqs, C, K):
        super(WordEmbeddingDataset, self).__init__()
        # self.text_encoded = [word_to_idx.get(word, word_to_idx["<unk>"]) for word in text]
        self.text_encoded = [word_to_idx.get(t, VOCAB_SIZE-1) for t in text]

        self.text_encoded = torch.LongTensor(self.text_encoded)
        self.word_freqs = torch.Tensor(word_freqs)
        
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        
        self.C = C
        self.K = K

    def __len__(self):
        return len(self.text_encoded)

    def __getitem__(self, idx):
        center_word = self.text_encoded[idx]
        pos_idx = list(range(idx-self.C, idx)) + list(range(idx+1, idx+self.C+1))
        pos_idx = [i%len(self.text_encoded) for i in pos_idx] # 1
        pos_words = self.text_encoded[pos_idx]
#         neg_idx = torch.multinomial(self.word_freqs, self.K*pos_words.shape[0])
#         neg_words = self.text_encoded[neg_idx]
        neg_words = torch.multinomial(self.word_freqs, self.K*pos_words.shape[0])
        return center_word, pos_words, neg_words
    

In [5]:
    dataset = WordEmbeddingDataset(train_data, VOCAB_SIZE, word_to_idx, idx_to_word, word_freqs, C, K)
    dataloader = DataLoader(dataset = dataset,
                            batch_size = BATCH_SIZE,
                            num_workers = NUM_WORKERS,
                            shuffle = True)

# EmbeddingModel

In [6]:
import torch.nn as nn
import torch.nn.functional as F

class EmbeddingModel(nn.Module):
    """Skip-gram model 
    """
    def __init__(self, vocab_size, embedding_size):
        super(EmbeddingModel,self).__init__()

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        self.in_embed = nn.Embedding(self.vocab_size, self.embedding_size)
        self.out_embed = nn.Embedding(self.vocab_size, self.embedding_size)
        # 模型参数初始化
        init_weight = 0.5 / self.embedding_size
        self.in_embed.weight.data.uniform_(-init_weight, init_weight)
        self.out_embed.weight.data.uniform_(-init_weight, init_weight)

    def forward(self, center_word, pos_words, neg_words):
        center_embedding = self.in_embed(center_word) # [batch_size, embedding_size]
        pos_embedding = self.out_embed(pos_words)     # [batch_size, 2C, embedding_size]
        neg_embedding = self.out_embed(neg_words)     # [batch_size, 2C*K, embedding_size]
        # unsqueeze(dim)  在第dim个维度增加一个维度
        center_unsqueeze = center_embedding.unsqueeze(2)      # [batch_size, embedding_size, 1]
        log_pos = torch.bmm(pos_embedding, center_unsqueeze)  # [batch_size, 2C, 1]
        log_neg = torch.bmm(-neg_embedding, center_unsqueeze) # [batch_size, 2C*K, 1]
        # squeeze() 去掉维度是1的维度
        log_pos = log_pos.squeeze()  # [batch_size, 2C]
        log_neg = log_neg.squeeze()  # [batch_size, 2C*K]

        log_pos = F.logsigmoid(log_pos).sum(1)
        log_neg = F.logsigmoid(log_neg).sum(1)

        loss = log_pos + log_neg

        return -loss

    def input_embedding(self,):
        return self.in_embed.weight.data.cpu().numpy()

    def output_embedding(self,):
        return self.out_embed.weight.data.cpu().numpy()

In [7]:
    model = EmbeddingModel(VOCAB_SIZE, EMBEDDING_SIZE)
    model = model.to(DEVICE)

# train

In [8]:
from torch.optim import SGD, Adam

def model_train(model, dataloader, EMBEDDING_SIZE, LEARNING_RATE,NUM_EPOCHS, vnum = 1000):
    optimizer = SGD(model.parameters(), lr = LEARNING_RATE)
    for e in range(NUM_EPOCHS):
        for i, (center, pos, neg) in enumerate(dataloader):
            center = center.to(DEVICE)
            pos = pos.to(DEVICE)
            neg = neg.to(DEVICE)

            optimizer.zero_grad()
            loss = model(center, pos, neg).mean()
            loss.backward()
            optimizer.step()
            
            if i % vnum == 0:
                print("epoch: {}, iter: {}, loss: {}".format(e,i,loss.item()))
        torch.save(model.state_dict(), "embedding_weights-{}.th".format(EMBEDDING_SIZE))

In [9]:
model = EmbeddingModel(VOCAB_SIZE, EMBEDDING_SIZE)
if USE_CUDA:
    model = model.cuda()
    
model_train(model, dataloader, EMBEDDING_SIZE, LEARNING_RATE, NUM_EPOCHS, vnum = 1000)

epoch: 0, iter: 0, loss: 420.0474853515625
epoch: 0, iter: 1000, loss: 20.99519157409668


KeyboardInterrupt: 

In [10]:
import gc
gc.collect()

0