참고 : https://github.com/weberrr/pytorch_word2vec

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.autograd import Variable
from collections import deque
import torch.optim as optim
from tqdm import tqdm

In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


### Huffman tree 쌓기

In [3]:
class HuffmanNode:
    """A single node of the Huffman tree.

    Args:
        word_id: Identifier stored on the node.
        frequency: Weight of the node.
    """

    def __init__(self, word_id, frequency):
        # Identity and weight.
        self.word_id, self.frequency = word_id, frequency
        # Tree links; all of them start out unset.
        self.left_child = self.right_child = None
        self.father = None
        # Branch bits and visited node ids; each node gets its own fresh
        # (initially empty) list.
        self.Huffman_code, self.path = [], []


class HuffmanTree:
    """Huffman tree over the vocabulary, used for hierarchical softmax.

    ``self.huffman`` holds one record per word followed by the inner nodes in
    the order they are created; an inner node's ``word_id`` is its index in
    that list.  Word records are looked up as ``self.huffman[word_id]``, so
    the keys of ``wordid_frequency_dict`` are expected to be ``0 .. n-1`` in
    iteration order.

    Args:
        wordid_frequency_dict: Mapping ``word_id -> frequency``.  An empty
            mapping yields an empty tree (``root`` is ``None``).
    """

    def __init__(self, wordid_frequency_dict):
        self.word_count = len(wordid_frequency_dict)
        self.wordid_code = dict()
        self.wordid_path = dict()
        self.root = None
        # Leaves that get linked into the tree by build_tree.
        unmerge_node_list = [HuffmanNode(wordid, frequency) for wordid, frequency in
                             wordid_frequency_dict.items()]
        # Separate per-word records; merge_node appends the inner nodes here.
        self.huffman = [HuffmanNode(wordid, frequency) for wordid, frequency in
                        wordid_frequency_dict.items()]
        # Build the tree, then derive every word's code and path from it.
        self.build_tree(unmerge_node_list)
        self.generate_huffman_code_and_path()

    def merge_node(self, node1, node2):
        """Create, register and return the parent of ``node1`` and ``node2``.

        The child with the higher frequency becomes the left child; on a tie
        ``node1`` goes left.  The parent's id is the next free index of
        ``self.huffman`` and its frequency is the sum of both children.
        """
        sum_frequency = node1.frequency + node2.frequency
        mid_node_id = len(self.huffman)
        father_node = HuffmanNode(mid_node_id, sum_frequency)
        if node1.frequency >= node2.frequency:
            father_node.left_child = node1
            father_node.right_child = node2
        else:
            father_node.left_child = node2
            father_node.right_child = node1
        self.huffman.append(father_node)
        return father_node

    def build_tree(self, node_list):
        """Merge the two lightest nodes until one root remains.

        A heap replaces a full rescan of the list per merge (O(n log n)
        instead of O(n^2)).  Ties on frequency are broken so that the most
        recently created inner node wins, then the node that comes first in
        ``node_list``.  ``node_list`` itself is not modified.

        Args:
            node_list: Leaf nodes to combine; may be empty, in which case
                ``self.root`` is set to ``None``.
        """
        import heapq  # only this method needs it

        # Entries are (frequency, rank, node).  Ranks are unique, so two
        # nodes are never compared directly.  Leaves get their list position;
        # inner nodes get -1, -2, ... so that a newer inner node sorts before
        # everything else of equal frequency.
        heap = [(node.frequency, rank, node) for rank, node in enumerate(node_list)]
        heapq.heapify(heap)
        next_rank = -1
        while len(heap) > 1:
            lightest = heapq.heappop(heap)[2]
            second_lightest = heapq.heappop(heap)[2]
            father_node = self.merge_node(lightest, second_lightest)
            heapq.heappush(heap, (father_node.frequency, next_rank, father_node))
            next_rank -= 1
        self.root = heap[0][2] if heap else None

    def generate_huffman_code_and_path(self):
        """Assign a Huffman code and an inner-node path to every leaf.

        Going left appends ``1`` to the code, going right appends ``0``; the
        path lists the ids of the inner nodes passed on the way down.  The
        results are stored on ``self.huffman[word_id]`` and in
        ``self.wordid_code`` / ``self.wordid_path``.
        """
        if self.root is None:  # empty vocabulary: nothing to label
            return
        stack = [self.root]
        while stack:
            node = stack.pop()
            # Follow left children down to a leaf, parking each right child
            # on the stack so it is expanded later.
            while node.left_child or node.right_child:
                code = node.Huffman_code
                path = node.path
                node.left_child.Huffman_code = code + [1]
                node.right_child.Huffman_code = code + [0]
                node.left_child.path = path + [node.word_id]
                node.right_child.path = path + [node.word_id]
                stack.append(node.right_child)
                node = node.left_child
            # `node` is a leaf here: publish its code and path.
            word_id = node.word_id
            word_code = node.Huffman_code
            word_path = node.path
            self.huffman[word_id].Huffman_code = word_code
            self.huffman[word_id].path = word_path
            self.wordid_code[word_id] = word_code
            self.wordid_path[word_id] = word_path

    def get_all_pos_and_neg_path(self):
        """Split every word's path by branch direction.

        Returns:
            ``(positive, negative)``: two lists indexed by word id.
            ``positive[w]`` holds the inner-node ids where the path to ``w``
            turns left (code 1), ``negative[w]`` those where it turns right
            (code 0).
        """
        positive = []  # left turns
        negative = []  # right turns
        for word_id in range(self.word_count):
            leaf = self.huffman[word_id]
            pos_id = []
            neg_id = []
            for code, inner_id in zip(leaf.Huffman_code, leaf.path):
                if code == 1:
                    pos_id.append(inner_id)
                else:
                    neg_id.append(inner_id)
            positive.append(pos_id)
            negative.append(neg_id)
        return positive, negative

### input data

In [4]:
class InputData:
    """Corpus reader: builds the vocabulary and Huffman paths, then streams CBOW pairs.

    The corpus is a UTF-8 text file with one sentence per line and
    whitespace-separated words.

    Args:
        input_file_name: Path of the corpus file.
        min_count: Words occurring fewer times than this are dropped.
    """

    def __init__(self, input_file_name, min_count):
        self.input_file_name = input_file_name
        # One handle is kept open: it is read once to build the vocabulary
        # and rewound by get_batch_pairs whenever it reaches the end.
        self.input_file = open(self.input_file_name, encoding="utf-8")
        self.min_count = min_count
        self.wordId_frequency_dict = {}
        self.word_count = 0       # vocabulary size
        self.word_count_sum = 0   # number of kept word occurrences
        self.sentence_count = 0   # number of non-blank lines
        self.id2word_dict = {}
        self.word2id_dict = {}
        self._init_dict()
        self.huffman_tree = HuffmanTree(self.wordId_frequency_dict)
        self.huffman_pos_path, self.huffman_neg_path = self.huffman_tree.get_all_pos_and_neg_path()
        self.word_pairs_queue = deque()

        print('Word Count is:', self.word_count)
        print('Word Count Sum is', self.word_count_sum)
        print('Sentence Count is:', self.sentence_count)
        print('Tree Node is:', len(self.huffman_tree.huffman))

    def close(self):
        """Close the corpus file handle."""
        self.input_file.close()

    def _init_dict(self):
        """Count words in the corpus and assign ids to those kept.

        Fills ``word2id_dict``, ``id2word_dict`` and ``wordId_frequency_dict``
        and updates ``word_count``, ``word_count_sum`` and ``sentence_count``.
        Lines are split on any whitespace, so blank lines and repeated spaces
        never produce an empty-string "word"; blank lines are not counted as
        sentences.
        """
        word_freq = {}
        for line in self.input_file:
            words = line.split()
            if not words:
                continue
            self.word_count_sum += len(words)
            self.sentence_count += 1
            for word in words:
                word_freq[word] = word_freq.get(word, 0) + 1

        word_id = 0
        for per_word, per_count in word_freq.items():
            if per_count < self.min_count:
                # Dropped words no longer count towards the corpus size.
                self.word_count_sum -= per_count
                continue
            self.id2word_dict[word_id] = per_word
            self.word2id_dict[per_word] = word_id
            self.wordId_frequency_dict[word_id] = per_count
            word_id += 1
        self.word_count = len(self.word2id_dict)

    def get_batch_pairs(self, batch_size, window_size):
        """Return the next ``batch_size`` ``(context_ids, target_id)`` pairs.

        Sentences are read sequentially from the corpus, wrapping around to
        the start at end of file.  For the word at position ``i`` the context
        is every in-vocabulary word at positions ``i - window_size + 1`` to
        ``i + window_size - 1`` except ``i`` itself; words without any
        context are skipped.

        Raises:
            RuntimeError: If a complete pass over the corpus yields no pair
                (for example an empty file or ``window_size < 2``), which
                would otherwise loop forever.
        """
        queue = self.word_pairs_queue
        radius = max(window_size - 1, 0)
        queue_len_at_rewind = None
        while len(queue) < batch_size:
            for _ in range(10000):  # refill in bulk to amortise the outer check
                sentence = self.input_file.readline()
                if sentence == '':
                    # End of file ('' is only returned at EOF; a blank line
                    # is '\n').  Give up if the previous full pass added
                    # nothing, otherwise start over from the first line.
                    if queue_len_at_rewind == len(queue):
                        raise RuntimeError(
                            'no training pairs can be generated from %r with window_size=%r'
                            % (self.input_file_name, window_size))
                    queue_len_at_rewind = len(queue)
                    self.input_file.seek(0)
                    continue
                # Out-of-vocabulary words are dropped before windowing.
                wordId_list = [self.word2id_dict[word] for word in sentence.split()
                               if word in self.word2id_dict]
                for i, wordId_w in enumerate(wordId_list):
                    context_ids = (wordId_list[max(0, i - radius):i]
                                   + wordId_list[i + 1:i + 1 + radius])
                    if context_ids:
                        queue.append((context_ids, wordId_w))
        return [queue.popleft() for _ in range(batch_size)]

    def get_pairs(self, pos_pairs):
        """Expand ``(context_ids, target_id)`` pairs along the Huffman paths.

        Returns:
            ``(pos_word_pair, neg_word_pair)``: lists of
            ``(context_ids, inner_node_id)`` tuples, one per inner node on
            the target word's positive / negative path respectively.
        """
        pos_word_pair = []
        neg_word_pair = []
        for pair in pos_pairs:
            context_ids = pair[0]
            target_id = pair[1]
            pos_word_pair.extend(
                (context_ids, node_id) for node_id in self.huffman_pos_path[target_id])
            neg_word_pair.extend(
                (context_ids, node_id) for node_id in self.huffman_neg_path[target_id])
        return pos_word_pair, neg_word_pair

    def evaluate_pairs_count(self, window_size):
        """Return a heuristic figure for the number of training pairs.

        Computed as ``word_count_sum * (2 * window_size - 1)`` minus a
        per-sentence boundary correction of
        ``(sentence_count - 1) * (1 + window_size) * window_size``.
        """
        return self.word_count_sum * (2 * window_size - 1) - (self.sentence_count - 1) * (1 + window_size) * window_size

### CBOW model

In [5]:
class CBOWModel(nn.Module):
    """CBOW model trained with hierarchical softmax.

    ``u_embeddings`` holds the word (context) vectors, ``w_embeddings`` the
    vectors of the Huffman inner nodes.  Both tables have
    ``2 * emb_size - 1`` rows so that every id up to ``2 * emb_size - 2`` is
    valid.

    Args:
        emb_size: Vocabulary size.
        emb_dimension: Length of each embedding vector.
    """

    def __init__(self, emb_size, emb_dimension):
        super(CBOWModel, self).__init__()
        self.emb_size = emb_size
        self.emb_dimension = emb_dimension
        # (number of rows, vector length, sparse gradients)
        self.u_embeddings = nn.Embedding(2 * self.emb_size - 1, self.emb_dimension, sparse=True)
        self.w_embeddings = nn.Embedding(2 * self.emb_size - 1, self.emb_dimension, sparse=True)
        self._init_embedding()

    def _init_embedding(self):
        """Initialise both tables uniformly in ``[-0.5/dim, 0.5/dim]``."""
        int_range = 0.5 / self.emb_dimension
        self.u_embeddings.weight.data.uniform_(-int_range, int_range)
        self.w_embeddings.weight.data.uniform_(-int_range, int_range)

    def _index_tensor(self, ids):
        """Turn a list of ids into a long tensor on the embeddings' device."""
        return torch.tensor(ids, dtype=torch.long, device=self.u_embeddings.weight.device)

    def compute_context_matrix(self, u):
        """Sum the context word vectors of every sample.

        The sum is computed with tensor operations only, so gradients reach
        ``u_embeddings`` (a round trip through NumPy would cut the graph and
        leave the word vectors untrained).

        Args:
            u: List of samples, each a list of context word ids.

        Returns:
            Tensor of shape ``(len(u), emb_dimension)``.
        """
        if len(u) == 0:
            # torch.stack cannot handle an empty list.
            return self.u_embeddings.weight.new_zeros((0, self.emb_dimension))
        summed = [self.u_embeddings(self._index_tensor(per_Xw)).sum(dim=0) for per_Xw in u]
        return torch.stack(summed)

    def forward(self, pos_u, pos_w, neg_u, neg_w):
        """Return the negative log-likelihood of the batch.

        Args:
            pos_u: Context id lists for the left-branch (code 1) decisions.
            pos_w: Inner-node id for each entry of ``pos_u``.
            neg_u: Context id lists for the right-branch (code 0) decisions.
            neg_w: Inner-node id for each entry of ``neg_u``.

        Returns:
            Scalar tensor to be minimised.
        """
        pos_u_emb = self.compute_context_matrix(pos_u)
        pos_w_emb = self.w_embeddings(self._index_tensor(pos_w))
        neg_u_emb = self.compute_context_matrix(neg_u)
        neg_w_emb = self.w_embeddings(self._index_tensor(neg_w))

        # Row-wise dot products.  No squeeze(): with a single pair it would
        # drop the batch dimension and make sum(dim=1) fail.
        pos_score = torch.sum(pos_u_emb * pos_w_emb, dim=1)  # child is the left node
        neg_score = torch.sum(neg_u_emb * neg_w_emb, dim=1)  # child is the right node

        # The per-decision probabilities are in log space, so the path
        # probability is their sum.
        loss = torch.sum(F.logsigmoid(-1 * pos_score)) + torch.sum(F.logsigmoid(neg_score))
        return -1 * loss  # turn maximisation into minimisation

    def save_embedding(self, id2word_dict, file_name):
        """Write the word vectors as text.

        The first line is ``"<emb_size> <emb_dimension>"``; each following
        line is a word and its vector components separated by spaces.

        Args:
            id2word_dict: Mapping ``word_id -> word`` of the rows to export.
            file_name: Output path (written as UTF-8).
        """
        embeddings = self.u_embeddings.weight.detach().cpu().numpy()
        with open(file_name, 'w', encoding='utf-8') as file_output:
            file_output.write('%d %d\n' % (self.emb_size, self.emb_dimension))
            for word_id, word in id2word_dict.items():
                vector = ' '.join(str(x) for x in embeddings[word_id])
                file_output.write('%s %s\n' % (word, vector))

### Word2Vec

In [6]:
# Training hyper-parameters read by the Word2Vec trainer.
WINDOW_SIZE = 4      # window size passed to the pair-generation helpers
BATCH_SIZE = 10      # (context, target) pairs fetched per training step
MIN_COUNT = 1        # minimum frequency for a word to be kept
EMB_DIMENSION = 100  # length of each embedding vector
LR = 0.02            # initial SGD learning rate
NEG_COUNT = 4        # not referenced by the visible code


class Word2Vec:
    """Ties corpus, model and optimizer together and runs CBOW training.

    Args:
        input_file_name: Path of the training corpus.
        output_file_name: Path the embeddings are written to after training.
    """

    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION)
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        """Train for the estimated number of batches, then save the embeddings."""
        print("CBOW Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        # The decay below is linear in the *starting* rate; multiplying the
        # already-decayed rate would compound and shrink it far too fast.
        base_lr = self.lr
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_pairs, neg_pairs = self.data.get_pairs(pos_pairs)

            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]

            self.optimizer.zero_grad()
            # Call the module itself so registered hooks run.
            loss = self.model(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()

            # Refresh the learning rate every 100000 processed pairs.
            if i * BATCH_SIZE % 100000 == 0:
                self.lr = base_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = self.lr

        self.model.save_embedding(self.data.id2word_dict, self.output_file_name)

In [7]:
# Colab only: mount Google Drive at /content/drive so files stored there
# can be opened by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
if __name__ == '__main__':
    # Train on the corpus in the mounted Drive folder; train() writes the
    # embeddings to the output file once it has finished.
    word2vec = Word2Vec(
        input_file_name='/content/drive/MyDrive/DSL/embedding/martinluther.txt',
        output_file_name="word_embedding.txt",
    )
    word2vec.train()

  0%|          | 0/1399 [00:00<?, ?it/s]

Word Count is: 695
Word Count Sum is 2150
Sentence Count is: 54
Tree Node is: 1389
CBOW Training......
pairs_count 13990
batch_count 1399.0


100%|██████████| 1399/1399 [00:12<00:00, 111.28it/s]


In [9]:
# Colab only: hand the saved embedding file to the Colab download helper.
from google.colab import files
files.download("word_embedding.txt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>