In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import time
import argparse, os, sys
from collections import defaultdict
from gensim.models import Word2Vec

In [3]:
class CBoWModel(object):

    def __init__(self, train_fname, embedding_fname, model_fname, embedding_corpus_fname,
                 embedding_method="fasttext", is_weighted=True, average=False, dim=100, tokenizer_name="mecab"):
        # configurations
        make_save_path(model_fname)
        self.dim = dim
        self.average = average
        if is_weighted:
            model_full_fname = model_fname + "-weighted"
        else:
            model_full_fname = model_fname + "-original"
        self.tokenizer = get_tokenizer(tokenizer_name)
        if is_weighted:
            # ready for weighted embeddings
            self.embeddings = self.load_or_construct_weighted_embedding(embedding_fname, embedding_method, embedding_corpus_fname)
            print("loading weighted embeddings, complete!")
        else:
            # ready for original embeddings
            words, vectors = self.load_word_embeddings(embedding_fname, embedding_method)
            self.embeddings = defaultdict(list)
            for word, vector in zip(words, vectors):
                self.embeddings[word] = vector
            print("loading original embeddings, complete!")
        if not os.path.exists(model_full_fname):
            print("train Continuous Bag of Words model")
            self.model = self.train_model(train_fname, model_full_fname)
        else:
            print("load Continuous Bag of Words model")
            self.model = self.load_model(model_full_fname)

    def evaluate(self, test_data_fname, batch_size=3000, verbose=False):
        print("evaluation start!")
        test_data = self.load_or_tokenize_corpus(test_data_fname)
        data_size = len(test_data)
        num_batches = int((data_size - 1) / batch_size) + 1
        eval_score = 0
        for batch_num in range(num_batches):
            batch_sentences = []
            batch_tokenized_sentences = []
            batch_labels = []
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            features = test_data[start_index:end_index]
            for feature in features:
                sentence, tokens, label = feature
                batch_sentences.append(sentence)
                batch_tokenized_sentences.append(tokens)
                batch_labels.append(label)
            preds, curr_eval_score = self.predict_by_batch(batch_tokenized_sentences, batch_labels)
            eval_score += curr_eval_score
        if verbose:
            for sentence, pred, label in zip(batch_sentences, preds, batch_labels):
                print(sentence, ", pred:", pred, ", label:", label)
        print("# of correct:", str(eval_score), ", total:", str(len(test_data)), ", score:", str(eval_score / len(test_data)))

    def predict(self, sentence):
        tokens = self.tokenizer.morphs(sentence)
        sentence_vector = self.get_sentence_vector(tokens)
        scores = np.dot(self.model["vectors"], sentence_vector)
        pred = self.model["labels"][np.argmax(scores)]
        return pred

    def predict_by_batch(self, tokenized_sentences, labels):
        sentence_vectors, eval_score = [], 0
        for tokens in tokenized_sentences:
            sentence_vectors.append(self.get_sentence_vector(tokens))
        scores = np.dot(self.model["vectors"], np.array(sentence_vectors).T)
        preds = np.argmax(scores, axis=0)
        for pred, label in zip(preds, labels):
            if self.model["labels"][pred] == label:
                eval_score += 1
        return preds, eval_score

    def get_sentence_vector(self, tokens):
        vector = np.zeros(self.dim)
        for token in tokens:
            if token in self.embeddings.keys():
                vector += self.embeddings[token]
        if not self.average:
            vector /= len(tokens)
        vector_norm = np.linalg.norm(vector)
        if vector_norm != 0:
            unit_vector = vector / vector_norm
        else:
            unit_vector = np.zeros(self.dim)
        return unit_vector

    def load_or_tokenize_corpus(self, fname):
        data = []
        if os.path.exists(fname + "-tokenized"):
            with open(fname + "-tokenized", "r") as f1:
                for line in f1:
                    sentence, tokens, label = line.strip().split("\u241E")
                    data.append([sentence, tokens.split(), label])
        else:
            with open(fname, "r") as f2, open(fname + "-tokenized", "w") as f3:
                for line in f2:
                    sentence, label = line.strip().split("\u241E")
                    tokens = self.tokenizer.morphs(sentence)
                    data.append([sentence, tokens, label])
                    f3.writelines(sentence + "\u241E" + ' '.join(tokens) + "\u241E" + label + "\n")
        return data

    def compute_word_frequency(self, embedding_corpus_fname):
        total_count = 0
        words_count = defaultdict(int)
        with open(embedding_corpus_fname, "r") as f:
            for line in f:
                tokens = line.strip().split()
                for token in tokens:
                    words_count[token] += 1
                    total_count += 1
        return words_count, total_count

    def load_word_embeddings(self, vecs_fname, method):
        if method == "word2vec":
            model = Word2Vec.load(vecs_fname)
            words = model.wv.index2word
            vecs = model.wv.vectors
        else:
            words, vecs = [], []
            with open(vecs_fname, 'r', encoding='utf-8') as f1:
                if "fasttext" in method:
                    next(f1)  # skip head line
                for line in f1:
                    if method == "swivel":
                        splited_line = line.replace("\n", "").strip().split("\t")
                    else:
                        splited_line = line.replace("\n", "").strip().split(" ")
                    words.append(splited_line[0])
                    vec = [float(el) for el in splited_line[1:]]
                    vecs.append(vec)
        return words, vecs

    def load_or_construct_weighted_embedding(self, embedding_fname, embedding_method, embedding_corpus_fname, a=0.0001):
        dictionary = {}
        if os.path.exists(embedding_fname + "-weighted"):
            # load weighted word embeddings
            with open(embedding_fname + "-weighted", "r") as f2:
                for line in f2:
                    word, weighted_vector = line.strip().split("\u241E")
                    weighted_vector = [float(el) for el in weighted_vector.split()]
                    dictionary[word] = weighted_vector
        else:
            # load pretrained word embeddings
            words, vecs = self.load_word_embeddings(embedding_fname, embedding_method)
            # compute word frequency
            words_count, total_word_count = self.compute_word_frequency(embedding_corpus_fname)
            # construct weighted word embeddings
            with open(embedding_fname + "-weighted", "w") as f3:
                for word, vec in zip(words, vecs):
                    if word in words_count.keys():
                        word_prob = words_count[word] / total_word_count
                    else:
                        word_prob = 0.0
                    weighted_vector = (a / (word_prob + a)) * np.asarray(vec)
                    dictionary[word] = weighted_vector
                    f3.writelines(word + "\u241E" + " ".join([str(el) for el in weighted_vector]) + "\n")
        return dictionary

    def train_model(self, train_data_fname, model_fname):
        model = {"vectors": [], "labels": [], "sentences": []}
        train_data = self.load_or_tokenize_corpus(train_data_fname)
        with open(model_fname, "w") as f:
            for sentence, tokens, label in train_data:
                tokens = self.tokenizer.morphs(sentence)
                sentence_vector = self.get_sentence_vector(tokens)
                model["sentences"].append(sentence)
                model["vectors"].append(sentence_vector)
                model["labels"].append(label)
                str_vector = " ".join([str(el) for el in sentence_vector])
                f.writelines(sentence + "\u241E" + " ".join(tokens) + "\u241E" + str_vector + "\u241E" + label + "\n")
        return model

    def load_model(self, model_fname):
        model = {"vectors": [], "labels": [], "sentences": []}
        with open(model_fname, "r") as f:
            for line in f:
                sentence, _, vector, label = line.strip().split("\u241E")
                vector = np.array([float(el) for el in vector.split()])
                model["sentences"].append(sentence)
                model["vectors"].append(vector)
                model["labels"].append(label)
        return model

책에서 CBowModel의 핵심이라고 소개한  코드 4-43, 4-44 위주로 살펴보자.

### 코드 4-43

In [4]:
def compute_word_frequency(self, embedding_corpus_fname):
        total_count = 0
        words_count = defaultdict(int)
        with open(embedding_corpus_fname, 'r') as f:
            for line in f:
                tokens = line.strip().split()
                for token in tokens:
                    words_count[token] += 1
                    total_count += 1
        return words_count, total_count

문장을 하나씩 읽으면서, 토큰으로 나눈 뒤 
등장 순서에 관계없이 사전형태를 만든다.

### 코드 4-44

In [5]:
def load_or_construct_weighted_embedding(self, embedding_fname,
                                            embedding_method,
                                            embedding_corpus_fname, a=0.0001):
        dictionary = {}
        
        # 가중임베딩을 만든적이 있다면 계산하지 않고 계산한 임베딩을 불러온다.
        if os.path.exists(embedding_fname + '-weighted'):
            # load weighted word embeddings
            with open(embedding_fname + '-weighted', 'r') as f2:
                for line in f2:
                    word, weighted_vector = line.strip().split('\u241E')
                    weighted_vector = \
                        [float(el) for el in weighted_vector.split()]
                    dictionary[word] = weighted_vector
                    
        # 여기서 부터 하나씩 뜯어보자!
        else:
            # load pretrained word embeddings
            words, vecs = self.load_word_embeddings(embedding_fname, embedding_method)

            # compute word frequency
            words_count, total_count = compute_word_frequency(embedding_corpus_fname)
            
            # construct weighted word embeddings
            with open(embeding_fname + '-weighted', 'w') as f3:
                for word, vec in zip(words, vecs):
                    if word in words_count.keys():
                        word_prob = words_count[word] / total_count
                    else:
                        word_prob = 0.0
                    weighted_vector = ( a/ (word_prob + a) ) * np.asarray(vec)
                    dictionary[word] = weighted_vector
                    f3.writelines(word + '\u241E' + " ".join([str(el) for el in weighted_vector]) + "\n")
        return dictionary

In [6]:
# words, vecs = self.load_word_embeddings(embedding_fname, embedding_method)

def load_word_embeddings(self, vecs_fname, method):
    if method == 'word2vec':
        model = Word2Vec.load(vecs_fname)
        words = model.wv.index2word
        vecs = model.wv.vectors
    else:
        words, vecs = [], []
        with open(vecs_fname, 'r', encoding='utf-8') as f1:
            if 'fasttext' in method:
                next(f1) # skip head line
            for line in f1:
                if method == 'swivel':
                    splited_line = line.replace('\n', '').strip().split('\t')
                else:
                    splited_line = line.replace('\n', '').strip().split(" ")
                words.append(splited_line[0])
                vec = [float(el) for el in splited_line[1:]]
                vecs.append(vec)
    return words, vecs

각 method에 따른 임베딩 백터를 불러오는 방법이다.

word2vec, fasttext, swivel에 방법에 따라서 split할떄, 약간의 차이가 있는것으로 보여진다.

In [7]:
# construct weighted word embeddings
# with open(embeding_fname + '-weighted', 'w') as f3:
#     for word, vec in zip(words, vecs):
#         if word in words_count.keys():
#             word_prob = words_count[word] / total_count
#         else:
#             word_prob = 0.0
#         weighted_vector = ( a/ (word_prob + a) ) * np.asarray(vec)
#         dictionary[word] = weighted_vector
#         f3.writelines(word + '\u241E' + " ".join([str(el) for el in weighted_vector]) + "\n")
# return dictionary

words, vecs 는 단어에 따른 임베딩 벡터들이다.

words_count.keys()는 **compute_word_frequency**에서 계산한  단어: 단어 빈도수의 사전형태이다

word_prob는 words 중 전체 빈도수에서 단어 word가 나온 빈도수를 확률로 나타낸 값

weighted_vector에서 단어 등장확률을 반영한 가중치를 곱해준다. (a = 0.0001로 상수이다.)

다시 dictionary에서 단어: 가중 임베딩 형태로 return해준다.

### Training

에러가 나서 아래와 같이 코드에서 약간의 수정을 했다.

In [8]:
#책의 환경이 linux이다 보니, mecab을 쓰기 위해 아래와 같이 수정

from konlpy.tag import Okt, Komoran, Hannanum, Kkma
from eunjeon import Mecab

def get_tokenizer(tokenizer_name):

    tokenizer_dict={
        'komoran':Komoran(),
        'okt':Okt(),
        'mecab':Mecab(),
        'hannanum':Hannanum(),
        'kkma':Kkma()
    }
    try:
        tokenizer=tokenizer_dict[tokenizer_name]
    except:
        tokenizer=Mecab()
    return tokenizer

In [9]:
# 이 부분은 이상하게 에러가 나서, word2vec만 남겨두었다.

# def load_word_embeddings(vecs_fname, method):
#     if method == 'word2vec':
#         model = Word2Vec.load(vecs_fname)
#         words = model.wv.index2word
#         vecs = model.wv.vectors
#     return words, vecs

In [73]:
from io import open

class CBoWModel(object):

    def __init__(self, train_fname, embedding_fname, model_fname, embedding_corpus_fname,
                 embedding_method="fasttext", is_weighted=True, average=False, dim=100, tokenizer_name="mecab"):
        # configurations
        make_save_path(model_fname)
        self.dim = dim
        self.average = average
        if is_weighted:
            model_full_fname = model_fname + "-weighted"
        else:
            model_full_fname = model_fname + "-original"
        self.tokenizer = get_tokenizer(tokenizer_name)
        if is_weighted:
            # ready for weighted embeddings
            self.embeddings = self.load_or_construct_weighted_embedding(embedding_fname, embedding_method, embedding_corpus_fname)
            print("loading weighted embeddings, complete!")
        else:
            # ready for original embeddings
            words, vectors = self.load_word_embeddings(embedding_fname, embedding_method)
            self.embeddings = defaultdict(list)
            for word, vector in zip(words, vectors):
                self.embeddings[word] = vector
            print("loading original embeddings, complete!")
        if not os.path.exists(model_full_fname):
            print("train Continuous Bag of Words model")
            self.model = self.train_model(train_fname, model_full_fname)
        else:
            print("load Continuous Bag of Words model")
            self.model = self.load_model(model_full_fname)

    def evaluate(self, test_data_fname, batch_size=3000, verbose=False):
        print("evaluation start!")
        test_data = self.load_or_tokenize_corpus(test_data_fname)
        data_size = len(test_data)
        num_batches = int((data_size - 1) / batch_size) + 1
        eval_score = 0
        for batch_num in range(num_batches):
            batch_sentences = []
            batch_tokenized_sentences = []
            batch_labels = []
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            features = test_data[start_index:end_index]
            for feature in features:
                sentence, tokens, label = feature
                batch_sentences.append(sentence)
                batch_tokenized_sentences.append(tokens)
                batch_labels.append(label)
            preds, curr_eval_score = self.predict_by_batch(batch_tokenized_sentences, batch_labels)
            eval_score += curr_eval_score
        if verbose:
            for sentence, pred, label in zip(batch_sentences, preds, batch_labels):
                print(sentence, ", pred:", pred, ", label:", label)
        print("# of correct:", str(eval_score), ", total:", str(len(test_data)), ", score:", str(eval_score / len(test_data)))

    def predict(self, sentence):
        tokens = self.tokenizer.morphs(sentence)
        sentence_vector = self.get_sentence_vector(tokens)
        scores = np.dot(self.model["vectors"], sentence_vector)
        pred = self.model["labels"][np.argmax(scores)]
        return pred

    def predict_by_batch(self, tokenized_sentences, labels):
        sentence_vectors, eval_score = [], 0
        for tokens in tokenized_sentences:
            sentence_vectors.append(self.get_sentence_vector(tokens))
        scores = np.dot(self.model["vectors"], np.array(sentence_vectors).T)
        preds = np.argmax(scores, axis=0)
        for pred, label in zip(preds, labels):
            if self.model["labels"][pred] == label:
                eval_score += 1
        return preds, eval_score

    def get_sentence_vector(self, tokens):
        vector = np.zeros(self.dim)
        for token in tokens:
            if token in self.embeddings.keys():
                vector += self.embeddings[token]
        if not self.average:
            vector /= len(tokens)
        vector_norm = np.linalg.norm(vector)
        if vector_norm != 0:
            unit_vector = vector / vector_norm
        else:
            unit_vector = np.zeros(self.dim)
        return unit_vector

    def load_or_tokenize_corpus(self, fname):
        data = []
        if os.path.exists(fname + "-tokenized"):
            with open(fname + "-tokenized", "r") as f1:
                for line in f1:
                    sentence, tokens, label = line.strip().split("\u241E")
                    data.append([sentence, tokens.split(), label])
        else:
            with open(fname, "r") as f2, open(fname + "-tokenized", "w") as f3:
                for line in f2:
                    sentence, label = line.strip().split("\u241E")
                    tokens = self.tokenizer.morphs(sentence)
                    data.append([sentence, tokens, label])
                    f3.writelines(sentence + "\u241E" + ' '.join(tokens) + "\u241E" + label + "\n")
        return data


    def compute_word_frequency(self, embedding_corpus_fname):
        total_count = 0
        words_count = defaultdict(int)
        with open(embedding_corpus_fname, "r") as f:
            for line in f:
                tokens = line.strip().split()
                for token in tokens:
                    words_count[token] += 1
                    total_count += 1
        return words_count, total_count

    def load_word_embeddings(self, vecs_fname, method):
        if method == "word2vec":
            model = Word2Vec.load(vecs_fname)
            words = model.wv.index2word
            vecs = model.wv.vectors
        return words, vecs

    def load_or_construct_weighted_embedding(self, embedding_fname, embedding_method, embedding_corpus_fname, a=0.0001):
        dictionary = {}
        if os.path.exists(embedding_fname + "-weighted"):
            # load weighted word embeddings
            with open(embedding_fname + "-weighted", "r") as f2:
                for line in f2:
                    word, weighted_vector = line.strip().split("\u241E")
                    weighted_vector = [float(el) for el in weighted_vector.split()]
                    dictionary[word] = weighted_vector
        else:
            # load pretrained word embeddings
            words, vecs = self.load_word_embeddings(embedding_fname, embedding_method)
            # compute word frequency
            words_count, total_word_count = self.compute_word_frequency(embedding_corpus_fname)
            # construct weighted word embeddings
            with open(embedding_fname + "-weighted", "w") as f3:
                for word, vec in zip(words, vecs):
                    if word in words_count.keys():
                        word_prob = words_count[word] / total_word_count
                    else:
                        word_prob = 0.0
                    weighted_vector = (a / (word_prob + a)) * np.asarray(vec)
                    dictionary[word] = weighted_vector
                    f3.writelines(word + "\u241E" + " ".join([str(el) for el in weighted_vector]) + "\n")
        return dictionary

    def train_model(self, train_data_fname, model_fname):
        model = {"vectors": [], "labels": [], "sentences": []}
        train_data = self.load_or_tokenize_corpus(train_data_fname)
        with open(model_fname, "w") as f:
            for sentence, tokens, label in train_data:
                tokens = self.tokenizer.morphs(sentence)
                sentence_vector = self.get_sentence_vector(tokens)
                model["sentences"].append(sentence)
                model["vectors"].append(sentence_vector)
                model["labels"].append(label)
                str_vector = " ".join([str(el) for el in sentence_vector])
                f.writelines(sentence + "\u241E" + " ".join(tokens) + "\u241E" + str_vector + "\u241E" + label + "\n")
        return model

    def load_model(self, model_fname):
        model = {"vectors": [], "labels": [], "sentences": []}
        with open(model_fname, "r") as f:
            for line in f:
                sentence, _, vector, label = line.strip().split("\u241E")
                vector = np.array([float(el) for el in vector.split()])
                model["sentences"].append(sentence)
                model["vectors"].append(vector)
                model["labels"].append(label)
        return model

In [67]:
def make_save_path(full_path):
    if full_path[:4] == "data":
        full_path = os.path.join(os.path.abspath("."), full_path)
    model_path = '/'.join(full_path.split("/")[:-1])
    if not os.path.exists(model_path):
        os.makedirs(model_path)

In [68]:
train_fname = '../data/processed/processed_ratings_train.txt'
embedding_fname = '../data/word-embeddings/word2vec/word2vec'
model_fname = '../data/word-embeddings/cbow/word2vec'
embedding_corpus_fname = '../data/tokenized/corpus_mecab.txt'

In [74]:
model = CBoWModel(train_fname, embedding_fname, model_fname, embedding_corpus_fname,
                  embedding_method='word2vec', is_weighted=False, average=False, dim=100, tokenizer_name='mecab')

loading original embeddings, complete!
train Continuous Bag of Words model


학습은 되는데 평가가 되지 않는다.

아무래도 encoding 문제로 생각된다.

@load_or_tokenize_corpus에서 split할때 자꾸 에러가 뜬다.

In [75]:
test_corpus_path = '../data/processed/processed_ratings_test.txt'

model.evaluate(test_corpus_path, 3000, False)

evaluation start!


TypeError: zip argument #1 must support iteration