In [1]:
#coding:utf-8
import numpy as np
import torch as t
from nltk.tokenize import word_tokenize
from torch import nn
from tqdm import tqdm

In [2]:
# Make the project directory importable from this notebook.
# NOTE(review): this is an absolute, machine-specific path, so the cell fails on any other
# machine; consider deriving it from the notebook location or a configurable setting.
import sys
sys.path.append('C:/Users/SEONGGYUN/ATAE-LSTM/ATAE-LSTM')
# NOTE(review): presumably registers an import hook so .ipynb files can be imported as
# modules -- confirm against the Ipynb_importer source.
import Ipynb_importer
# `opt` supplies the settings read below: embedding_root, word_max_input, hidden_size,
# epsilon, max_seq_len and word_independence.
from config import opt

In [9]:
class Emb(object):
    """Vocabulary and embedding table built on top of pre-trained word vectors.

    Attributes (all created in ``__init__``):
        embedding:     dict, string word ==> np.ndarray vector of length ``opt.hidden_size``
        dictionary:    dict, string word ==> int index
        words:         list, int index ==> string word
        no_pretrained: dict, string word ==> int appearance count, for words that have no
                       pre-trained vector and have not yet been given their own index

    Index 0 is always "<PADDING>" and index 1 is always "<UNKNOWN>".
    """

    def __init__(self):
        """Load up to ``opt.word_max_input`` rows from ``opt.embedding_root``.

        Each row is expected to look like ``"a 0.1 0.2 0.3 ..."`` with exactly
        ``opt.hidden_size`` numbers after the token. Rows that do not match are
        skipped and counted as failures. A summary line is printed at the end.
        """
        # self.embedding   string word ==> np.ndarray vector
        self.embedding = {}

        line_success = 0
        line_fail = 0
        with open(opt.embedding_root, 'r', encoding='UTF-8') as f:
            # zip() ends at whichever runs out first, so reading stops at the row limit
            # or at end of file, and rows are parsed as they are read instead of being
            # buffered in a list first.
            rows = zip(range(opt.word_max_input), f)
            for _, line in tqdm(rows, total=opt.word_max_input):
                # Drop the line terminator and any trailing spaces (a trailing space would
                # otherwise add an empty field and make the row fail the length check).
                # Split on the plain space only: a bare split() would also break tokens
                # that contain other whitespace characters.
                token_and_vector = line.rstrip(' \r\n').split(' ')
                if len(token_and_vector) != opt.hidden_size + 1:
                    line_fail += 1
                    continue
                token, vector = token_and_vector[0], token_and_vector[1:]
                try:
                    values = np.array(vector, dtype=float)
                except ValueError:
                    # a non-numeric component: skip this row instead of aborting the load
                    line_fail += 1
                    continue
                # NOTE(review): tokens are lower-cased, so when the file holds several
                # casings of one word the last row read overwrites the earlier ones.
                self.embedding[token.lower()] = values
                line_success += 1
        print('Embedding : successfully input {} pretrained word embeddings while {} failed'.format(line_success, line_fail))

        # create the items to modify and use dynamically below
        # self.dictionary    string word ==> int index
        # self.words         int index ==> string word
        # self.no_pretrained string word ==> int appearance
        self.dictionary = {"<PADDING>": 0, "<UNKNOWN>": 1}
        self.words = ["<PADDING>", "<UNKNOWN>"]
        self.no_pretrained = {}

        # padding is the zero vector, the shared unknown vector is drawn from U(-epsilon, epsilon)
        self.embedding["<PADDING>"] = np.zeros(opt.hidden_size, dtype=float)
        self.embedding["<UNKNOWN>"] = np.random.uniform(-opt.epsilon, opt.epsilon, opt.hidden_size)

    def _get_dic_(self):
        """Return the word ==> index dict (the live object, not a copy)."""
        return self.dictionary

    def _get_words_(self):
        """Return the index ==> word list (the live object, not a copy)."""
        return self.words

    def tokenize(self, sentence, max_length = opt.max_seq_len):
        """Turn a sentence into a fixed-length tensor of word indices.

        para sentence   : a string to be tokenized by nltk.tokenize.word_tokenize
        para max_length : length of the returned tensor; shorter inputs are right-padded
                          with the "<PADDING>" index
        return          : 1-D int64 tensor of length max_length; words missing from
                          self.dictionary map to the "<UNKNOWN>" index
        raises          : AssertionError if the sentence has more than max_length tokens
        """
        unknown_index = self.dictionary["<UNKNOWN>"]
        indices = [self.dictionary.get(word.lower(), unknown_index)
                   for word in word_tokenize(sentence)]
        assert len(indices) <= max_length, "the input sentence exceeded the max_length: {}>{}".format(len(indices), max_length)
        indices += [self.dictionary["<PADDING>"]]*(max_length-len(indices))
        # Build the integer tensor directly: going through a float32 tensor first cannot
        # represent every index above 2**24 exactly.
        return t.tensor(indices, dtype=t.long)

    def _make_layer_(self):
        """Build a trainable nn.Embedding whose row i is the vector of self.words[i]."""
        # Convert through one contiguous float32 array; handing torch a Python list of
        # ndarrays is converted element by element and is very slow for large vocabularies.
        weight = np.asarray([self.embedding[word] for word in self.words], dtype=np.float32)

        layer = nn.Embedding.from_pretrained(t.from_numpy(weight), freeze=False) #, padding_idx=0

        return layer

    def _register_word_(self, word):
        """Give `word` the next free index in self.dictionary / self.words."""
        self.dictionary[word] = len(self.words)
        self.words.append(word)
        assert len(self.dictionary) == len(self.words)

    def _add_word_(self, sentence):
        """Extend the vocabulary with the words of `sentence`.

        para sentence : a string to be tokenized by nltk.tokenize.word_tokenize

        Words with a pre-trained vector get an index the first time they are seen.
        Words without one are only counted in self.no_pretrained; until such a word
        has been seen opt.word_independence times it has no index of its own, so
        tokenize() maps it to "<UNKNOWN>" and all such low-frequency words share
        that single embedding.
        """
        for word in word_tokenize(sentence):
            word = word.lower()
            if word in self.dictionary:
                continue
            if word in self.embedding:
                # pre-trained vector available: add this word into self.dictionary and self.words
                self._register_word_(word)
                continue

            # reached only when there is no pre-trained embedding for the word
            self.no_pretrained[word] = self.no_pretrained.get(word, 0) + 1
            # if this no-pretrained word appears for at least opt.word_independence times
            # set an independent embedding for it, initialised from U(-epsilon, epsilon)
            if self.no_pretrained[word] >= opt.word_independence:
                self.no_pretrained.pop(word)
                self._register_word_(word)
                self.embedding[word] = np.random.uniform(-opt.epsilon, opt.epsilon, opt.hidden_size)
        return

In [7]:
# One-time setup: download the NLTK 'punkt' tokenizer data into the local nltk_data folder.
# NOTE(review): presumably this is the data word_tokenize needs -- confirm for the installed
# NLTK version. This cell has to run before any cell that tokenizes text.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SEONGGYUN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [14]:
if __name__=='__main__':
    # Smoke test of the Emb pipeline on one sample sentence.
    emb = Emb()
    sentence = 'All the appetizers and salads were fabulous, the steak was mouth watering and the pasta was delicious!!!'
    # Before _add_word_ the vocabulary holds only the two special tokens,
    # so every word is reported as <UNKNOWN> (index 1).
    print(emb.tokenize(sentence))
    emb._add_word_(sentence)
    # After registering the sentence its words have their own indices.
    print(emb.tokenize(sentence))
    # Build the layer once and inspect it, instead of rebuilding it for each attribute.
    layer = emb._make_layer_()
    print(layer.embedding_dim)
    print(layer.num_embeddings)

100%|██████████| 100000/100000 [00:00<00:00, 180598.87it/s]
100%|██████████| 100000/100000 [00:06<00:00, 15861.31it/s]


Embedding : successfully input 100000 pretrained word embeddings while 0 failed
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])
tensor([ 2,  3,  4,  5,  6,  7,  8,  9,  3, 10, 11, 12, 13,  5,  3, 14, 11, 15,
        16, 16, 16,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
       