In [18]:
import os
import gdown
import logging
import easydict
from collections import Counter

In [2]:
# Notebook-wide logger, named after the current module; build_vocab and
# load_vocab report vocabulary sizes and missing-file warnings through it.
# NOTE(review): no logging configuration is visible in this notebook, so the
# logger.info(...) messages are presumably hidden unless a level/handler is
# set elsewhere (e.g. logging.basicConfig(level=logging.INFO)) - confirm.
logger = logging.getLogger(__name__)

In [29]:
# Root directory of the project. The default is the location this notebook was
# originally written against; set the KOREAN_NER_DIR environment variable to
# run the notebook on another machine without editing every path below.
PROJECT_DIR = os.environ.get(
    "KOREAN_NER_DIR",
    "C:/Users/USER/Desktop/workspace/practice/korean-ner",
)

# Run configuration shared by the functions in this notebook.
# build_vocab / load_vocab lower word_vocab_size and char_vocab_size in place
# when the actual vocabulary is smaller than the limits given here.
args = easydict.EasyDict({
    "data_dir": PROJECT_DIR + "/data",        # directory holding the raw data
    "data_name": "train.tsv",                 # file read from data_dir
    "vocab_dir": PROJECT_DIR + "/vocab",      # where the vocab files are written
    "wordvec_dir": PROJECT_DIR + "/wordvec",  # word-vector directory
    "w2v_file": "word_vector_300d.vec",       # word-vector file name
    "word_vocab_size": 400000,                # upper bound on word vocab size
    "char_vocab_size": 400000,                # upper bound on char vocab size
})

데이터 불러오기

In [4]:
def get_data(args):
    """Read the raw data file and return its lines.

    Args:
        args: Config object exposing ``data_dir`` and ``data_name``; the two
            are joined to form the path of the file that is read (UTF-8).

    Returns:
        list[str]: Every line of the file in order, each still carrying its
        line terminator (exactly what ``readlines`` yields).
    """
    data_path = os.path.join(args.data_dir, args.data_name)
    with open(data_path, 'r', encoding='utf-8') as f:
        # Hand the lines back to the caller instead of printing the whole
        # file: the caller can then inspect a slice (e.g. get_data(args)[:5])
        # rather than flooding the cell output.
        return f.readlines()

build_vocab

In [5]:
def _ordered_vocab(counts, specials=("PAD", "UNK")):
    """Return the special tokens followed by the counted tokens, most frequent first.

    A corpus token that is spelled exactly like a special token is dropped so
    that every entry in the vocabulary is unique.
    """
    return list(specials) + [
        token for token, _ in counts.most_common() if token not in specials
    ]


def _write_vocab_file(path, tokens):
    """Write ``tokens`` to ``path`` as UTF-8 text, one token per line."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(token + "\n" for token in tokens)


def build_vocab(args):
    """Build word and character vocabularies from the data file and save them.

    Each non-blank line of ``data_dir/data_name`` must have the form
    ``sentence<TAB>tags``. The sentence is split on whitespace into words and
    every word into characters. Both vocabularies start with ``PAD`` and
    ``UNK`` and then list tokens from most to least frequent. They are written
    to ``vocab_dir/word_vocab`` and ``vocab_dir/char_vocab``, one token per
    line.

    Args:
        args: Config object exposing ``data_dir``, ``data_name``,
            ``vocab_dir``, ``word_vocab_size`` and ``char_vocab_size``.
            ``word_vocab_size`` / ``char_vocab_size`` are lowered in place
            when the built vocabulary is smaller than the configured limit.

    Raises:
        ValueError: If a non-blank line does not contain exactly one tab.
    """
    data_path = os.path.join(args.data_dir, args.data_name)

    # Count words and characters in a single pass over the file. Only the
    # sentence column is needed; the tag column is validated but not stored.
    word_counts, char_counts = Counter(), Counter()
    with open(data_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                # A blank line (e.g. a trailing newline at the end of the
                # file) carries no data; skip it instead of failing.
                continue
            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    "{}: line {} must be 'sentence<TAB>tags', got {} field(s)".format(
                        data_path, line_no, len(fields)))
            words = fields[0].split()
            word_counts.update(words)
            for word in words:
                char_counts.update(word)  # iterating a str yields its characters

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(args.vocab_dir, exist_ok=True)

    word_vocab = _ordered_vocab(word_counts)
    logger.info("Total word vocabulary size: {}".format(len(word_vocab)))
    _write_vocab_file(os.path.join(args.vocab_dir, "word_vocab"), word_vocab)

    char_vocab = _ordered_vocab(char_counts)
    logger.info("Total char vocabulary size: {}".format(len(char_vocab)))
    _write_vocab_file(os.path.join(args.vocab_dir, "char_vocab"), char_vocab)

    # Set the exact vocab size: if the built vocabulary is smaller than the
    # configured limit, shrink the limit to the real size. Each file holds one
    # line per token, so the list length equals the file's line count.
    args.word_vocab_size = min(len(word_vocab), args.word_vocab_size)
    args.char_vocab_size = min(len(char_vocab), args.char_vocab_size)

    logger.info("args.word_vocab_size: {}".format(args.word_vocab_size))
    logger.info("args.char_vocab_size: {}".format(args.char_vocab_size))


load_vocab

In [12]:
def _read_vocab_file(path, limit):
    """Load at most ``limit`` tokens from a one-token-per-line vocab file.

    Returns:
        tuple: ``(token_to_id, ids_to_tokens, size)`` where ``size`` is the
        number of lines actually used (``min(lines in file, limit)``).
    """
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    size = min(len(lines), limit)

    ids_to_tokens = [line.strip() for line in lines[:size]]
    token_to_id = {}
    for idx, token in enumerate(ids_to_tokens):
        # Keep the first id seen for a token. If the file repeats a token
        # (e.g. a corpus word spelled "PAD"), the reserved entries at the top
        # of the file must keep their low indices.
        token_to_id.setdefault(token, idx)
    return token_to_id, ids_to_tokens, size


def load_vocab(args):
    """Load the word and character vocabularies from ``args.vocab_dir``.

    Reads ``vocab_dir/word_vocab`` and ``vocab_dir/char_vocab`` (one token
    per line; the line index is the token id), keeping at most
    ``word_vocab_size`` / ``char_vocab_size`` entries respectively.

    Args:
        args: Config object exposing ``vocab_dir``, ``word_vocab_size`` and
            ``char_vocab_size``. The two sizes are lowered in place when the
            corresponding file has fewer lines than the configured limit.

    Returns:
        tuple | None: ``(word_vocab, word_ids_to_tokens, char_vocab,
        char_ids_to_tokens)``, where each ``*_vocab`` is a token-to-id dict
        and each ``*_ids_to_tokens`` is the id-ordered token list.
        Returns ``None`` (after logging a warning) when either vocab file
        does not exist.
    """
    word_vocab_path = os.path.join(args.vocab_dir, "word_vocab")
    char_vocab_path = os.path.join(args.vocab_dir, "char_vocab")

    if not os.path.exists(word_vocab_path):
        logger.warning("Please build word vocab first!!!")
        return None

    if not os.path.exists(char_vocab_path):
        logger.warning("Please build char vocab first!!!")
        return None

    word_vocab, word_ids_to_tokens, args.word_vocab_size = _read_vocab_file(
        word_vocab_path, args.word_vocab_size)
    char_vocab, char_ids_to_tokens, args.char_vocab_size = _read_vocab_file(
        char_vocab_path, args.char_vocab_size)

    return word_vocab, word_ids_to_tokens, char_vocab, char_ids_to_tokens