In [None]:
!pip install ratsnlp

In [2]:
from Korpora import Korpora
nsmc = Korpora.load('nsmc', force_download=True)


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/



[nsmc] download ratings_train.txt: 14.6MB [00:00, 86.4MB/s]                            
[nsmc] download ratings_test.txt: 4.90MB [00:00, 36.8MB/s]                           


In [5]:
import os

def write_lines(path, lines):
  with open(path, 'w', encoding='utf-8') as f:
    for line in lines:
      f.write(f'{line}\n')

write_lines('/root/train.txt', nsmc.train.get_all_texts())
write_lines('/root/test.txt', nsmc.get_all_texts())

In [6]:
os.makedirs('/content/drive/MyDrive/nlpbook/bbpe', exist_ok=True)

In [8]:
from tokenizers import ByteLevelBPETokenizer

bytebpe_tokenizer = ByteLevelBPETokenizer()
bytebpe_tokenizer.train(
    files=['/root/train.txt', '/root/test.txt'],
    vocab_size=10000,
    special_tokens=['[PAD']
)

bytebpe_tokenizer.save_model('/content/drive/MyDrive/nlpbook/bbpe')

['/content/drive/MyDrive/nlpbook/bbpe/vocab.json',
 '/content/drive/MyDrive/nlpbook/bbpe/merges.txt']

In [9]:
os.makedirs('/content/drive/MyDrive/nlpbook/wordpiece', exist_ok=True)

In [10]:
from tokenizers import BertWordPieceTokenizer
wordpiece_tokenizer = BertWordPieceTokenizer(lowercase=False)
wordpiece_tokenizer.train(
    files=['/root/train.txt', '/root/test.txt'],
    vocab_size=10000
)
wordpiece_tokenizer.save_model('/content/drive/MyDrive/nlpbook/wordpiece')

['/content/drive/MyDrive/nlpbook/wordpiece/vocab.txt']

In [11]:
from transformers import GPT2Tokenizer

tokenizer_gpt = GPT2Tokenizer.from_pretrained('/content/drive/MyDrive/nlpbook/bbpe')
tokenizer_gpt.pad_token = '[PAD]'

file /content/drive/MyDrive/nlpbook/bbpe/config.json not found


In [16]:
sentences = [
             '아 더빙.. 진짜 짜증나네요 목소리',
             '흠... 포스터보고 초딩영환줄... 오버연기조차 가볍지 않구나',
             '별루 였다..'
             ]

tokenized_sentences = [tokenizer_gpt.tokenize(sentence) for sentence in sentences]
tokenized_sentences

[['ìķĦ', 'ĠëįĶë¹Ļ', '..', 'Ġì§Ħì§ľ', 'Ġì§ľì¦ĿëĤĺ', 'ëĦ¤ìļĶ', 'Ġëª©ìĨĮë¦¬'],
 ['íĿł',
  '...',
  'Ġíı¬ìĬ¤íĦ°',
  'ë³´ê³ł',
  'Ġì´ĪëĶ©',
  'ìĺģíĻĺ',
  'ì¤Ħ',
  '...',
  'Ġìĺ¤ë²Ħ',
  'ìĹ°ê¸°',
  'ì¡°ì°¨',
  'Ġê°Ģë³į',
  'ì§Ģ',
  'ĠìķĬ',
  'êµ¬ëĤĺ'],
 ['ë³Ħë£¨', 'Ġìĺ', 'Ģëĭ¤', '..']]

In [18]:
batch_inputs = tokenizer_gpt(
    sentences,
    padding='max_length',
    max_length=12,
    truncation=True,
)

TypeError: ignored

In [19]:
from transformers import BertTokenizer
tokenizer_bert = BertTokenizer.from_pretrained('/content/drive/MyDrive/nlpbook/wordpiece', do_lower_case=False)

file /content/drive/MyDrive/nlpbook/wordpiece/config.json not found


In [20]:
sentences = [
             '아 더빙.. 진짜 짜증나네요 목소리',
             '흠... 포스터보고 초딩영환줄... 오버연기조차 가볍지 않구나',
             '별루 였다..'
             ]

tokenized_sentences = [tokenizer_bert.tokenize(sentence) for sentence in sentences]
tokenized_sentences

[['아', '더빙', '.', '.', '진짜', '짜증나', '##네요', '목소리'],
 ['흠',
  '.',
  '.',
  '.',
  '포스터',
  '##보고',
  '초딩',
  '##영환',
  '##줄',
  '.',
  '.',
  '.',
  '오버',
  '##연기',
  '##조차',
  '가볍',
  '##지',
  '않',
  '##구나'],
 ['별루', '였다', '.', '.']]

In [21]:
batch_inputs = tokenizer_bert(
    sentences,
    padding='max_length',
    max_length=12,
    truncation=True,
)

In [22]:
batch_inputs

{'input_ids': [[2, 621, 2635, 16, 16, 1993, 3680, 1990, 3386, 3, 0, 0], [2, 997, 16, 16, 16, 2603, 2045, 2801, 4928, 1326, 16, 3], [2, 3303, 9373, 16, 16, 3, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]}