# 어휘 집합 구축하기

In [2]:
# Install the third-party packages for this notebook via IPython shell escapes.
# NOTE(review): the recorded output below reports "Cannot uninstall 'PyYAML'"
# while collecting ratsnlp — confirm that ratsnlp actually got installed.
!pip install ratsnlp
!pip install Korpora

Collecting ratsnlp

ERROR: Cannot uninstall 'PyYAML'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.



  Using cached ratsnlp-1.0.52-py3-none-any.whl (42 kB)
Collecting flask>=1.1.4
  Using cached Flask-2.2.3-py3-none-any.whl (101 kB)
Collecting pytorch-lightning==1.6.1
  Using cached pytorch_lightning-1.6.1-py3-none-any.whl (582 kB)
Collecting transformers==4.10.0
  Using cached transformers-4.10.0-py3-none-any.whl (2.8 MB)
Collecting Korpora>=0.2.0
  Using cached Korpora-0.2.0-py3-none-any.whl (57 kB)
Collecting flask-ngrok>=0.0.25
  Using cached flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Collecting flask-cors>=3.0.10
  Using cached Flask_Cors-3.0.10-py2.py3-none-any.whl (14 kB)
Collecting Jinja2>=3.0
  Using cached Jinja2-3.1.2-py3-none-any.whl (133 kB)
Collecting itsdangerous>=2.0
  Using cached itsdangerous-2.1.2-py3-none-any.whl (15 kB)
Collecting Werkzeug>=2.2.2
  Using cached Werkzeug-2.2.3-py3-none-any.whl (233 kB)
Collecting typing-extensions>=4.0.0
  Using cached typing_extensions-4.5.0-py3-none-any.whl (27 kB)
Collecting torchmetrics>=0.4.1
  Using cached torchmetrics-0.1

Collecting Korpora
  Using cached Korpora-0.2.0-py3-none-any.whl (57 kB)
Collecting dataclasses>=0.6
  Using cached dataclasses-0.6-py3-none-any.whl (14 kB)
Installing collected packages: dataclasses, Korpora
Successfully installed Korpora-0.2.0 dataclasses-0.6


In [3]:
from Korpora import Korpora

# Load the 'nsmc' corpus through Korpora and keep the handle in `nsmc`.
# force_download=True presumably re-fetches the files even when a local copy
# already exists — TODO confirm against the Korpora documentation.
nsmc = Korpora.load(
    'nsmc',
    force_download=True,
)


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/



[nsmc] download ratings_train.txt: 14.6MB [00:00, 77.2MB/s]                                                            
[nsmc] download ratings_test.txt: 4.90MB [00:00, 50.6MB/s]


In [4]:
import os 

def write_lines(path, lines):
    """Write every item of *lines* to *path*, one item per line, as UTF-8.

    The file is opened in 'w' mode, so any existing content is replaced.
    Each item is rendered with an f-string and followed by a newline.

    Args:
        path: Destination file path handed to open().
        lines: Iterable of items to write.
    """
    with open(path, mode='w', encoding='utf-8') as sink:
        sink.writelines(f'{entry}\n' for entry in lines)
# Dump the texts of the train split and then the test split of the loaded
# corpus into plain-text files, one text per line.
for out_name, split in (('train.txt', nsmc.train), ('test.txt', nsmc.test)):
    write_lines(out_name, split.get_all_texts())

# GPT 모델

## 1. BPE 어휘 집합 구축

In [6]:
from tokenizers import ByteLevelBPETokenizer

# Train a byte-level BPE vocabulary on both dumped text files and save the
# resulting model files under nlpbook/bbpe.
bbpe_dir = 'nlpbook/bbpe'
os.makedirs(bbpe_dir, exist_ok=True)

bytebpe_tokenizer = ByteLevelBPETokenizer()
bytebpe_tokenizer.train(
    files=['train.txt', 'test.txt'],
    vocab_size=10000,
    special_tokens=["[PAD]"],  # the GPT-2 tokenizer cell later uses '[PAD]' as pad_token
)
# Last expression of the cell, so the notebook echoes the list of saved files.
bytebpe_tokenizer.save_model(bbpe_dir)

['nlpbook/bbpe\\vocab.json', 'nlpbook/bbpe\\merges.txt']

## 2. 토크나이저 선언 및 토큰화

In [9]:
from transformers import GPT2Tokenizer

# Build a GPT-2 tokenizer from the files stored under nlpbook/bbpe, then
# declare '[PAD]' as its padding token so that padding='max_length' can be used.
tokenizer_gpt = GPT2Tokenizer.from_pretrained('nlpbook/bbpe')
tokenizer_gpt.pad_token='[PAD]'

In [12]:
from itertools import islice

# Example sentences: lines 100-102 (0-based) of test.txt.
# islice streams only the needed prefix of the file instead of loading every
# line with readlines() and slicing afterwards; the resulting list is the same.
# NOTE(review): each line keeps its trailing '\n'. The 199 that precedes the
# padding in the recorded GPT input_ids is presumably that newline token —
# consider stripping it if that is not intended.
with open('test.txt', 'r', encoding='utf-8') as f:
    sentences = list(islice(f, 100, 103))

# Tokenize the batch so that every sentence yields exactly 12 ids.
batch_inputs = tokenizer_gpt(
    sentences,
    padding="max_length",  # pad shorter sentences up to max_length
    max_length=12,
    truncation=True,  # allow longer sentences to be cut off at max_length
)

## 3. 실행 결과
> *두 가지 입력값(input_ids, attention_mask)이 만들어짐*

### input_ids -> 토큰화 결과

In [14]:
# Display the per-sentence token-id lists produced by the tokenizer_gpt call above.
batch_inputs['input_ids']

[[652, 6353, 1433, 7738, 2033, 3856, 9560, 1649, 14, 199, 0, 0],
 [1717, 456, 6316, 1484, 199, 0, 0, 0, 0, 0, 0, 0],
 [5999, 337, 534, 3306, 14, 302, 9936, 264, 4480, 311, 634, 432]]

### attention_mask -> 일반 토큰과 패딩 토큰 구분

In [15]:
# Display the attention masks; in the recorded output the 0 entries line up
# with the padded (id 0) positions of input_ids.
batch_inputs['attention_mask']

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
 [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]

## 1. BERT에서 사용하는 WordPieceTokenizer 어휘 집합 구축

In [7]:
from tokenizers import BertWordPieceTokenizer

# Train a WordPiece vocabulary (lowercasing disabled) on both dumped text
# files and save it under nlpbook/wordpiece.
wordpiece_dir = 'nlpbook/wordpiece'
os.makedirs(wordpiece_dir, exist_ok=True)

wordpiece_tokenizer = BertWordPieceTokenizer(lowercase=False)
wordpiece_tokenizer.train(files=['train.txt', 'test.txt'], vocab_size=10000)
# Last expression of the cell, so the notebook echoes the list of saved files.
wordpiece_tokenizer.save_model(wordpiece_dir)

['nlpbook/wordpiece\\vocab.txt']

## 2. 토크나이저 선언 및 토큰화

In [16]:
from transformers import BertTokenizer

# Build a BERT tokenizer from the vocabulary stored under nlpbook/wordpiece/;
# do_lower_case=False mirrors lowercase=False used when the vocabulary was trained.
tokenizer_bert = BertTokenizer.from_pretrained(
    'nlpbook/wordpiece/',
    do_lower_case=False,
)

In [18]:
# Encode the same example sentences with the BERT tokenizer, padding or
# truncating every sentence to exactly 12 ids. This rebinds batch_inputs.
batch_inputs = tokenizer_bert(sentences, padding='max_length', max_length=12, truncation=True)

## 3. 실행 결과
> *세 가지 입력값(input_ids, attention_mask, token_type_ids)이 만들어짐*

### input_ids -> 토큰화 결과 (시작: 2 = [CLS], 끝: 3 = [SEP])

In [19]:
# Display the per-sentence token-id lists produced by the tokenizer_bert call above.
batch_inputs['input_ids']

[[2, 2741, 1025, 444, 9176, 2589, 3216, 8492, 2421, 16, 3, 0],
 [2, 2087, 4893, 1073, 2325, 3, 0, 0, 0, 0, 0, 0],
 [2, 3366, 16, 16, 16, 1988, 3360, 16, 1979, 9086, 16, 3]]

### attention_mask -> 일반 토큰과 패딩 토큰 구분

In [20]:
# Display the attention masks; in the recorded output the 0 entries line up
# with the padded (id 0) positions of input_ids.
batch_inputs['attention_mask']

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
 [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]

In [42]:
# Re-run of the BERT tokenization; the arguments are identical to the earlier
# tokenizer_bert call, so batch_inputs is rebuilt with the same settings.
batch_inputs = tokenizer_bert(
    sentences,
    padding='max_length',
    max_length=12,
    truncation=True
)

In [43]:
# Display the token ids again; the recorded output matches the earlier BERT input_ids.
batch_inputs['input_ids']

[[2, 2741, 1025, 444, 9176, 2589, 3216, 8492, 2421, 16, 3, 0],
 [2, 2087, 4893, 1073, 2325, 3, 0, 0, 0, 0, 0, 0],
 [2, 3366, 16, 16, 16, 1988, 3360, 16, 1979, 9086, 16, 3]]

### token_type_ids -> 세그먼트 정보 (첫 번째 문장인지, 두 번째 문장인지)

In [27]:
# Display the segment ids of the encoded batch.
# NOTE(review): the recorded output has a single row of twelve 0s although
# three sentences are tokenized above, and this cell's counter (In [27]) is
# lower than the preceding cells' — it looks stale; re-run to refresh it.
batch_inputs['token_type_ids']

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]