In [None]:
from Korpora import Korpora
from konlpy.tag import Okt, Kkma
import spacy
from collections import Counter

import torch
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import re

## 1. 데이터 불러오기

In [None]:
# Load the NSMC (Naver sentiment movie corpus) dataset through Korpora
nsmc = Korpora.load("nsmc")


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at /Users/anhyojun/Korpora/nsmc/ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at /Users/anhy

In [None]:
# Keep separate handles on the train and test portions of the corpus
train_data = nsmc.train
test_data = nsmc.test

## 2. 단어 사전 생성

In [None]:
# Stopword removal helper
def remove_stopwords(tokens, stopwords):
    """Return the tokens that are not stopwords, preserving their order.

    Args:
        tokens: Iterable of token strings.
        stopwords: Collection of stopword strings (list, tuple, set, ...).

    Returns:
        list: The tokens from ``tokens`` that do not appear in ``stopwords``.
    """
    # A membership test against a list scans the whole list for every token;
    # build a set once so each lookup is a hash probe instead.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = set(stopwords)
    return [token for token in tokens if token not in stopwords]

# Punctuation removal helper
def remove_punctuation(tokens):
    """Drop tokens that do not start with a word character.

    Args:
        tokens: Iterable of token strings.

    Returns:
        list: The tokens whose first character is a word character
        (letter, digit, underscore) or a Hangul syllable, in input order.
    """
    # re.match only matches at the beginning of the string, so a token is kept
    # when its first character is in the class. The Hangul range now ends at
    # 힣 (U+D7A3), the last precomposed syllable, instead of stopping short.
    return [token for token in tokens if re.match(r'[\w가-힣]+', token)]

def build_voca(texts, stopwords='stopword.txt', tokenizer=None, min_freq=1):
    """Build a token-to-index vocabulary from raw texts.

    Every text is tokenized, stopwords and punctuation tokens are removed with
    ``remove_stopwords`` / ``remove_punctuation``, and each remaining distinct
    token receives an integer index in first-seen order.

    Args:
        texts: Iterable of raw text strings.
        stopwords: Path to a UTF-8 text file with one stopword per line.
        tokenizer: Callable mapping a string to a list of tokens. When omitted,
            ``Okt().morphs`` is created on first use (not at definition time).
        min_freq: Minimum number of occurrences a token needs to enter the
            vocabulary. The default of 1 keeps every token.

    Returns:
        dict: ``{'<PAD>': 0, '<UNK>': 1, token: index, ...}`` with token
        indices starting at 2.
    """
    if tokenizer is None:
        # Deferred so that merely defining this function does not start the tagger
        tokenizer = Okt().morphs

    with open(stopwords, 'r', encoding='utf-8') as f:
        # A set makes the per-token stopword lookup cheap
        stopword_set = set(f.read().splitlines())

    counter = Counter()
    for text in texts:
        # Remove stopwords first, then punctuation tokens
        tokens = remove_stopwords(tokenizer(text), stopword_set)
        counter.update(remove_punctuation(tokens))

    vocab = {'<PAD>': 0, '<UNK>': 1}
    # Counter keeps insertion order, so indices follow first appearance
    kept_words = (word for word, freq in counter.items() if freq >= min_freq)
    vocab.update({word: idx for idx, word in enumerate(kept_words, start=2)})

    return vocab

In [None]:
# Separate the review texts from their labels
# NOTE(review): these come from the test split (test_data), and the vocabulary
# is later built from `texts` — confirm that using test data here is intended.
texts = test_data.texts
labels = test_data.labels

In [None]:
# Build the token -> index vocabulary from the selected texts
voca = build_voca(texts)

In [None]:
# Show the vocabulary size and a small sample instead of dumping every entry
print(len(voca))
list(voca.items())[:20]

{'<PAD>': 0, '<UNK>': 1, '굳': 2, 'ㅋ': 3, 'GDNTOPCLASSINTHECLUB': 4, '뭐': 5, '야': 6, '평점': 7, '나쁘진': 8, '않지만': 9, '10': 10, '점': 11, '짜': 12, '리': 13, '더': 14, '아니잖아': 15, '지루하지는': 16, '않은데': 17, '완전': 18, '막장': 19, '돈': 20, '주고': 21, '보기': 22, '에는': 23, '3': 24, 'D': 25, '아니었어도': 26, '별': 27, '다섯': 28, '줬을텐데': 29, '왜': 30, '나와서': 31, '제': 32, '심기': 33, '불편하게': 34, '하죠': 35, '음악': 36, '주가': 37, '최고': 38, '영화': 39, '진정한': 40, '쓰레기': 41, '미국': 42, '애니': 43, '튀어나온듯': 44, '창의력': 45, '없는': 46, '로봇': 47, '디자인': 48, '부터가': 49, '고개': 50, '젖게': 51, '한다': 52, '갈수록': 53, '개판': 54, '되가는': 55, '중국영화': 56, '유치하고': 57, '내용': 58, '없음': 59, '폼': 60, '잡다': 61, '끝남': 62, '안되는': 63, '무기': 64, '유치한': 65, 'cg': 66, '남무': 67, '그립다': 68, '동사서독': 69, '같은': 70, '류': 71, '류작': 72, '이별': 73, '아픔': 74, '뒤': 75, '찾아오는': 76, '새로운': 77, '인연': 78, '기쁨': 79, 'But': 80, '모든': 81, '사람': 82, '그렇지는': 83, '않네': 84, '괜찮네요': 85, '오랜': 86, '포켓몬스터': 87, '잼밌': 88, '한국': 89, '독립영화': 90, '한계': 91, '그렇게': 92, '아버지': 93, '된다와': 94, '

## 3. 토큰 텐서화

In [None]:
def make_tensor_token(texts, stopwords='stopword.txt', vocab=None, tokenizer=None):
    """Convert raw texts into a list of 1-D tensors of vocabulary indices.

    Args:
        texts: Iterable of raw text strings.
        stopwords: Path to a UTF-8 text file with one stopword per line.
        vocab: Mapping from token to index that contains an ``'<UNK>'`` entry.
            When omitted, the notebook-level ``voca`` is used.
        tokenizer: Callable mapping a string to a list of tokens. When omitted,
            ``Okt().morphs`` is used.

    Returns:
        list: One ``torch.long`` tensor per input text, holding the index of
        each token left after stopword and punctuation removal. Tokens missing
        from the vocabulary map to the ``'<UNK>'`` index.
    """
    if vocab is None:
        vocab = voca  # fall back to the notebook-level vocabulary
    if tokenizer is None:
        # Create the tagger once, outside the per-text loop
        tokenizer = Okt().morphs
    unk_index = vocab['<UNK>']

    with open(stopwords, 'r', encoding='utf-8') as f:
        # A set makes the per-token stopword lookup cheap
        stopword_set = set(f.read().splitlines())

    tensor_list = []
    for text in texts:
        # Remove stopwords first, then punctuation tokens
        clean_tokens = remove_stopwords(tokenizer(text), stopword_set)
        clean_tokens = remove_punctuation(clean_tokens)

        # Known tokens map to their index, unknown ones to the <UNK> index
        indexed_token = [vocab.get(token, unk_index) for token in clean_tokens]
        tensor_list.append(torch.tensor(indexed_token, dtype=torch.long))

    return tensor_list

In [None]:
tensor_token_list = make_tensor_token(texts)
# Preview only the first few tensors; the full list holds one tensor per text
tensor_token_list[:5]

[tensor([2, 3]),
 tensor([4]),
 tensor([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]),
 tensor([16, 17, 18, 19, 20, 21, 22, 23]),
 tensor([24, 25, 26, 27, 28, 29, 30, 24, 25, 31, 32, 33, 34, 35]),
 tensor([36, 37, 38, 36, 39]),
 tensor([40, 41]),
 tensor([42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52]),
 tensor([53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
         39, 24, 71, 72]),
 tensor([73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84]),
 tensor([85, 86, 87, 88]),
 tensor([89, 90, 91, 92, 93, 94, 95]),
 tensor([ 96,  97,  98,  99, 100, 101, 102,  98, 103, 104, 105, 106, 107,  70,
         108,  39]),
 tensor([109, 110, 111, 112,  39, 113, 114, 115]),
 tensor([116, 117, 118, 119,  39, 120, 121, 122,  39, 123,  39, 124, 125, 126,
          39, 127, 128, 129, 130,  39, 131,  70, 132, 133]),
 tensor([134,   7,   5]),
 tensor([38]),
 tensor([135, 136, 137, 138, 139, 140, 118, 137, 141, 142, 137, 143]),
 tensor([144]),
 tensor([ 27, 145, 146, 147, 148, 149, 150, 151

## 4. 토큰 패딩

In [None]:
# Pad or truncate a 1-D token tensor to a fixed length
def pad_sequence(tensor_token, max_length, padding_token=0, cut_front=False):
    """Return ``tensor_token`` padded or truncated to exactly ``max_length``.

    Args:
        tensor_token: 1-D tensor of token indices.
        max_length: Target length; must be non-negative.
        padding_token: Value used to fill the missing positions.
        cut_front: When falsy (default), padding is appended and truncation
            drops trailing elements. When truthy, padding is prepended and
            truncation drops leading elements.

    Returns:
        torch.Tensor: Tensor of length ``max_length`` with the same dtype and
        device as ``tensor_token``.

    Raises:
        ValueError: If ``max_length`` is negative.
    """
    if max_length < 0:
        raise ValueError(f"max_length must be non-negative, got {max_length}")

    length = len(tensor_token)

    # Long enough already: truncate (or return unchanged when lengths match)
    if length >= max_length:
        if cut_front:
            # Use a non-negative start index; a negative-offset slice would
            # return the whole tensor when max_length is 0.
            return tensor_token[length - max_length:]
        return tensor_token[:max_length]

    # Too short: new_full keeps the input's dtype and device, so torch.cat
    # neither promotes the dtype nor fails on a device mismatch.
    tensor_pad = tensor_token.new_full((max_length - length,), padding_token)
    if cut_front:
        return torch.cat((tensor_pad, tensor_token))
    return torch.cat((tensor_token, tensor_pad))

In [None]:
# Pad/truncate every token tensor to one common length so they can be stacked
MAX_LENGTH = 10
padded_token_list = [pad_sequence(token, MAX_LENGTH) for token in tensor_token_list]
padded_token_tensor = torch.stack(padded_token_list)

In [None]:
# Inspect the stacked tensor (one row per padded token tensor)
padded_token_tensor

tensor([[    2,     3,     0,  ...,     0,     0,     0],
        [    4,     0,     0,  ...,     0,     0,     0],
        [    5,     6,     7,  ...,    12,    13,    14],
        ...,
        [  521,   429,  4765,  ..., 55673,  3609,     0],
        [ 3373,  1410,  3580,  ...,   744,  2067,  4949],
        [ 2145,   460,    30,  ...,     0,     0,     0]])

## 5. 커스텀 데이터셋 생성

In [None]:
class TextDataset(Dataset):
    """Dataset that pairs padded token tensors with their labels.

    Args:
        padded_token_tensor_texts: Indexable, sized collection of token
            tensors (e.g. a stacked 2-D tensor), one entry per sample.
        labels: Indexable, sized collection of labels, one per sample.

    Raises:
        ValueError: If the number of texts and labels differ.
    """

    def __init__(self, padded_token_tensor_texts, labels):
        # Fail early instead of hitting an IndexError mid-epoch
        if len(padded_token_tensor_texts) != len(labels):
            raise ValueError(
                "texts and labels must have the same length, got "
                f"{len(padded_token_tensor_texts)} and {len(labels)}"
            )
        self.texts = padded_token_tensor_texts
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token tensor, label tensor)`` for the sample at ``idx``."""
        tensor_text = self.texts[idx]
        # as_tensor converts plain numbers and avoids the copy-construct
        # warning when the label is already a tensor
        tensor_label = torch.as_tensor(self.labels[idx], dtype=torch.long)
        return tensor_text, tensor_label

In [None]:
# Wrap the padded token tensor and the labels in the custom dataset
DS = TextDataset(padded_token_tensor, labels)

In [None]:
# Serve the dataset in batches of up to 10 samples
DL = DataLoader(DS, batch_size = 10)

In [None]:
# Print only the first batch as a sanity check, then stop iterating
for text, label in DL:
    print(text, label)
    break

tensor([[ 2,  3,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 4,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
        [16, 17, 18, 19, 20, 21, 22, 23,  0,  0],
        [24, 25, 26, 27, 28, 29, 30, 24, 25, 31],
        [36, 37, 38, 36, 39,  0,  0,  0,  0,  0],
        [40, 41,  0,  0,  0,  0,  0,  0,  0,  0],
        [42, 43, 44, 45, 46, 47, 48, 49, 50, 51],
        [53, 54, 55, 56, 57, 58, 59, 60, 61, 62],
        [73, 74, 75, 76, 77, 78, 79, 80, 81, 82]]) tensor([1, 0, 0, 0, 0, 1, 0, 0, 0, 1])


In [None]:
# Token tensor of the first sample, read through the loader's underlying dataset
DL.dataset[0][0]

tensor([2, 3, 0, 0, 0, 0, 0, 0, 0, 0])