## torchtext - korean

In [1]:
!pip install python-mecab-ko

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import urllib.request
import pandas as pd

In [3]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x7fcfa1890b20>)

In [4]:
train_df = pd.read_table('ratings_train.txt')
test_df = pd.read_table('ratings_test.txt')

In [5]:
train_df.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [6]:
train_df.dropna(inplace=True)

In [7]:
test_df.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0


In [8]:
test_df.dropna(inplace=True)

In [9]:
print('훈련 데이터 샘플의 개수 : {}'.format(len(train_df)))
print('테스트 데이터 샘플의 개수 : {}'.format(len(test_df)))

훈련 데이터 샘플의 개수 : 149995
테스트 데이터 샘플의 개수 : 49997


In [10]:
from torchtext import data # torchtext.data 임포트
from mecab import MeCab



In [11]:
# Mecab을 토크나이저로 사용

def tokenizer(text):
    mecab = MeCab()
    tokens = mecab.morphs(text)
    return tokens

In [12]:
from typing import List, Tuple

def yield_tokens(data_iter: pd.Series) -> List[str]:
    for text in data_iter:
        yield tokenizer(text)

In [13]:
from torchtext.vocab import build_vocab_from_iterator

text_vocab = build_vocab_from_iterator(yield_tokens(train_df['document']), specials=['<unk>'])
text_vocab.set_default_index(text_vocab['<unk>'])



In [14]:
import os
import random
import numpy as np
import torch
import torchvision

In [15]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

In [16]:
%matplotlib inline

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

In [17]:
import pandas as pd
import torch
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
from typing import List, Tuple

class NSMCDataset(Dataset):
    def __init__(self, df: pd.DataFrame, text_vocab):
        self.df = df
        self.text_vocab = text_vocab

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        item  =self.df.iloc[idx]
        return {
            'id': item['id'],
            'document': torch.tensor([self.text_vocab[token] for token in tokenizer(item['document'])]),
            'label': torch.tensor(int(item['label']))
        }  

In [18]:
from torch.utils.data import random_split

train_data = NSMCDataset(train_df, text_vocab)
test_data = NSMCDataset(test_df, text_vocab)

In [19]:
print('훈련 샘플의 개수 : {}'.format(len(train_data)))
print('테스트 샘플의 개수 : {}'.format(len(test_data)))

훈련 샘플의 개수 : 149995
테스트 샘플의 개수 : 49997


In [20]:
print(train_data[0])

{'id': 9976970, 'document': tensor([ 40,  85, 938,   1,   1,  47, 243,  27,  41, 766]), 'label': tensor(0)}


In [21]:
print('단어 집합의 크기 : {}'.format(len(text_vocab)))

단어 집합의 크기 : 53980


In [22]:
print(text_vocab.get_stoi())



In [23]:
print(text_vocab.get_itos())



In [24]:
batch_size = 5

In [25]:
def collate_batch(batch):
    label_list, text_list = [], []
    for item in batch:
        label_list.append(item['label'])
        text_list.append(item['document'])
    
    text_list = torch.nn.utils.rnn.pad_sequence(text_list, batch_first=True, padding_value=text_vocab['<pad>'])
    label_list = torch.tensor(label_list, dtype=torch.int64)
    return text_list, label_list


In [26]:
train_loader = DataLoader(dataset=train_data, batch_size = batch_size, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(dataset=test_data, batch_size = batch_size, shuffle=True, collate_fn=collate_batch)

In [27]:
print('훈련 데이터의 미니 배치 수 : {}'.format(len(train_loader)))
print('테스트 데이터의 미니 배치 수 : {}'.format(len(test_loader)))

훈련 데이터의 미니 배치 수 : 29999
테스트 데이터의 미니 배치 수 : 10000


In [28]:
for document, label in train_loader:
    print(document)
    print(label)
    break

tensor([[ 3542,     7,    17,  1997,    43,    66,     3,  1161,    15,   613,
          2225,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [   68,    14,     6,    32,   157,   305,    15,     4,     1,  6091,
          2418, 11686,    72,     2,     4,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  105,    24,    79,     1,   114,    79,     1,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     

In [29]:
print(document.shape)
print(label.shape)

torch.Size([5, 46])
torch.Size([5])
