# Basic concepts of torchtext
`torchtext` takes a declarative approach to loading data: you describe how each field should be processed, and torchtext handles the rest.
* reference: [A Comprehensive Introduction to Torchtext](http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/)

![Alt text](https://i0.wp.com/mlexplained.com/wp-content/uploads/2018/02/%E3%82%B9%E3%82%AF%E3%83%AA%E3%83%BC%E3%83%B3%E3%82%B7%E3%83%A7%E3%83%83%E3%83%88-2018-02-07-10.32.59.png?resize=1024%2C481)

### Setup

In [1]:
import pandas as pd
from pathlib import Path
from pprint import pprint
from mecab import MeCab

In [2]:
# Load the tab-separated training file into a DataFrame.
# The file is named explicitly because `Path.iterdir()` yields directory
# entries in arbitrary order, so taking its first element could pick a
# different file depending on the machine or on what else sits in `data/`.
data_dir = Path.cwd() / 'data'
nsmc = pd.read_csv(data_dir / 'train.txt', sep='\t')

In [3]:
# Preview the first five rows (five is also the default for `head`).
nsmc.head(n=5)

Unnamed: 0,document,label
0,애들 욕하지마라 지들은 뭐 그렇게 잘났나? 솔까 거기 나오는 귀여운 애들이 당신들보...,1
1,여전히 반복되고 있는 80년대 한국 멜로 영화의 유치함.,0
2,쉐임리스 스티브와 피오나가 손오공 부르마로 ㅋㅋㅋ,0
3,0점은 없나요?...,0
4,제발 시즌2 ㅜㅜ,1


### Field

In [4]:
from torchtext.data import Field

# Text field: split into morphemes with MeCab, mapped through a vocabulary,
# and laid out batch-first with a fixed length of 32 tokens.
sentence = Field(
    tokenize=MeCab().morphs,
    sequential=True,
    use_vocab=True,
    fix_length=32,
    batch_first=True,
)
# Target field: a single value per example, no vocabulary, and therefore no
# unknown/padding tokens.
label = Field(
    is_target=True,
    sequential=False,
    use_vocab=False,
    unk_token=None,
    pad_token=None,
    batch_first=True,
)

In [5]:
# Show each special token of the two fields side by side, one kind per line.
for token_name in ('pad_token', 'unk_token', 'eos_token', 'init_token'):
    print(getattr(sentence, token_name), getattr(label, token_name))

<pad> None
<unk> None
None None
None None


In [6]:
# Take the text of the first row as a running example.
example_sentence = nsmc['document'].iloc[0]
print(example_sentence)

애들 욕하지마라 지들은 뭐 그렇게 잘났나? 솔까 거기 나오는 귀여운 애들이 당신들보다 훨 낮다.


In [7]:
# Apply the field's tokenizer (the callable passed as `tokenize=`) to the
# example sentence and show the resulting token list.
list_of_tokens = sentence.tokenize(example_sentence)
print(list_of_tokens)

['애', '들', '욕', '하', '지', '마', '라', '지', '들', '은', '뭐', '그렇게', '잘', '났', '나', '?', '솔', '까', '거기', '나오', '는', '귀여운', '애', '들', '이', '당신', '들', '보다', '훨', '낮', '다', '.']


### Vocab

In [8]:
import itertools
from collections import Counter
from torchtext.vocab import Vocab, build_vocab_from_iterator

In [9]:
# Tokenize every document; the result is a list with one token list per row.
list_of_tokenized = [sentence.tokenize(document) for document in nsmc['document']]

In [10]:
# Tally how often each token occurs across all tokenized documents.
count_tokens = Counter()
for tokens in list_of_tokenized:
    count_tokens.update(tokens)

In [11]:
# Build the vocabulary by hand from the token counts, passing a minimum
# frequency of 10, and attach it to the text field.
vocab = Vocab(counter=count_tokens, min_freq=10)
sentence.vocab = vocab

In [12]:
# Inspect the vocabulary: the example tokens, the first five entries of the
# index-to-string table with the vocabulary size, and the example converted
# to indices (numericalize takes a batch, hence the extra list).
print(list_of_tokens)
print(sentence.vocab.itos[:5], len(sentence.vocab))
print(sentence.numericalize([list_of_tokens]))

['애', '들', '욕', '하', '지', '마', '라', '지', '들', '은', '뭐', '그렇게', '잘', '났', '나', '?', '솔', '까', '거기', '나오', '는', '귀여운', '애', '들', '이', '당신', '들', '보다', '훨', '낮', '다', '.']
['<unk>', '<pad>', '.', '이', '는'] 8775
tensor([[ 178,   20,  427,    8,   22,  289,  106,   22,   20,   12,   99,  623,
           65,  683,   28,   26, 2402,  664,  833,  130,    4, 1161,  178,   20,
            3,  753,   20,  105, 1278,  261,    6,    2]])


In [13]:
# Drop the hand-built vocabulary before rebuilding it through the Field API.
sentence.vocab = None # reset

In [14]:
# Let the field build its own vocabulary from the tokenized documents, with
# the same minimum frequency as the hand-built one.
sentence.build_vocab(list_of_tokenized, min_freq=10)

In [15]:
# Inspect the vocabulary: the example tokens, the first five entries of the
# index-to-string table with the vocabulary size, and the example converted
# to indices (numericalize takes a batch, hence the extra list).
print(list_of_tokens)
print(sentence.vocab.itos[:5], len(sentence.vocab))
print(sentence.numericalize([list_of_tokens]))

['애', '들', '욕', '하', '지', '마', '라', '지', '들', '은', '뭐', '그렇게', '잘', '났', '나', '?', '솔', '까', '거기', '나오', '는', '귀여운', '애', '들', '이', '당신', '들', '보다', '훨', '낮', '다', '.']
['<unk>', '<pad>', '.', '이', '는'] 8775
tensor([[ 178,   20,  427,    8,   22,  289,  106,   22,   20,   12,   99,  623,
           65,  683,   28,   26, 2402,  664,  833,  130,    4, 1161,  178,   20,
            3,  753,   20,  105, 1278,  261,    6,    2]])


### Dataset

In [16]:
from torchtext.data import Dataset, Example, TabularDataset

In [17]:
# Build one Example from the first row; each value is paired with the
# (attribute name, Field) entry at the same position.
example = Example.fromlist(
    data=nsmc.iloc[0].tolist(),
    fields=[('document', sentence), ('label', label)],
)
print(example.document, example.label)

['애', '들', '욕', '하', '지', '마', '라', '지', '들', '은', '뭐', '그렇게', '잘', '났', '나', '?', '솔', '까', '거기', '나오', '는', '귀여운', '애', '들', '이', '당신', '들', '보다', '훨', '낮', '다', '.'] 1


In [18]:
# Generate one Example per DataFrame row.
# `itertuples(index=False, name=None)` yields plain tuples of the row values,
# which avoids constructing a pandas Series for every row as `iterrows` does.
list_of_examples = [
    Example.fromlist(list(record), fields=[('document', sentence), ('label', label)])
    for record in nsmc.itertuples(index=False, name=None)
]

In [19]:
# Show the first five Example objects, then the tokenized document and the
# label stored on the first one.
pprint(list_of_examples[:5])
print(list_of_examples[0].document, list_of_examples[0].label)

[<torchtext.data.example.Example object at 0x7efb6990b358>,
 <torchtext.data.example.Example object at 0x7efb6990b2e8>,
 <torchtext.data.example.Example object at 0x7efb6990b8d0>,
 <torchtext.data.example.Example object at 0x7efb6990b240>,
 <torchtext.data.example.Example object at 0x7efb6990b208>]
['애', '들', '욕', '하', '지', '마', '라', '지', '들', '은', '뭐', '그렇게', '잘', '났', '나', '?', '솔', '까', '거기', '나오', '는', '귀여운', '애', '들', '이', '당신', '들', '보다', '훨', '낮', '다', '.'] 1


In [20]:
# Wrap the prepared Examples in a Dataset, declaring which Field handles
# each attribute.
dataset = Dataset(
    examples=list_of_examples,
    fields=[('document', sentence), ('label', label)],
)

In [21]:
# Display the first five Examples held by the dataset.
dataset.examples[:5]

[<torchtext.data.example.Example at 0x7efb6990b358>,
 <torchtext.data.example.Example at 0x7efb6990b2e8>,
 <torchtext.data.example.Example at 0x7efb6990b8d0>,
 <torchtext.data.example.Example at 0x7efb6990b240>,
 <torchtext.data.example.Example at 0x7efb6990b208>]

In [22]:
# Display the dataset's fields (attribute name -> Field).
dataset.fields

{'document': <torchtext.data.field.Field at 0x7efbc741bef0>,
 'label': <torchtext.data.field.Field at 0x7efbc741be80>}

In [23]:
# Same pipeline through TabularDataset: define fresh fields, then let
# torchtext read and tokenize the TSV file itself (header row skipped).
sentence = Field(
    tokenize=MeCab().morphs,
    sequential=True,
    use_vocab=True,
    fix_length=32,
    batch_first=True,
)
label = Field(
    is_target=True,
    sequential=False,
    use_vocab=False,
    unk_token=None,
    pad_token=None,
    batch_first=True,
)

dataset = TabularDataset(
    path='data/train.txt',
    format='TSV',
    skip_header=True,
    fields=[('document', sentence), ('label', label)],
)

In [24]:
# Build the vocabulary directly from the dataset. No `min_freq` is passed
# here, unlike the earlier vocabulary-building cells.
sentence.build_vocab(dataset)

### Iterator

In [25]:
from torchtext.data import Iterator

In [26]:
# Create an iterator that serves the dataset in mini-batches of two examples.
iterator = Iterator(dataset, batch_size=2)

In [27]:
# Pull the first mini-batch and unpack it into the document tensor and the
# label tensor.
x_mb, y_mb = next(iter(iterator))

In [28]:
# Show the batch: token indices per document, followed by the labels.
print(x_mb, y_mb)

tensor([[14141,     2,    17,   880,     6,   259,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1],
        [   32,    43,     7,    67,   238,    10,   238,    14,    92,   628,
             4,   457,     6,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1]]) tensor([0, 1])
