# Basic concept of torchtext
`Torchtext` takes a declarative approach to loading its data: you declare how each field should be processed, and torchtext handles the loading for you.
* reference: [A Comprehensive Introduction to Torchtext](http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/)

![Alt text](https://i0.wp.com/mlexplained.com/wp-content/uploads/2018/02/%E3%82%B9%E3%82%AF%E3%83%AA%E3%83%BC%E3%83%B3%E3%82%B7%E3%83%A7%E3%83%83%E3%83%88-2018-02-07-10.32.59.png?resize=1024%2C481)

### Setup

In [1]:
import pandas as pd
from pathlib import Path
from pprint import pprint
from mecab import MeCab

In [2]:
# Load the training file by name instead of taking whichever entry the
# directory listing returns first: iterdir() yields entries in arbitrary
# order, so the previous lookup could silently read a different file.
data_dir = Path.cwd() / 'data'
nsmc = pd.read_csv(data_dir / 'train.txt', sep='\t')

In [3]:
# Preview the first rows of the loaded frame.
nsmc.head()

Unnamed: 0,document,label
0,애들 욕하지마라 지들은 뭐 그렇게 잘났나? 솔까 거기 나오는 귀여운 애들이 당신들보...,1
1,여전히 반복되고 있는 80년대 한국 멜로 영화의 유치함.,0
2,쉐임리스 스티브와 피오나가 손오공 부르마로 ㅋㅋㅋ,0
3,0점은 없나요?...,0
4,제발 시즌2 ㅜㅜ,1


### Field
- api guide : https://torchtext.readthedocs.io/en/latest/data.html#field

In [4]:
from torchtext.data import Field

# Text field: sequential input tokenized with MeCab morphemes, mapped through
# a vocabulary, batch dimension first, fix_length set to 32.
sentence_field = Field(
    sequential=True,
    use_vocab=True,
    tokenize=MeCab().morphs,
    batch_first=True,
    fix_length=32,
)
# Label field: a single non-sequential value used as the target; no vocabulary
# and no unk/pad tokens are configured for it.
label_field = Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
    unk_token=None,
    pad_token=None,
    is_target=True,
)

In [5]:
# List the attribute and method names available on the text field.
dir(sentence_field)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'batch_first',
 'build_vocab',
 'dtype',
 'dtypes',
 'eos_token',
 'fix_length',
 'ignore',
 'include_lengths',
 'init_token',
 'is_target',
 'lower',
 'numericalize',
 'pad',
 'pad_first',
 'pad_token',
 'postprocessing',
 'preprocess',
 'preprocessing',
 'process',
 'sequential',
 'stop_words',
 'tokenize',
 'tokenizer_args',
 'truncate_first',
 'unk_token',
 'use_vocab',
 'vocab_cls']

In [6]:
# Compare the special tokens configured on the two fields, one attribute per
# line: text field value first, label field value second.
for token_attr in ('pad_token', 'unk_token', 'eos_token', 'init_token'):
    print(getattr(sentence_field, token_attr), getattr(label_field, token_attr))

<pad> None
<unk> None
None None
None None


In [7]:
# Unpack the first row into its two values (text, label) and show them.
# The two-way unpack relies on the frame having exactly two columns.
example_sentence, example_label = nsmc.iloc[0].tolist()
print(example_sentence, example_label)

애들 욕하지마라 지들은 뭐 그렇게 잘났나? 솔까 거기 나오는 귀여운 애들이 당신들보다 훨 낮다. 1


In [8]:
# Run the field's tokenizer on the example sentence and show the tokens.
list_of_tokens = sentence_field.tokenize(example_sentence)
print(list_of_tokens)

['애', '들', '욕', '하', '지', '마', '라', '지', '들', '은', '뭐', '그렇게', '잘', '났', '나', '?', '솔', '까', '거기', '나오', '는', '귀여운', '애', '들', '이', '당신', '들', '보다', '훨', '낮', '다', '.']


In [9]:
# Show each field's `sequential` flag next to the result of running its
# preprocess() on the corresponding example value.
print(sentence_field.sequential, sentence_field.preprocess(example_sentence))
print(label_field.sequential, label_field.preprocess(example_label))

True ['애', '들', '욕', '하', '지', '마', '라', '지', '들', '은', '뭐', '그렇게', '잘', '났', '나', '?', '솔', '까', '거기', '나오', '는', '귀여운', '애', '들', '이', '당신', '들', '보다', '훨', '낮', '다', '.']
False 1


### Vocab
* api guide : https://torchtext.readthedocs.io/en/latest/vocab.html#vocab

In [10]:
import itertools
from collections import Counter
from torchtext.vocab import Vocab

In [11]:
# Tokenize every document in the frame, keeping one token list per document,
# then peek at the first two results.
list_of_tokenized = [sentence_field.tokenize(document) for document in nsmc['document']]
print(list_of_tokenized[:2])

[['애', '들', '욕', '하', '지', '마', '라', '지', '들', '은', '뭐', '그렇게', '잘', '났', '나', '?', '솔', '까', '거기', '나오', '는', '귀여운', '애', '들', '이', '당신', '들', '보다', '훨', '낮', '다', '.'], ['여전히', '반복', '되', '고', '있', '는', '80', '년', '대', '한국', '멜', '로', '영화', '의', '유치', '함', '.']]


In [12]:
# Count how often each token occurs across all tokenized documents.
count_tokens = Counter(token for tokens in list_of_tokenized for token in tokens)

In [13]:
# Build a vocabulary from the token counts with min_freq=10 and attach it to
# the text field by hand.
vocab = Vocab(counter=count_tokens, min_freq=10)
sentence_field.vocab = vocab

In [14]:
# List the attribute and method names available on the vocabulary object.
dir(vocab)

['UNK',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_default_unk_index',
 'extend',
 'freqs',
 'itos',
 'load_vectors',
 'set_vectors',
 'stoi',
 'unk_index',
 'vectors']

In [15]:
# Show the example tokens, the first five vocabulary entries with the
# vocabulary size, and the result of numericalize() on a one-sentence batch.
print(list_of_tokens)
print(sentence_field.vocab.itos[:5], len(sentence_field.vocab))
print(sentence_field.numericalize([list_of_tokens]))

['애', '들', '욕', '하', '지', '마', '라', '지', '들', '은', '뭐', '그렇게', '잘', '났', '나', '?', '솔', '까', '거기', '나오', '는', '귀여운', '애', '들', '이', '당신', '들', '보다', '훨', '낮', '다', '.']
['<unk>', '<pad>', '.', '이', '는'] 8775
tensor([[ 178,   20,  427,    8,   22,  289,  106,   22,   20,   12,   99,  623,
           65,  683,   28,   26, 2402,  664,  833,  130,    4, 1161,  178,   20,
            3,  753,   20,  105, 1278,  261,    6,    2]])


In [16]:
# Detach the manually assigned vocabulary from the text field.
sentence_field.vocab = None # reset

In [17]:
# Let the field build its own vocabulary from the tokenized documents, using
# the same min_freq=10 as the manual Vocab construction.
sentence_field.build_vocab(list_of_tokenized, min_freq=10)

In [18]:
# Repeat the earlier inspection against the vocabulary created by
# build_vocab(): tokens, first five entries with size, numericalized batch.
print(list_of_tokens)
print(sentence_field.vocab.itos[:5], len(sentence_field.vocab))
print(sentence_field.numericalize([list_of_tokens]))

['애', '들', '욕', '하', '지', '마', '라', '지', '들', '은', '뭐', '그렇게', '잘', '났', '나', '?', '솔', '까', '거기', '나오', '는', '귀여운', '애', '들', '이', '당신', '들', '보다', '훨', '낮', '다', '.']
['<unk>', '<pad>', '.', '이', '는'] 8775
tensor([[ 178,   20,  427,    8,   22,  289,  106,   22,   20,   12,   99,  623,
           65,  683,   28,   26, 2402,  664,  833,  130,    4, 1161,  178,   20,
            3,  753,   20,  105, 1278,  261,    6,    2]])


### Example
* api guide : https://torchtext.readthedocs.io/en/latest/data.html#example

In [19]:
from torchtext.data import Example

# Build a single Example from the first row of the frame; the row values are
# passed together with the (attribute name, Field) pairs for each column.
example = Example.fromlist(
    nsmc.iloc[0].tolist(),
    fields=[
        ('document', sentence_field),
        ('label', label_field),
    ],
)
print(example.document, example.label)

['애', '들', '욕', '하', '지', '마', '라', '지', '들', '은', '뭐', '그렇게', '잘', '났', '나', '?', '솔', '까', '거기', '나오', '는', '귀여운', '애', '들', '이', '당신', '들', '보다', '훨', '낮', '다', '.'] 1


In [20]:
# List the attribute and method names available on the Example instance.
dir(example)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'document',
 'fromCSV',
 'fromJSON',
 'fromdict',
 'fromlist',
 'fromtree',
 'label']

### Dataset
* api guide : https://torchtext.readthedocs.io/en/latest/data.html#dataset

In [21]:
from torchtext.data import Dataset

In [22]:
# generate list of Examples
# The (attribute name, Field) pairs are identical for every row, so build the
# list once instead of once per iteration.
example_fields = [('document', sentence_field), ('label', label_field)]
# itertuples() yields plain tuples and avoids constructing a Series per row as
# iterrows() does. Columns are selected by name so the values always line up
# with example_fields, whatever the column order of the frame.
list_of_examples = [Example.fromlist(list(row), fields=example_fields)
                    for row in nsmc[['document', 'label']].itertuples(index=False, name=None)]

In [23]:
# Show the first five Example objects, then the attributes of the first one.
pprint(list_of_examples[:5])
print(list_of_examples[0].document, list_of_examples[0].label)

[<torchtext.data.example.Example object at 0x7f808fc8a978>,
 <torchtext.data.example.Example object at 0x7f808fc8a208>,
 <torchtext.data.example.Example object at 0x7f808fc8a9b0>,
 <torchtext.data.example.Example object at 0x7f808fc8a9e8>,
 <torchtext.data.example.Example object at 0x7f808fc8aa20>]
['애', '들', '욕', '하', '지', '마', '라', '지', '들', '은', '뭐', '그렇게', '잘', '났', '나', '?', '솔', '까', '거기', '나오', '는', '귀여운', '애', '들', '이', '당신', '들', '보다', '훨', '낮', '다', '.'] 1


In [24]:
# generate dataset
# Wrap the prepared examples in a Dataset, passing the same
# (attribute name, Field) pairs that were used to create them.
dataset = Dataset(examples=list_of_examples, fields=[('document', sentence_field), ('label', label_field)])

In [25]:
# Peek at the first five examples held by the dataset.
dataset.examples[:5]

[<torchtext.data.example.Example at 0x7f808fc8a978>,
 <torchtext.data.example.Example at 0x7f808fc8a208>,
 <torchtext.data.example.Example at 0x7f808fc8a9b0>,
 <torchtext.data.example.Example at 0x7f808fc8a9e8>,
 <torchtext.data.example.Example at 0x7f808fc8aa20>]

In [26]:
# Show the name-to-Field mapping stored on the dataset.
dataset.fields

{'document': <torchtext.data.field.Field at 0x7f80eefe7b00>,
 'label': <torchtext.data.field.Field at 0x7f80eefe7a90>}

In [27]:
# List the attribute and method names available on the Dataset instance.
dir(dataset)

['__add__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'download',
 'examples',
 'fields',
 'filter_examples',
 'sort_key',
 'split',
 'splits']

### TabularDataset
* api guide : https://torchtext.readthedocs.io/en/latest/data.html#tabulardataset

In [28]:
from torchtext.data import TabularDataset

# using TabularDataset
# Fresh Field objects for this section; they replace the ones defined earlier
# under the same names, so the text field has no vocabulary attached yet.
sentence_field = Field(
    sequential=True,
    use_vocab=True,
    tokenize=MeCab().morphs,
    batch_first=True,
    fix_length=32,
)
label_field = Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
    unk_token=None,
    pad_token=None,
    is_target=True,
)

# Load the file directly into a dataset, skipping its header row.
dataset = TabularDataset(
    path='data/train.txt',
    format='TSV',
    fields=[('document', sentence_field), ('label', label_field)],
    skip_header=True,
)

In [29]:
# Build the text field's vocabulary from the dataset itself.
# NOTE(review): no min_freq is passed here, unlike the earlier build_vocab
# call that used min_freq=10 — confirm the larger vocabulary is intended.
sentence_field.build_vocab(dataset)

### Batch
* api guide : https://torchtext.readthedocs.io/en/latest/data.html#batch

In [30]:
from torchtext.data import Batch

In [31]:
# Assemble one Batch by hand from the first 32 examples of the dataset.
example_mb = Batch(data=dataset.examples[:32], dataset=dataset)

In [32]:
# Display the batch's summary representation.
example_mb


[torchtext.data.batch.Batch of size 32]
	[.document]:[torch.LongTensor of size 32x32]
	[.label]:[torch.LongTensor of size 32]

In [33]:
# List the attribute and method names available on the Batch instance.
dir(example_mb)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_get_field_values',
 'batch_size',
 'dataset',
 'document',
 'fields',
 'fromvars',
 'input_fields',
 'label',
 'target_fields']

In [34]:
# Inspect the batch: its size, its field names, which fields are inputs and
# which are targets, and finally the tensors held for each field.
print(example_mb.batch_size)
print(example_mb.fields)
print(example_mb.input_fields, example_mb.target_fields)
print(example_mb.document, example_mb.label)

32
dict_keys(['document', 'label'])
['document'] ['label']
tensor([[ 178,   20,  427,  ...,  261,    6,    2],
        [1312, 1468,   57,  ...,    1,    1,    1],
        [5697,  141, 7411,  ...,    1,    1,    1],
        ...,
        [  48,   48,  114,  ...,    1,    1,    1],
        [  77,  164,   10,  ...,    1,    1,    1],
        [ 168,    1,    1,  ...,    1,    1,    1]]) tensor([1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
        0, 1, 0, 1, 0, 0, 1, 0])


### Iterator
* api guide : https://torchtext.readthedocs.io/en/latest/data.html#iterator

In [35]:
from torchtext.data import Iterator

In [36]:
# Iterator over the dataset with batch_size=2 and shuffling enabled.
# NOTE(review): no random seed is set anywhere visible, so the first batch
# presumably differs between runs — confirm if reproducible output matters.
iterator = Iterator(dataset, batch_size=2, shuffle=True)

In [37]:
# List the attribute and method names available on the Iterator instance.
dir(iterator)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_iterations_this_epoch',
 '_random_state_this_epoch',
 '_restored_from_state',
 'batch_size',
 'batch_size_fn',
 'create_batches',
 'data',
 'dataset',
 'device',
 'epoch',
 'init_epoch',
 'iterations',
 'load_state_dict',
 'random_shuffler',
 'repeat',
 'shuffle',
 'sort',
 'sort_key',
 'sort_within_batch',
 'splits',
 'state_dict',
 'train']

In [38]:
# Pull the first batch from the iterator and unpack it into two values.
# NOTE(review): presumably the input fields followed by the target fields, in
# line with the input_fields/target_fields shown for a Batch — confirm.
x_mb, y_mb = next(iter(iterator))

In [39]:
# Show the two values unpacked from the first batch.
print(x_mb, y_mb)

tensor([[ 3103,  2134,    50,   115,   782,   812,  4570,    11,   127,    15,
             4,    40,     9,  1477,   489,     2,     2,     3,  3673,     9,
            35,   326,    40,    46,   158,     3,  4133,    14,  1277,  4843,
           891,     2],
        [   67,    11, 33990,  1493,   319,  5346,  9831,     4,   117,   353,
            78,     6,   586,   832,  5020,   195,   508,   755,    14,   960,
         36372,   262,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1]]) tensor([0, 0])
