In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 13.9MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 58.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 43.0MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=4bbbcbcb86

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchtext
from torchtext.legacy import data, datasets

import random
import numpy as np
import time
import spacy

import transformers
from transformers import BertTokenizer, BertModel

# report the versions of the main libraries on a single line
print(
    f'torch : {torch.__version__}  |  torchtext : {torchtext.__version__}  |  '
    f'spacy : {spacy.__version__}  |  np : {np.__version__}  |  '
    f'transformers : {transformers.__version__}'
)

torch : 1.8.1+cu101  |  torchtext : 0.9.1  |  spacy : 2.2.4  |  np : 1.19.5  |  transformers : 4.4.2


## Preparing Data

In [3]:
# load the pre-trained tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# show its class and its repr, one per line
print(type(tokenizer), tokenizer, sep='\n')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…


<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>
PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [4]:
sent = 'hello World How ARE yoU?'

# split the sample sentence into tokens, then look up the id of each token
tokens = tokenizer.tokenize(sent)
print(type(tokens), tokens, tokenizer.convert_tokens_to_ids(tokens), sep='\n', end='\n\n')

# call the tokenizer directly on the sentence, then decode the resulting ids back to text
batch_indices = tokenizer(sent)
print(type(batch_indices), batch_indices, tokenizer.decode(batch_indices['input_ids']), sep='\n')

<class 'list'>
['hello', 'world', 'how', 'are', 'you', '?']
[7592, 2088, 2129, 2024, 2017, 1029]

<class 'transformers.tokenization_utils_base.BatchEncoding'>
{'input_ids': [101, 7592, 2088, 2129, 2024, 2017, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] hello world how are you? [SEP]


In [5]:
# save special tokens
cls_token = tokenizer.cls_token
sep_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

# save special token ids
cls_token_id = tokenizer.convert_tokens_to_ids(cls_token)
sep_token_id = tokenizer.convert_tokens_to_ids(sep_token)
pad_token_id = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_id = tokenizer.convert_tokens_to_ids(unk_token)

# check each special token & id
for tk in ['cls', 'sep', 'pad', 'unk']:
  print(eval(tk+'_token'), '-', eval(tk+'_token_id'), end='\t')

[CLS] - 101	[SEP] - 102	[PAD] - 0	[UNK] - 100	

In [6]:
# Read the limit from the loaded tokenizer instance instead of looking it up by a
# hard-coded checkpoint name, so it cannot drift from the checkpoint loaded earlier.
# NOTE(review): for checkpoints without a registered limit this attribute may be a
# very large sentinel value; confirm before switching to another model.
max_input_length = tokenizer.model_max_length
print('max_input_length :', max_input_length)


def tokenize_and_cut(sentence, maxlen=max_input_length-2):
  """Tokenize `sentence` with the notebook-level `tokenizer` and keep at most `maxlen` tokens.

  The default is two below `max_input_length`, which leaves two positions free
  (the text Field defined later in the notebook adds an init and an eos token).
  """
  return tokenizer.tokenize(sentence)[:maxlen]

max_input_length : 512


In [7]:
%%time

# set random seed for reproducibility
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# download and split dataset (train, valid, test)
TEXT = data.Field(
    tokenize=tokenize_and_cut, preprocessing=tokenizer.convert_tokens_to_ids,
    batch_first=True, use_vocab=False, 
    init_token=cls_token, eos_token=sep_token_id, pad_token=pad_token_id, unk_token=unk_token_id
)
LABEL = data.LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:10<00:00, 7.74MB/s]


CPU times: user 6min 42s, sys: 11.8 s, total: 6min 54s
Wall time: 7min 6s


In [8]:
# check the type and size of dataset
print(f'>>> type : {type(train_data)}')
print(f'>>> Number of training examples: {len(train_data)}')   # 17500 (35%)
print(f'>>> Number of validation examples: {len(valid_data)}') # 7500  (15%)
print(f'>>> Number of testing examples: {len(test_data)}')     # 25000 (50%)
print()

# check one sample data
tmp_ex = train_data[6]
tmp_dict = vars(tmp_ex)

print('< sample data >')
print('>>> type :', type(tmp_ex))
print('>>> length :', len(tmp_dict['text']))
print('>>> tokens :', tokenizer.convert_ids_to_tokens(tmp_dict['text'][:50]))
for key in tmp_dict:
  print(f'>>> {key} : {tmp_dict[key][:50]}')

>>> type : <class 'torchtext.legacy.data.dataset.Dataset'>
>>> Number of training examples: 17500
>>> Number of validation examples: 7500
>>> Number of testing examples: 25000

< sample data >
>>> type : <class 'torchtext.legacy.data.example.Example'>
>>> length : 322
>>> tokens : ['although', 'nothing', 'can', 'compare', 'to', 'vampires', 'vs', '.', 'zombies', '.', '.', '.', 'in', 'any', 'realm', 'of', 'film', 'making', 'i', 'will', 'attempt', 'to', 'judge', 'this', 'movie', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'firstly', ',', 'the', 'special', 'effects', 'were', 'breath', '-', 'taking', '.', 'when', 'there', 'was', 'an', 'explosion', 'on']
>>> text : [2348, 2498, 2064, 12826, 2000, 6144, 5443, 1012, 14106, 1012, 1012, 1012, 1999, 2151, 8391, 1997, 2143, 2437, 1045, 2097, 3535, 2000, 3648, 2023, 3185, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 15847, 1010, 1996, 2569, 3896, 2020, 3052, 1011, 2635, 1012, 2043, 2045, 2001, 2019, 7738, 2006]
>>> label : neg


In [9]:
# build vocabulary (only LABEL)
LABEL.build_vocab(train_data)

# show the vocabulary size and the label -> index mapping
print(
    f">>> Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}",
    LABEL.vocab.stoi,
    sep='\n',
)

>>> Unique tokens in LABEL vocabulary: 2
defaultdict(None, {'neg': 0, 'pos': 1})


In [10]:
# create the iterators
BATCH_SIZE = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device
)

display(device, type(train_iterator), len(train_iterator), len(train_data)/BATCH_SIZE)

device(type='cpu')

torchtext.legacy.data.iterator.BucketIterator

137

136.71875