In [1]:
# Question: can a transformer handle a very long document?

In [2]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
# OPTIONAL: configure logging for the session.
# With level=ERROR only ERROR and CRITICAL records are emitted; use
# logging.INFO instead if you want more information on what's happening.
import logging
logging.basicConfig(level=logging.ERROR)

In [None]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')

# Switch to evaluation mode so dropout is disabled; otherwise repeated forward
# passes over the same input give different hidden states.
model.eval()

# Move model to GPU when one is available, otherwise keep it on the CPU.
# The trailing semicolon stops the notebook from echoing the full module repr.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device);

In [51]:
# Sample passage (two paragraphs) used to probe the model with a long input.
# The two adjacent string literals are concatenated by the parser, so the
# resulting value contains no newline character between the paragraphs.
text = (
    'Under President Antonio López de Santa Anna, the Mexican government began to shift away from a federalist model to a more centralized government. His increasingly dictatorial policies, including the revocation of the Constitution of 1824 in early 1835, incited federalists throughout the nation to revolt. The Mexican Army quickly put down revolts in the Mexican interior, including a brutal suppression of militias in Oaxaca and Zacatecas. Unrest continued in the Mexican state of Coahuila y Tejas. The area that bordered the United States, known as Texas, was populated primarily by English-speaking settlers, known as Texians. In October, the Texians took up arms in what became known as the Texas Revolution. The following month, Texians declared themselves part of a state independent from Coahuila and created a provisional state government based on the principles of the Constitution of 1824. By the end of the year, all Mexican troops had been expelled from Texas. Leading federalists in Mexico advocated a plan to attack centralist troops in Matamoros. Members of the General Council, the interim Texas governing body, were enamored with the idea of a Matamoros Expedition. They hoped it would inspire other federalist states to revolt and keep the bored Texian troops from deserting the army. Most importantly, it would move the war zone outside Texas. The Council officially approved the plan on December 25, and on December 30 Frank W. Johnson, the commander of the volunteer army, and his aide James Grant took the bulk of the army and almost all of the supplies to Goliad to prepare for the expedition. Determined to quash the rebellion, Santa Anna began assembling a large force to restore order; by the end of 1835, his army numbered 6,019 soldiers. '
    'In late December, at his behest, Congress passed the Tornel Decree, declaring that any foreigners fighting against Mexican troops "will be deemed pirates and dealt with as such, being citizens of no nation presently at war with the Republic and fighting under no recognized flag". In the early nineteenth century, captured pirates were executed immediately. The resolution thus gave the Mexican Army permission to take no prisoners in the war against the Texians. Santa Anna personally led the bulk of his troops inland to San Antonio de Béxar and ordered General José de Urrea to lead 550 troops along the Atascocita Road toward Goliad. Urrea\'s efforts to quell the rebellion along the Texas Gulf Coast have become known as the Goliad Campaign.'
)

In [52]:
# Length of the passage in characters.
len(text)

2511

In [53]:
# Split the passage into tokens with the pre-trained tokenizer.
# NOTE(review): no '[CLS]' / '[SEP]' markers are added around the text here;
# confirm whether feeding the bare passage to the model is intended.
tokenized_text = tokenizer.tokenize(text)

# Look up the vocabulary id of every token.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Build a 1-D id tensor and add a leading batch dimension of size 1.
tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)

In [58]:
# Number of tokens the tokenizer produced for the passage.
# NOTE(review): bert-base-uncased is documented to accept at most 512
# positions, so this count bounds how long a document can be — confirm
# against the model config.
len(tokenized_text)

508

In [56]:
# Number of distinct characters that occur in the passage.
len(set(text))

64

In [55]:
# Number of vocabulary ids; expected to equal len(tokenized_text).
len(indexed_tokens)

508

In [24]:
# Put the input on whichever device holds the model's weights, so the forward
# pass never mixes devices (this also works when the model lives on the CPU).
tokens_tensor = tokens_tensor.to(next(model.parameters()).device)

In [25]:
# Predict hidden-state features for each layer.
# torch.no_grad() turns off gradient tracking, which is not needed for inference.
with torch.no_grad():
    # The model call returns two values; only the first one is kept.
    encoded_layers, _ = model(tokens_tensor)
# We have one hidden-state tensor for each of the 12 layers in model bert-base-uncased

In [26]:
# Number of per-layer outputs returned by the model.
len(encoded_layers)

12

In [29]:
# Shape of the first returned layer output
# (presumably batch x tokens x hidden size — verify against the model docs).
encoded_layers[0].shape

torch.Size([1, 508, 768])

In [30]:
# First returned layer, first (and only) sequence in the batch.
encoded_layers[0][0]

tensor([[ 0.0103,  0.5248, -0.2999,  ...,  0.3151,  0.0445,  0.1972],
        [ 0.1347,  0.5342, -1.7163,  ..., -0.5347,  0.2544,  0.0442],
        [-0.3892, -0.3067, -0.7275,  ..., -0.2215,  0.8551, -0.2032],
        ...,
        [-0.4019,  0.3912,  0.6176,  ..., -0.9811,  0.3358,  0.1371],
        [ 0.5240,  1.4358, -1.8185,  ...,  0.9432, -0.0852,  0.7728],
        [-0.7077,  0.7240,  0.0624,  ...,  0.0339, -0.1256,  0.1096]],
       device='cuda:0')

In [27]:
# Shape of the last returned layer output.
encoded_layers[-1].shape

torch.Size([1, 508, 768])