# Visual Guide to Using Bert for the First Time

source: http://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/

In [1]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


In [2]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)

In [3]:
df.head()

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [4]:
df.shape

(6920, 2)

In [6]:
df = df[0:100]

In [7]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [8]:
tokenizer

<transformers.tokenization_distilbert.DistilBertTokenizer at 0x7f6989a112b0>

In [9]:
model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

In [10]:
# tokenized = df[0].apply((lambda x: tokenizer.encode(x, 
#                         add_special_tokens=True)))

In [11]:
# tokenized.shape

In [12]:
# for item in tokenized[0:30]:
#     print(len(item))

In [13]:
tokenized2 = tokenizer(list(df[0]),padding=True, 
                                            truncation=True, 
                                            return_tensors='pt')

In [14]:
type(tokenized2)

transformers.tokenization_utils_base.BatchEncoding

In [15]:
tokenized2.keys()

dict_keys(['input_ids', 'attention_mask'])

In [16]:
tokenized2["input_ids"].shape

torch.Size([100, 54])

In [17]:
len(tokenized2["input_ids"][0])

54

In [18]:
type(tokenized2["input_ids"][0])

torch.Tensor

In [19]:
# max_len = 0
# for i in tokenized.values:
#     if len(i) > max_len:
#         max_len = len(i)

# padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [20]:
# np.array(padded).shape

In [21]:
# input_ids = torch.tensor(np.array(padded))


In [22]:
# input_ids.shape

In [23]:
# input_ids == tokenized2["input_ids"]

In [24]:
input_ids = tokenized2["input_ids"]

In [25]:

# attention_mask = np.where(padded != 0, 1, 0)
# attention_mask.shape

In [26]:
tokenized2["attention_mask"].shape

torch.Size([100, 54])

In [27]:
# attention_mask == tokenized2["attention_mask"]

In [28]:
with torch.no_grad():
    last_hidden_states = model(input_ids)   # or use the o

In [31]:
type(last_hidden_states)

tuple

In [32]:
len(last_hidden_states)

1

In [33]:
type(last_hidden_states[0])

torch.Tensor

In [34]:
last_hidden_states[0].shape

torch.Size([100, 54, 768])

That's number of examples (100), max number of tokens, and number of hidden units in the BERT model. We want to keep all 100 examples, but we want only the first token, `CLS`, and its embedding (768-long).

In [35]:
# How to extract the CLS token.
features = last_hidden_states[0][:,0,:].numpy()

In [36]:
features.shape

(100, 768)

In [37]:
labels = df[1]
labels.shape

(100,)

In [38]:
train_features = features[0:80]
train_labels = labels[0:80]

lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [39]:
test_features = features[81:]
test_labels = labels[81:]
lr_clf.score(test_features, test_labels)

0.5263157894736842