In [4]:
#import tensorflow as tf
import pandas as pd
#from pandas import ExcelWriter
#from pandas import ExcelFile
from transformers import BertForSequenceClassification,BertModel, BertConfig
import torch
from transformers import BertTokenizer
import numpy as np

In [2]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: GeForce MX150


In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
## set the padding
tokenizer.pad_token = '[PAD]'
tokenizer.pad_token_id

100%|██████████| 231508/231508 [00:00<00:00, 977335.36B/s]


0

In [11]:
## bert encoder  -- set output_hidden_states to be TRUE
encoder = BertModel.from_pretrained('bert-base-uncased',output_hidden_states=True)


In [12]:
encoder.config

{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": true,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

## pading and truncation // mask

In [14]:
from keras.preprocessing.sequence import pad_sequences


In [15]:
def preprocess_data(tokenizer, sentences, MAX_LEN = 256):
    """
    :params[in]: tokenizer, the configured tokenizer
    :params[in]: sentences, list of strings
    """
    # 1. Tokenize all of the sentences and map the tokens to thier word IDs.
    input_ids = []
    
    # For every sentence...
    for sent in sentences:
        # `encode` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        encoded_sent = tokenizer.encode(
                            sent,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            # This function also supports truncation and conversion
                            # to pytorch tensors, but we need to do padding, so we
                            # can't use these features :( .
                            #max_length = 128,          # Truncate all sentences.
                            #return_tensors = 'pt',     # Return pytorch tensors.
                       )        
        # Add the encoded sentence to the list.
        input_ids.append(encoded_sent)
    
    # We'll borrow the `pad_sequences` utility function to do this.
    from keras.preprocessing.sequence import pad_sequences
    
    # Set the maximum sequence length.
    # maximum training sentence length of 87...
    
    print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)
    
    print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))
    
    # Pad our input tokens with value 0.
    # "post" indicates that we want to pad and truncate at the end of the sequence,
    # as opposed to the beginning.
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", 
                              value=0, truncating="post", padding="post")
    
    print('\nDone.')
    # Create attention masks
    attention_masks = []
    # For each sentence...
    for sent in input_ids:
        
        # Create the attention mask.
        #   - If a token ID is 0, then it's padding, set the mask to 0.
        #   - If a token ID is > 0, then it's a real token, set the mask to 1.
        att_mask = [int(token_id > 0) for token_id in sent]
        
        # Store the attention mask for this sentence.
        attention_masks.append(att_mask)
    return input_ids, attention_masks


In [16]:
### an example
sentences = ['I am working at Duke university', 'Duke is at Durham, North carolina']

In [17]:
input_ids,attention_masks=preprocess_data(tokenizer, sentences, MAX_LEN = 256)


Padding/truncating all sentences to 256 values...

Padding token: "[PAD]", ID: 0

Done.


In [18]:
input_ids

array([[ 101, 1045, 2572, 2551, 2012, 3804, 2118,  102,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [28]:
train_inputs = torch.LongTensor(input_ids)
train_masks = torch.LongTensor(attention_masks)


In [31]:
out=encoder(train_inputs)

In [32]:
help(encoder)

Help on BertModel in module transformers.modeling_bert object:

class BertModel(BertPreTrainedModel)
 |  BertModel(config)
 |  
 |  The bare Bert Model transformer outputting raw hidden-states without any specific head on top.    The BERT model was proposed in
 |  `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_
 |  by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer
 |  pre-trained using a combination of masked language modeling objective and next sentence prediction
 |  on a large corpus comprising the Toronto Book Corpus and Wikipedia.
 |  
 |  This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
 |  refer to the PyTorch documentation for all matter related to general usage and behavior.
 |  
 |  .. _`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`:
 |      https://arxiv.org/abs/1810.04805
 |  
 |  .. _`torch.nn.Module`:
 |    




In [33]:
type(out)

tuple

In [34]:
out[0]

tensor([[[-0.3638,  0.5134,  0.5495,  ..., -0.4841,  0.4963, -0.5592],
         [ 0.2451, -0.1647,  0.3254,  ...,  0.0251,  0.6870,  0.3468],
         [-0.1145,  0.4743,  0.3670,  ..., -0.1040,  0.4522,  0.1831],
         ...,
         [ 0.4062,  0.1322,  0.8410,  ..., -0.7967,  0.3255, -1.1899],
         [ 0.2683,  0.0852,  0.9596,  ..., -0.7396,  0.2321, -1.0489],
         [ 0.4787, -0.0018,  0.9689,  ..., -0.8042,  0.3525, -1.4292]],

        [[-0.3720,  0.4194,  0.3879,  ..., -0.4801,  0.2841, -0.9154],
         [-0.5130,  0.0353,  1.1188,  ..., -0.2336,  0.7592, -0.0448],
         [-0.4853,  0.3472,  0.1404,  ..., -0.1242,  0.0666,  0.6161],
         ...,
         [ 0.5417,  0.1160,  0.8503,  ..., -0.8669,  0.0889, -1.2774],
         [ 0.4392,  0.0624,  0.9680,  ..., -0.8067,  0.0308, -1.2666],
         [ 0.6183, -0.0764,  0.9619,  ..., -0.8852,  0.1123, -1.6548]]],
       grad_fn=<NativeLayerNormBackward>)

In [35]:
out[1]

tensor([[-0.4736, -0.3670, -0.9379,  ..., -0.6772, -0.4092,  0.3748],
        [-0.2237, -0.4750, -0.9885,  ..., -0.8886, -0.3218,  0.0244]],
       grad_fn=<TanhBackward>)

In [36]:
out[2]

(tensor([[[ 1.6855e-01, -2.8577e-01, -3.2613e-01,  ..., -2.7571e-02,
            3.8253e-02,  1.6400e-01],
          [-3.3997e-04,  5.3974e-01, -2.8805e-01,  ...,  7.5731e-01,
            8.9008e-01,  1.6575e-01],
          [-6.3496e-01,  1.9748e-01,  2.5116e-01,  ..., -4.0819e-02,
            1.3468e+00, -6.9357e-01],
          ...,
          [ 1.2490e-01, -4.9897e-01, -1.1414e-01,  ..., -3.4521e-01,
           -5.5017e-01,  1.8253e-02],
          [ 1.6472e-01, -6.4177e-01,  3.7283e-01,  ..., -5.6678e-01,
           -1.9548e-01, -1.8885e-01],
          [ 4.3010e-01, -8.7304e-01,  2.0456e-01,  ..., -2.8683e-01,
           -7.8286e-01, -3.8871e-01]],
 
         [[ 1.6855e-01, -2.8577e-01, -3.2613e-01,  ..., -2.7571e-02,
            3.8253e-02,  1.6400e-01],
          [ 2.6300e-01,  9.6157e-02, -7.1425e-01,  ..., -6.0576e-01,
           -1.0222e+00, -3.7969e-02],
          [-6.2703e-01, -6.3313e-02, -3.1428e-01,  ...,  3.4265e-01,
            4.6361e-01,  4.5937e-01],
          ...,
    