In [19]:
import os
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

from bert.tokenization import FullTokenizer

In [36]:
BERT_MODEL_URL = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
MAX_SEQ_LEN = 512

In [37]:

bert_model = hub.Module(
    bert_path,
    trainable=False,
    name="bert_module"
)

In [29]:
bert_inputs = dict(
  input_ids=[2034, 2017, 2342, 2079, 2079, 2009],
  input_mask=input_mask,
  segment_ids=segment_ids
)
    
bert_model(
  inputs=bert_inputs,
  signature="tokens",
  as_dict=True
)

TypeError: Cannot convert dict_inputs: missing ['input_mask', 'segment_ids'], extra given []

In [32]:
def create_tokenizer_from_hub_module(bert_hub_url):
    """
    Get the vocab file and casing info from the Hub module.
    """
    bert_module = hub.Module(bert_hub_url)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)

    with tf.Session() as sess:
        vocab_file, do_lower_case = sess.run([
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"]
        ])
        
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)


tokenizer = create_tokenizer_from_hub_module(BERT_MODEL_URL)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0517 20:42:29.558332 140356896995136 saver.py:1489] Saver not created because there are no variables in the graph to restore


In [38]:
tokenizer.tokenize("first you need do do it")

['first', 'you', 'need', 'do', 'do', 'it']

In [51]:
from collections import namedtuple
from functools import partial

        
InputExample = namedtuple('InputExample', ['guid', 'text_a', 'text_b'])

InputFeatures = namedtuple('InputFeatures', ['guid', 'tokens', 'input_ids', 'input_mask', 'input_type_ids'])

        
def pad(l, n, pad):
    """
    Pad the list 'l' to have size 'n' using 'padding_element'
    """
    return l + [pad] * (n - len(l))
                                     

In [53]:
docs = [
    'First you need to try',
    'Then you need to try it harder'
]

examples = [
    InputExample(guid=i, text_a=doc, text_b=None)
    for i, doc in enumerate(docs)
]


def convert_single_example(tokenizer, example, max_seq_len=256):
    """
    Convert a single `InputExample example` to the Bert input format
    """
    tokens = tokenizer.tokenize(example.text_a)
    
    if len(tokens) > max_seq_len - 2:
        tokens = tokens[0:(max_seq_len - 2)]
        
    tokens = ['[CLS]'] + tokens + ['[SEP]']

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_type_ids = [0] * len(input_ids)
    input_mask = [1] * len(input_ids)
    
    input_ids = pad(input_ids, max_seq_len, 0)
    input_type_ids = pad(input_type_ids, max_seq_len, 0)
    input_mask = pad(input_mask, max_seq_len, 0)
    
    input_features = InputFeatures(example.guid, tokens, input_ids, input_mask, input_type_ids) 
    
    return input_features
    
    

def convert_examples_to_features(tokenizer, examples, max_seq_len=256):
    """
    Convert raw features to Bert specific representation
    """
    converter = partial(convert_single_example, tokenizer=tokenizer, max_seq_len=max_seq_len)
    examples = [converter(example=example) for example in examples]
    return examples

    
convert_examples_to_features(tokenizer, examples)

[InputFeatures(guid=0, tokens=['[CLS]', 'first', 'you', 'need', 'to', 'try', '[SEP]'], input_ids=[101, 2034, 2017, 2342, 2000, 3046, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], input_mask=[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
train = convert_examples_to_features(tokenizer, train_examples, max_seq_length=MAX_SEQ_LEN)
train_input_ids, train_input_masks, train_segment_ids, train_labels = train

In [35]:
bert.run_classifier

<module 'bert.run_classifier' from '/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/bert/run_classifier.py'>

In [49]:
a = [1,1,1,1]

def pad(l, n, pad):
    """
    Pad the list 'l' to have size 'n' using 'padding_element'
    """
    return l + [pad] * (n - len(l))
                             
pad(a, 4, 0)                             

[1, 1, 1, 1]