## This only includes bert model,
## To use the model, requires to get tokenization for bert

In [1]:
# download pre-trained model
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip

--2019-11-06 00:41:40--  https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.15.128, 2a00:1450:400c:c0a::80
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.15.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 407727028 (389M) [application/zip]
Saving to: ‘uncased_L-12_H-768_A-12.zip’


2019-11-06 00:41:44 (112 MB/s) - ‘uncased_L-12_H-768_A-12.zip’ saved [407727028/407727028]



In [2]:
!unzip /content/uncased_L-12_H-768_A-12.zip

Archive:  /content/uncased_L-12_H-768_A-12.zip
   creating: uncased_L-12_H-768_A-12/
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: uncased_L-12_H-768_A-12/vocab.txt  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: uncased_L-12_H-768_A-12/bert_config.json  


In [3]:
!pip install bert-tensorflow

Collecting bert-tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/a6/66/7eb4e8b6ea35b7cc54c322c816f976167a43019750279a8473d355800a93/bert_tensorflow-1.0.1-py2.py3-none-any.whl (67kB)
[K     |████▉                           | 10kB 22.8MB/s eta 0:00:01[K     |█████████▊                      | 20kB 4.5MB/s eta 0:00:01[K     |██████████████▋                 | 30kB 6.3MB/s eta 0:00:01[K     |███████████████████▍            | 40kB 4.2MB/s eta 0:00:01[K     |████████████████████████▎       | 51kB 5.1MB/s eta 0:00:01[K     |█████████████████████████████▏  | 61kB 6.1MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 4.3MB/s 
Installing collected packages: bert-tensorflow
Successfully installed bert-tensorflow-1.0.1


In [4]:
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import os
import re
import numpy as np
from bert.tokenization import FullTokenizer
from tqdm import tqdm_notebook
from tensorflow.keras import backend as K

## Below is for getting data < IMDB LARGE >

In [5]:
# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
  data = {}
  data["sentence"] = []
  data["sentiment"] = []
  for file_path in os.listdir(directory):
    with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
      data["sentence"].append(f.read())
      data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
  return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
  pos_df = load_directory_data(os.path.join(directory, "pos"))
  neg_df = load_directory_data(os.path.join(directory, "neg"))
  pos_df["polarity"] = 1
  neg_df["polarity"] = 0
  return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
  dataset = tf.keras.utils.get_file(
      fname="aclImdb.tar.gz", 
      origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
      extract=True)

  train_df = load_dataset(os.path.join(os.path.dirname(dataset), 
                                       "aclImdb", "train"))
  test_df = load_dataset(os.path.join(os.path.dirname(dataset), 
                                      "aclImdb", "test"))

  return train_df, test_df

# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

train_df, test_df = download_and_load_datasets()
train_df.head()

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


Unnamed: 0,sentence,sentiment,polarity
0,Have to be honest and say that I haven't seen ...,8,1
1,"More wide-eyed, hysterical 50s hyper-cheerfuln...",3,0
2,They had me from the first show.<br /><br />We...,9,1
3,This movie bewilders me. It may be that I'm ju...,3,0
4,Excellent episode movie ala Pulp Fiction. 7 da...,10,1


In [0]:
max_seq_length = 256

# Create datasets (Only take up to max_seq_length words for memory)
train_text = train_df['sentence'].tolist()
train_text = [' '.join(t.split()[0:max_seq_length]) for t in train_text]
train_text = np.array(train_text, dtype=object)[:, np.newaxis]
train_label = train_df['polarity'].tolist()

test_text = test_df['sentence'].tolist()
test_text = [' '.join(t.split()[0:max_seq_length]) for t in test_text]
test_text = np.array(test_text, dtype=object)[:, np.newaxis]
test_label = test_df['polarity'].tolist()

In [0]:
# Setting Bert Config Setting

BERT_VOCAB = '/content/uncased_L-12_H-768_A-12/vocab.txt'
BERT_INIT_CHKPNT = '/content/uncased_L-12_H-768_A-12/bert_model.ckpt'
BERT_CONFIG = '/content/uncased_L-12_H-768_A-12/bert_config.json'

In [0]:
import bert
from bert import run_classifier, run_classifier_with_tfhub, modeling, tokenization

In [0]:
tokenization.validate_case_matches_checkpoint(True, BERT_INIT_CHKPNT) # <- should it be required?

In [0]:
tokenizer = tokenization.FullTokenizer(vocab_file = BERT_VOCAB, do_lower_case=True)

In [19]:
tokenizer.tokenize("This here’s an example of using the BERT tokenizer")

['this',
 'here',
 '’',
 's',
 'an',
 'example',
 'of',
 'using',
 'the',
 'bert',
 'token',
 '##izer']

In [22]:
len(tokenizer.vocab)

30522

# Convert data to bert to understand

In [0]:
test_sentence = "This here’s an example of using the BERT tokenizer"

In [0]:
class InputExample(object):
  def __init__(self, guid, text_a, text_b=None, labels=None):
    self.guid = guid
    self.text_a = text_a
    self.text_b = text_b
    self.labels = labels

In [48]:
test_input_example = bert.run_classifier.InputExample(guid="", text_a=test_sentence, text_b=None, label=0)

test_input_example

<bert.run_classifier.InputExample at 0x7f0b05eb12b0>

In [52]:
bert.run_classifier.convert_examples_to_features(examples=[test_input_example], label_list=[0,1], max_seq_length=32, tokenizer=tokenizer)

[<bert.run_classifier.InputFeatures at 0x7f0b05ed0400>]

In [0]:
train_df['index'] = train_df.index

In [60]:
train_df.head()

Unnamed: 0,sentence,sentiment,polarity,index
0,Have to be honest and say that I haven't seen ...,8,1,0
1,"More wide-eyed, hysterical 50s hyper-cheerfuln...",3,0,1
2,They had me from the first show.<br /><br />We...,9,1,2
3,This movie bewilders me. It may be that I'm ju...,3,0,3
4,Excellent episode movie ala Pulp Fiction. 7 da...,10,1,4


In [0]:
# create exmple from df

def create_example(df, labels_available=True):
  '''
  create_examples(), reads data-frame and loads input text and corresponding target labels into InputExample objects.
  '''
  examples = []
  for (i, row) in enumerate(df.values):
    guid = row[3]
    text_a = row[1]
    if labels_available:
      labels = row[2]
    else:
      labels = [0]
    examples.append(bert.run_classifier.InputExample(guid=guid, text_a=text_a, text_b=None, label=labels))
  return examples

In [0]:
train_data = create_example(train_df)

In [69]:
print(len(train_data))
train_data[:5]

25000


[<bert.run_classifier.InputExample at 0x7f0b05f89a90>,
 <bert.run_classifier.InputExample at 0x7f0b05f89ac8>,
 <bert.run_classifier.InputExample at 0x7f0b05f899e8>,
 <bert.run_classifier.InputExample at 0x7f0b05f89a20>,
 <bert.run_classifier.InputExample at 0x7f0b05f89b00>]

In [0]:
LABEL_LIST = [0, 1]
MAX_LEN=256

In [72]:
bert.run_classifier.convert_examples_to_features(train_data[0], label_list=LABEL_LIST, max_seq_length=MAX_LEN, tokenizer=tokenizer)

TypeError: ignored