In [1]:
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import os
import re
import numpy as np
from bert.tokenization import FullTokenizer
from tqdm import tqdm
from tensorflow.keras import backend as K

# Initialize session
# This session is reused later in the notebook: to read the tokenizer assets
# from the Hub module and to run the variable/table initializers.
sess = tf.Session()


# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

# Params for bert model and tokenization
# TF-Hub handle of the BERT module the tokenizer is created from.
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
# Used both as the per-text word cap when building the text arrays and as the
# padded token-sequence length fed to the model.
max_seq_length = 256

# Data

First, we load the sample IMDB data.

In [2]:
# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
    """Read every review file in `directory` into a DataFrame.

    File names are expected to look like ``<id>_<rating>.txt``; the rating
    captured from the name is stored (as a string) in the ``sentiment`` column
    and the file content in the ``sentence`` column.

    Args:
        directory: path of the folder whose files are read.

    Returns:
        A pandas DataFrame with the columns ``sentence`` and ``sentiment``,
        one row per file whose name matches the expected pattern.
    """
    # Raw string: "\d" and "\." in a normal string literal are invalid escape
    # sequences and trigger warnings on recent Python versions.
    name_pattern = re.compile(r"\d+_(\d+)\.txt")
    data = {"sentence": [], "sentiment": []}
    for file_path in os.listdir(directory):
        match = name_pattern.match(file_path)
        if match is None:
            # Entries that do not follow the naming scheme are skipped. The
            # previous code crashed on `None.group(1)` after the sentence had
            # already been appended, leaving the two columns out of sync.
            continue
        with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
            data["sentence"].append(f.read())
        data["sentiment"].append(match.group(1))
    return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory, frac=.10, random_state=None):
    """Load the ``pos`` and ``neg`` sub-folders of `directory` into one shuffled frame.

    Args:
        directory: folder that contains the ``pos`` and ``neg`` sub-folders.
        frac: fraction of the merged rows to keep. Defaults to 0.10, the value
            that used to be hard-coded.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            sample can be reproduced. ``None`` (default) keeps the previous
            unseeded behaviour.

    Returns:
        A DataFrame holding the sampled rows of both folders with an extra
        ``polarity`` column (1 for ``pos``, 0 for ``neg``) and a fresh
        0-based index.
    """
    pos_df = load_directory_data(os.path.join(directory, "pos"))
    neg_df = load_directory_data(os.path.join(directory, "neg"))
    pos_df["polarity"] = 1
    neg_df["polarity"] = 0
    merged = pd.concat([pos_df, neg_df])
    # sample() both shuffles and sub-samples the merged rows.
    return merged.sample(frac=frac, random_state=random_state).reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
    """Fetch the IMDB archive and load its train and test splits.

    Args:
        force_download: accepted for interface compatibility; this function
            does not read it.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames built by `load_dataset`
        from the ``aclImdb/train`` and ``aclImdb/test`` folders located next
        to the downloaded archive.
    """
    archive_path = tf.keras.utils.get_file(
        fname="aclImdb.tar.gz",
        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        extract=True,
    )

    # Both splits live under the same extracted root folder.
    imdb_root = os.path.join(os.path.dirname(archive_path), "aclImdb")
    train_split = load_dataset(os.path.join(imdb_root, "train"))
    test_split = load_dataset(os.path.join(imdb_root, "test"))

    return train_split, test_split


# Build the sampled IMDB train/test DataFrames.
# NOTE: `train_df` and `test_df` are reassigned in the next cell from
# data/train.json, so these frames only serve as a sample of the data format.
train_df, test_df = download_and_load_datasets()
# Display the last rows of the training frame as the cell output.
train_df.tail()

Unnamed: 0,sentence,sentiment,polarity
2495,In this 'sequel' Bruce is still called Billy L...,2,0
2496,I actually had quite high hopes going into thi...,1,0
2497,Indian cinema typifies cops of two broad categ...,9,1
2498,WOW! Why would anybody make a sequel to an alr...,1,0
2499,A couple(Janet and Richard) go camping out in ...,7,1


In [3]:
from sklearn.model_selection import train_test_split

# Load the claims dataset; this replaces the IMDB frame loaded above.
train_df = pd.read_json('data/train.json')

# Work on a 10% sample. The sample is seeded so that a fresh "Restart & Run
# All" yields the same rows; without a seed the seeded split below was still
# irreproducible.
train_df = train_df.sample(frac=.10, random_state=0).reset_index(drop=True)

# Human-readable names of the numeric label codes.
labels = {0:'false', 1:'partly true', 2:'true'}

def label(x):
    """Return the human-readable name of the numeric label code `x`."""
    return labels[x]

train_df['labelCode'] = train_df.label.apply(label)

# Class balance of the sampled data, as text and as a bar chart.
print(train_df.labelCode.value_counts())
train_df.labelCode.value_counts().plot(kind='bar')

# Use the column names the rest of the notebook expects. A new frame is
# assigned instead of mutating in place.
train_df = train_df.rename(columns={"claim": "sentence", "label": "polarity"})

# Hold out 20% of the sampled rows as the test set.
train_df, test_df = train_test_split(train_df, test_size = 0.2, random_state = 0)
print(train_df.shape)
print(test_df.shape)
train_df.tail()

false          741
partly true    644
true           171
Name: labelCode, dtype: int64
(1244, 7)
(312, 7)


Unnamed: 0,sentence,claimant,date,id,polarity,related_articles,labelCode
763,"Barack Obama's health care plan ""would leave 1...",Hillary Clinton,2007-11-15,16211,1,"[71124, 10338]",partly true
835,"""It isn't me cutting the budget. It's the Cong...",Chuck Hagel,2014-03-02,6379,1,"[88591, 75466]",partly true
1216,Kroger's stores are offering free fruit to chi...,,2017-09-18,5257,1,"[117124, 129068, 129174, 129490]",partly true
559,"""I've cut taxes for ... middle-class families,...",Barack Obama,2012-09-06,11775,1,"[89846, 93587]",partly true
684,The creator of Pokémon said in an interview th...,,2016-08-07,9677,0,"[107982, 125133, 126014, 126595]",false


In [4]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

# encode class values as integers
encoder = LabelEncoder()


def _truncate_texts(sentences, max_words):
    """Keep at most `max_words` whitespace-separated words of each sentence.

    Returns an object array of shape (len(sentences), 1).
    """
    clipped = [' '.join(s.split()[0:max_words]) for s in sentences]
    return np.array(clipped, dtype=object)[:, np.newaxis]


# Create datasets (Only take up to max_seq_length words for memory)
train_text = _truncate_texts(train_df['sentence'].tolist(), max_seq_length)
train_label = train_df['polarity'].tolist()
print(train_text[0], train_text.shape)
print(train_label[0])

test_text = _truncate_texts(test_df['sentence'].tolist(), max_seq_length)
test_label = test_df['polarity'].tolist()
print(test_text.shape)


# Fit the encoder on the training labels only and reuse that mapping for the
# test labels. Refitting on the test labels could assign different integers
# (or a different one-hot width) when a class is missing from the test split.
# transform() raises ValueError if the test split holds a label unseen in train.
encoder.fit(train_label)
num_classes = len(encoder.classes_)

train_label = np_utils.to_categorical(encoder.transform(train_label), num_classes=num_classes)
print(train_label.shape, train_label[0])
test_label = np_utils.to_categorical(encoder.transform(test_label), num_classes=num_classes)
print(test_label.shape, test_label[0])


['President Trump changed the name of Black History Month to African-American History Month.'] (1244, 1)
1
(312, 1)
(1244, 3) [0. 1. 0.]
(312, 3) [0. 1. 0.]


Using TensorFlow backend.


# Tokenize

Next, tokenize our text to create `input_ids`, `input_masks`, and `segment_ids`

In [5]:
class PaddingInputExample(object):
    """Fake example so the num input examples is a multiple of the batch size.

    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it
    means the entire output data won't be generated.

    We use this class instead of `None` because treating `None` as padding
    batches could cause silent errors.
    """

class InputExample(object):
    """Container for one example of a simple sequence-classification task."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of one example.

        Args:
          guid: Unique id for the example.
          text_a: Untokenized text of the first sequence. For single-sequence
            tasks this is the only text that has to be given.
          text_b: (Optional) Untokenized text of the second sequence; only
            needed for sequence-pair tasks.
          label: (Optional) Label of the example. Expected for train and dev
            examples, not for test examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

def create_tokenizer_from_hub_module():
    """Build a FullTokenizer from the vocab file and casing flag of the Hub module.

    The module at the notebook-level `bert_path` is queried through its
    "tokenization_info" signature and the two values are evaluated in the
    notebook-level session `sess`.
    """
    module = hub.Module(bert_path)
    info = module(signature="tokenization_info", as_dict=True)

    # Evaluate both tensors in a single session call.
    fetches = [info["vocab_file"], info["do_lower_case"]]
    vocab_file, do_lower_case = sess.run(fetches)

    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

def convert_single_example(tokenizer, example, max_seq_length=256):
    """Turn one example into fixed-length BERT inputs.

    Args:
        tokenizer: object providing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)`.
        example: an `InputExample` (only `text_a` and `label` are read) or a
            `PaddingInputExample`.
        max_seq_length: length of each returned id/mask/segment list.

    Returns:
        A tuple `(input_ids, input_mask, segment_ids, label)`. For a
        `PaddingInputExample` all three lists are zeros and the label is 0.
    """
    if isinstance(example, PaddingInputExample):
        return (
            [0] * max_seq_length,
            [0] * max_seq_length,
            [0] * max_seq_length,
            0,
        )

    # Two positions are reserved for the [CLS] and [SEP] markers.
    text_tokens = tokenizer.tokenize(example.text_a)[: max_seq_length - 2]

    tokens = ["[CLS]"] + list(text_tokens) + ["[SEP]"]
    # Single-sequence input: every token belongs to segment 0.
    segment_ids = [0] * len(tokens)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Mask is 1 for real tokens and 0 for padding, so only real tokens are
    # attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad all three lists up to the sequence length.
    pad_len = max(0, max_seq_length - len(input_ids))
    input_ids.extend([0] * pad_len)
    input_mask.extend([0] * pad_len)
    segment_ids.extend([0] * pad_len)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label

def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Convert a sequence of examples into four stacked numpy arrays.

    Each example is passed through `convert_single_example`; a progress bar is
    shown while iterating.

    Returns:
        A tuple `(input_ids, input_masks, segment_ids, labels)` of numpy
        arrays, one row per example. Labels are stacked as they are, without
        reshaping.
    """
    converted = [
        convert_single_example(tokenizer, example, max_seq_length)
        for example in tqdm(examples, desc="Converting examples to features")
    ]
    return (
        np.array([row[0] for row in converted]),
        np.array([row[1] for row in converted]),
        np.array([row[2] for row in converted]),
        np.array([row[3] for row in converted]),
    )

def convert_text_to_examples(texts, labels):
    """Pair each text with its label and wrap them in `InputExample` objects.

    Every item of `texts` is itself an iterable of strings; its parts are
    joined with single spaces to form `text_a`. `guid` and `text_b` are left
    as None.
    """
    return [
        InputExample(guid=None, text_a=" ".join(text), text_b=None, label=label)
        for text, label in zip(texts, labels)
    ]



In [6]:
# Instantiate tokenizer
tokenizer = create_tokenizer_from_hub_module()

# Wrap the text arrays and one-hot labels in InputExample objects.
train_examples = convert_text_to_examples(train_text, train_label)
test_examples = convert_text_to_examples(test_text, test_label)

# Convert the training examples to padded id/mask/segment arrays and report their shapes.
train_input_ids, train_input_masks, train_segment_ids, train_labels = convert_examples_to_features(
    tokenizer, train_examples, max_seq_length=max_seq_length
)
for _name, _array in (
    ('train_input_ids', train_input_ids),
    ('train_input_masks', train_input_masks),
    ('train_segment_ids', train_segment_ids),
    ('train_labels', train_labels),
):
    print(_name, _array.shape)

# Same conversion and report for the test examples.
test_input_ids, test_input_masks, test_segment_ids, test_labels = convert_examples_to_features(
    tokenizer, test_examples, max_seq_length=max_seq_length
)
for _name, _array in (
    ('test_input_ids', test_input_ids),
    ('test_input_masks', test_input_masks),
    ('test_segment_ids', test_segment_ids),
    ('test_labels', test_labels),
):
    print(_name, _array.shape)

Converting examples to features: 100%|██████████| 1244/1244 [00:00<00:00, 2838.89it/s]
Converting examples to features: 100%|██████████| 312/312 [00:00<00:00, 2825.12it/s]

train_input_ids (1244, 256)
train_input_masks (1244, 256)
train_segment_ids (1244, 256)
train_labels (1244, 3)
test_input_ids (312, 256)
test_input_masks (312, 256)
test_segment_ids (312, 256)
test_labels (312, 3)





In [8]:
# Inspect the converted features of the training example at index 1:
# token ids, attention mask, segment ids and its one-hot label.
print('train_input_ids', train_input_ids[1])
print('train_input_masks', train_input_masks[1])
print('train_segment_ids', train_segment_ids[1])
print('train_labels', train_labels[1])

train_input_ids [  101  1037  9982  3065 15941 12055  2012  1037  3551  2942  2916  2233
  2007  3235  9678  2332  1010  3781  1012  2156  2742  1006  1055  1007
  2106 15941 12055  2428  2233  2013 28112  2000  8482  1029  2111  4025
  2000  2228  2009  1005  1055  2032  1999  2023  6302  1024  5067  3081
  1041  1011  5653  1010  2254  2355   102     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     

In [None]:
class BertLayer(tf.keras.layers.Layer):
    """Keras layer that wraps a TF-Hub BERT module with partial fine-tuning.

    Args:
        n_fine_tune_layers: number of encoder blocks, counted down from
            ``layer_11``, whose variables are registered as trainable.
        pooling: ``"first"`` returns the module's ``pooled_output``;
            ``"mean"`` returns the mask-weighted mean of ``sequence_output``.
        bert_path: TF-Hub handle of the BERT module to load.
        **kwargs: forwarded to ``tf.keras.layers.Layer.__init__``.

    Raises:
        NameError: if `pooling` is neither ``"first"`` nor ``"mean"``.
    """

    def __init__(
        self,
        n_fine_tune_layers=10,
        pooling="first",
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
        **kwargs
    ):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        # Width of the returned vector; reported by compute_output_shape.
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path
        if self.pooling not in ["first", "mean"]:
            raise NameError("Undefined pooling type (must be either first or mean, but is %s" % self.pooling)

        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Load the Hub module and split its variables into trainable / frozen."""
        self.bert = hub.Module(self.bert_path, trainable=self.trainable, name="%s_module" % self.name)

        # Remove unused layers
        trainable_vars = self.bert.variables
        if self.pooling == "first":
            trainable_vars = [var for var in trainable_vars if "/cls/" not in var.name]
            # The pooler feeds `pooled_output`, so it is fine-tuned as well.
            trainable_layers = ["pooler/dense"]

        elif self.pooling == "mean":
            trainable_vars = [
                var
                for var in trainable_vars
                if "/cls/" not in var.name and "/pooler/" not in var.name
            ]
            trainable_layers = []
        else:
            raise NameError("Undefined pooling type (must be either first or mean, but is %s" % self.pooling)

        # Select how many layers to fine tune, counting down from layer_11.
        # The previous pattern "encoder/layer_{str(11 - %s)}" % i produced a
        # literal such as "encoder/layer_{str(11 - 0)}", which matches no
        # variable name, so no encoder block was ever fine-tuned.
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append("encoder/layer_%d" % (11 - i))

        # Update trainable vars to contain only the specified layers
        trainable_vars = [
            var
            for var in trainable_vars
            if any(layer_name in var.name for layer_name in trainable_layers)
        ]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        # Every other module variable is tracked as non-trainable.
        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        """Run BERT on ``[input_ids, input_mask, segment_ids]`` and pool the result."""
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        if self.pooling == "first":
            pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "pooled_output"
            ]
        elif self.pooling == "mean":
            result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "sequence_output"
            ]

            # Average over positions whose mask is 1; the small epsilon avoids
            # a division by zero for an all-zero mask.
            mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
            masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
                    tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
            input_mask = tf.cast(input_mask, tf.float32)
            pooled = masked_reduce_mean(result, input_mask)
        else:
            raise NameError("Undefined pooling type (must be either first or mean, but is %s" % self.pooling)

        return pooled

    def compute_output_shape(self, input_shape):
        """Return ``(input_shape[0], output_size)``."""
        return (input_shape[0], self.output_size)

In [None]:
# Build model
def build_model(max_seq_length, n_classes=3):
    """Build and compile the BERT-based classifier.

    Args:
        max_seq_length: length of the three integer input sequences
            (ids, masks, segment ids).
        n_classes: number of output classes of the softmax layer. Defaults to
            3, the value that used to be hard-coded.

    Returns:
        The compiled ``tf.keras`` model. Its summary is printed as a side
        effect.
    """
    in_id = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids")
    in_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_masks")
    in_segment = tf.keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]

    bert_output = BertLayer(n_fine_tune_layers=3, pooling="first")(bert_inputs)

    dense = tf.keras.layers.Dense(256, activation='relu')(bert_output)
    pred = tf.keras.layers.Dense(n_classes, activation='softmax')(dense)

    model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
    # The head is a softmax over mutually exclusive classes trained on one-hot
    # labels, so the matching loss is categorical cross-entropy. With
    # 'binary_crossentropy' each class is scored as an independent yes/no
    # decision and the reported 'accuracy' becomes per-element binary accuracy.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

def initialize_vars(sess):
    """Run the local, global and table initializers in `sess`, then hand the
    session to the Keras backend."""
    initializer_factories = (
        tf.local_variables_initializer,
        tf.global_variables_initializer,
        tf.tables_initializer,
    )
    # Each initializer op is created and run in turn, in the order listed.
    for make_initializer in initializer_factories:
        sess.run(make_initializer())
    K.set_session(sess)


In [None]:
# Build and compile the classifier for the configured sequence length.
model = build_model(max_seq_length)

# Instantiate variables
initialize_vars(sess)

# Train for a single epoch on the training features. The validation_data
# argument below is commented out, so nothing is validated during fit.
model.fit(
    [train_input_ids, train_input_masks, train_segment_ids], train_labels,
#     validation_data=([test_input_ids, test_input_masks, test_segment_ids], test_labels),
    epochs=1, verbose=1,
    batch_size=32
)

In [None]:
%%time

# Persist the trained model to an HDF5 file.
model.save('BertModel.h5')

# Predict on the first 100 test rows with the in-memory model.
pre_save_preds = model.predict([test_input_ids[0:100], 
                                test_input_masks[0:100], 
                                test_segment_ids[0:100]]
                              ) # predictions before we clear and reload model

# Clear and load model
# The model is rebuilt, its variables re-initialized, and the saved weights
# are loaded back from the file written above.
model = None
model = build_model(max_seq_length)
initialize_vars(sess)
model.load_weights('BertModel.h5')

# NOTE(review): the post-reload comparison below is disabled, so
# `post_save_preds` is never assigned by this cell.
# post_save_preds = model.predict([test_input_ids[0:100], 
#                                 test_input_masks[0:100], 
#                                 test_segment_ids[0:100]]
#                               ) # predictions after we clear and reload model
# all(pre_save_preds == post_save_preds) # Are they the same?

In [None]:
# evaluate the model
# Loss and accuracy of the reloaded model on the full test set.
loss, accuracy = model.evaluate(
    [test_input_ids, test_input_masks, test_segment_ids],
    test_labels,
    verbose=2,
)
print('Accuracy: %f' % (accuracy*100))
print('Loss: %f' % loss)

# `post_save_preds` is not printed here: its only assignment is commented out
# in the save/reload cell, so referencing it raised NameError on a fresh run.


In [None]:
%%time

from sklearn.metrics import f1_score, classification_report

# Model outputs for the whole test set; the next cell reduces each row to a
# class index with argmax.
y_preds = model.predict([test_input_ids, 
                                test_input_masks, 
                                test_segment_ids])



In [None]:
print(y_preds[0], y_preds.shape)
# Collapse the per-class outputs to class indices only while y_preds still has
# one column per class. After the first run y_preds is an (n, 1) index array;
# applying argmax again would silently turn every prediction into class 0.
if y_preds.ndim == 2 and y_preds.shape[1] > 1:
    y_preds = np.argmax(y_preds, axis=1)
print(y_preds, y_preds.shape)
print(test_labels[0], test_labels.shape)
# Results
# The true class index is the position of the 1 in each one-hot label row.
y_true = np.argmax(test_labels, axis=1)
y_preds = y_preds.reshape((y_preds.shape[0], 1))
y_true = y_true.reshape((y_true.shape[0], 1))
print(y_true)
print('sklearn Macro-F1-Score:', f1_score(y_true, y_preds, average='macro'))
print('sklearn Micro-F1-Score:', f1_score(y_true, y_preds, average='micro'))  
print('sklearn weighted-F1-Score:', f1_score(y_true, y_preds, average='weighted'))  
print('sklearn no average-F1-Score:', f1_score(y_true, y_preds, average=None))

print(classification_report(y_true, y_preds))