Forked from:

**Disaster NLP: Keras BERT using TFHub** - https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub

In [None]:
# Download the BERT tokenization helper module; it is imported as `tokenization` below.
# NOTE(review): the URL tracks the `master` branch, so the file may move or change
# upstream — consider pinning it to a tag or commit.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
# Install the deep-learning packages imported by this notebook.
# NOTE(review): versions are unpinned, so a re-run may pick up different releases.
!pip install keras
!pip install tensorflow-hub
!pip install tensorflow 

In [None]:
import tokenization
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report
from keras.callbacks import ModelCheckpoint

import tensorflow as tf
import tensorflow_hub as hub

from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

In [None]:
# Suppress every warning so library notices do not clutter the cell outputs.
warnings.filterwarnings(action="ignore")

# Use the 'fivethirtyeight' matplotlib style sheet for all figures drawn below.
plt.style.use("fivethirtyeight")

In [None]:
# Read the three competition CSV files: the labelled training set, the test set,
# and the sample submission that is later filled in with predictions.
dataset, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/nlp-getting-started/train.csv',
        '../input/nlp-getting-started/test.csv',
        '../input/nlp-getting-started/sample_submission.csv',
    )
)

In [None]:
def plot(history, arr):
    """Draw pairs of training-history curves side by side, one panel per pair.

    Parameters
    ----------
    history : object
        Object exposing a ``history`` mapping from metric name to a sequence of
        values (here: the object returned by ``model.fit``).
    arr : sequence of pairs of str
        Metric-name pairs; each pair is drawn on its own panel, e.g.
        ``[['loss', 'val_loss'], ['accuracy', 'val_accuracy']]``.

    Raises
    ------
    ValueError
        If ``arr`` is empty.
    """
    n_panels = len(arr)
    if n_panels == 0:
        raise ValueError("arr must contain at least one pair of metric names")

    # squeeze=False always yields a 2-D grid of axes, so a single pair works too.
    # The width scales with the panel count (two panels keep the 20x5 size).
    fig, axes = plt.subplots(1, n_panels, figsize=(10 * n_panels, 5), squeeze=False)

    for axis, pair in zip(axes[0], arr):
        first_metric, second_metric = pair[0], pair[1]
        axis.plot(history.history[first_metric])
        axis.plot(history.history[second_metric])
        axis.legend([first_metric, second_metric], fontsize=18)
        # Each curve holds one value per training epoch, so label the axes accordingly.
        axis.set_xlabel('Epoch', fontsize=16)
        axis.set_ylabel(first_metric, fontsize=16)
        axis.set_title(first_metric + ' X ' + second_metric, fontsize=16)

In [None]:
# Load the uncased English BERT encoder from TF Hub (the handle names the
# L-24 / H-1024 / A-16 variant). trainable=False is passed so the layer is
# not fine-tuned.
bert_layer = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/2",
    trainable=False,
)

In [None]:
# Fixed input length: the largest whitespace-separated word count among the
# training texts, plus one.
# NOTE(review): bert_encode truncates on tokenizer output, which may be longer than
# the whitespace word count — confirm this length does not cut off long texts.
max_len = 1 + max(len(entry.split()) for entry in dataset.text)

In [None]:
def bert_encode(texts, tokenizer, max_len=None):
    """Convert raw texts into the three fixed-length integer arrays fed to BERT.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` ... ``[SEP]`` and right-padded with zeros up to ``max_len``.

    Parameters
    ----------
    texts : iterable of str
        Texts to encode.
    tokenizer : object
        Must provide ``tokenize(text)`` returning a list of tokens and
        ``convert_tokens_to_ids(tokens)`` returning a list of ids.
    max_len : int, optional
        Total length of every encoded row, including the two special tokens.
        When ``None`` (the default), the length of the longest tokenized text
        plus two is used, so nothing is truncated.

    Returns
    -------
    tuple of numpy.ndarray
        ``(token_ids, pad_masks, segment_ids)``. The mask is 1 for real tokens
        and 0 for padding; segment ids are all zeros.

    Raises
    ------
    ValueError
        If ``max_len`` is smaller than 2, which leaves no room for the
        ``[CLS]`` and ``[SEP]`` tokens.
    """
    if max_len is not None and max_len < 2:
        raise ValueError("max_len must be at least 2 to hold [CLS] and [SEP]")

    # Tokenize everything first so the padded length can be derived from the
    # data when the caller did not supply one.
    tokenized_texts = [tokenizer.tokenize(text) for text in texts]
    if max_len is None:
        longest = max((len(tokens) for tokens in tokenized_texts), default=0)
        max_len = longest + 2

    all_tokens = []
    all_masks = []
    all_segments = []

    for tokens in tokenized_texts:
        # Reserve two positions for the special tokens.
        input_sequence = ["[CLS]"] + tokens[:max_len - 2] + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence))
        all_tokens.append(token_ids + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [None]:
# Read the tokenizer settings stored on the loaded hub module: the asset path
# of the vocabulary file and the lower-casing flag.
vocab_file, do_lower_case = (
    bert_layer.resolved_object.vocab_file.asset_path.numpy(),
    bert_layer.resolved_object.do_lower_case.numpy(),
)

In [None]:
# Build the tokenizer from the vocabulary and casing settings read from the hub module.
tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab_file,
    do_lower_case=do_lower_case,
)

In [None]:
# Encode both splits with the same tokenizer and sequence length so that they
# match the model's input shape.
train_input = bert_encode(
    texts=dataset.text.values,
    tokenizer=tokenizer,
    max_len=max_len,
)
test_input = bert_encode(
    texts=test.text,
    tokenizer=tokenizer,
    max_len=max_len,
)

# Training labels taken from the 'target' column.
train_labels = dataset.target.values

In [None]:
# One int32 input per array returned by bert_encode, in the same order.
all_inputs = [
    Input(shape=(max_len,), dtype=tf.int32),  # token ids
    Input(shape=(max_len,), dtype=tf.int32),  # padding mask
    Input(shape=(max_len,), dtype=tf.int32),  # segment ids
]

# The layer returns two outputs; only the second (per-position) one is used.
# A descriptive throwaway name avoids shadowing IPython's `__` history variable.
_unused_output, sequence_output = bert_layer(all_inputs)

# Keep only the vector at position 0, where bert_encode places "[CLS]".
x = sequence_output[:, 0, :]
x = Dropout(0.1)(x)
x = Dense(units=32, activation='relu')(x)
# Single sigmoid unit: one probability per input text.
x = Dense(1, activation='sigmoid')(x)

model = Model(all_inputs, outputs=x)
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
model.compile(Adam(learning_rate=0.01),
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

In [None]:
# Save the model to 'model.h5' whenever the validation loss improves.
# The callback comes from tf.keras so that it belongs to the same Keras
# implementation as the model (which is built from tensorflow.keras classes).
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'model.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

# Train for 5 epochs, holding out 20% of the training data for validation.
train_history = model.fit(
    train_input,
    train_labels,
    validation_split=0.2,
    epochs=5,
    callbacks=[checkpoint],
    batch_size=16,
)

In [None]:
# Show training vs. validation curves: loss on one panel, accuracy on the other.
plot(
    history=train_history,
    arr=[
        ['loss', 'val_loss'],
        ['accuracy', 'val_accuracy'],
    ],
)

In [None]:
# Load the weights saved in 'model.h5', the file written by the checkpoint callback.
model.load_weights(filepath='model.h5')

In [None]:
# Round the sigmoid outputs to 0/1 labels. The model's final Dense(1) layer
# yields one column per row, so flatten to 1-D to map onto a single column.
test_predictions = (
    model
    .predict(test_input)
    .round()
    .astype(int)
    .ravel()
)

# Item assignment always writes a DataFrame column. Attribute assignment would
# silently set a plain attribute if 'target' were missing, and warnings are
# suppressed in this notebook.
submission['target'] = test_predictions

submission.to_csv("submission.csv", index=False)

In [None]:
# Bar chart of how many submission rows carry each predicted target value.
plt.figure(figsize=(10, 8))
(
    submission["target"]
    .value_counts()
    .plot(kind="bar")
);

<a href="submission.csv"> Download File </a>