In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

In [None]:
# Download the BERT tokenization helper script (tokenization.py) so that it can be imported below.
# NOTE(review): the URL points at the moving `master` branch, so the file may be moved or changed
# upstream — consider pinning to a tag or commit; TODO confirm the path still resolves.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
import tokenization

# Imports the tokenization.py helper module downloaded in the previous cell.

In [None]:
%%time
## loading bert from tensorhub
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [None]:
# Read the vocabulary file path and the lower-casing flag stored with the hub module,
# then build the tokenizer from them so it matches the loaded BERT weights.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab_file,
    do_lower_case=do_lower_case,
)

A [CLS] token is inserted at the beginning of the first sentence and a [SEP] token is inserted at the end of each sentence.

A sentence embedding indicating Sentence A or Sentence B is added to each token. Sentence embeddings are similar in concept to token embeddings with a vocabulary of 2.

A positional embedding is added to each token to indicate its position in the sequence. The concept and implementation of positional embedding are presented in the Transformer paper.


For BERT, every input embedding is a combination of three embeddings:

**Position Embeddings**: BERT learns and uses positional embeddings to express the position of words in a sentence. These are added to overcome a limitation of the Transformer, which, unlike an RNN, is not able to capture “sequence” or “order” information on its own.

**Segment Embeddings**: BERT can also take sentence pairs as inputs for tasks such as question answering. That’s why it learns a unique embedding for the first and the second sentence, which helps the model distinguish between them. In the above example, all the tokens marked as $E_A$ belong to sentence A (and similarly, those marked $E_B$ belong to sentence B).


**Token Embeddings**: These are the embeddings learned for each specific token in the WordPiece token vocabulary.

In [None]:
# Walk through, on a single sentence, how the first BERT input (token ids) is built.
sample = tokenizer.tokenize('THis new technique is State of the art models')

# Wrap the word pieces with the special start / end markers.
input_seq = ["[CLS]", *sample, "[SEP]"]

# Map every token in the list to its vocabulary id.
token = tokenizer.convert_tokens_to_ids(input_seq)

# Right-pad with zeros so that every sequence ends up exactly 512 ids long.
pad_len = 512 - len(token)
token = token + [0 for _ in range(pad_len)]

# Display the padded length; `token` is now the first input for BERT.
len(token)


In [None]:
# Second BERT input: the padding mask.
# It holds 1 for every real token (including [CLS] / [SEP]) and 0 for every padded
# position, so padded and non-padded positions can be told apart — the same idea as
# `mask_zero=True` on an Embedding layer or a Masking layer.
pad_masks = [1 for _ in input_seq] + [0 for _ in range(pad_len)]
pad_masks

In [None]:
# Encode raw texts into the three fixed-length integer inputs that BERT consumes.
def bert_encode(texts, tokenizer, max_len=512):
    """Tokenize ``texts`` and build padded BERT input arrays.

    Each text is tokenized, truncated to at most ``max_len - 2`` tokens (leaving
    room for the two special tokens), wrapped as ``[CLS] ... [SEP]`` and
    right-padded with id 0 up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text)`` (returning a list of
            tokens) and ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every output row. Must be at least 2 so that
            ``[CLS]`` and ``[SEP]`` fit. The original author notes that BERT
            supports a maximum length of 512.

    Returns:
        A tuple ``(token_ids, pad_masks, segment_ids)`` of ``int32`` arrays, each
        of shape ``(number_of_texts, max_len)``. ``pad_masks`` is 1 on real
        tokens and 0 on padding; ``segment_ids`` is all zeros because every
        input is a single sentence.

    Raises:
        ValueError: If ``max_len`` is smaller than 2.
    """
    if max_len < 2:
        # A smaller value would make the truncation slice and the pad length
        # negative and silently yield rows of the wrong length.
        raise ValueError("max_len must be at least 2 to fit [CLS] and [SEP]")

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Keep at most max_len - 2 word pieces so the special tokens still fit.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len

        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)

    def _to_array(rows):
        # Explicit dtype and shape: matches the tf.int32 Input layers used by
        # build_model and keeps the (0, max_len) shape when `texts` is empty.
        return np.asarray(rows, dtype=np.int32).reshape(len(rows), max_len)

    return _to_array(all_tokens), _to_array(all_masks), _to_array(all_segments)

In [None]:
def build_model(bert_layer, max_len=512, learning_rate=2e-6):
    """Build and compile a binary classifier on top of a BERT Keras layer.

    Args:
        bert_layer: Callable Keras layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns two
            outputs, of which the second is used as the per-token sequence
            output (assumes the (pooled, sequence) ordering implied by the
            unpacking below — confirm for other hub module versions).
        max_len: Fixed sequence length of each of the three integer inputs.
        learning_rate: Step size for the Adam optimizer. Defaults to the
            value the notebook originally hard-coded (2e-6).

    Returns:
        A compiled ``Model`` with three int32 inputs of shape ``(max_len,)`` and
        a single sigmoid output, using binary cross-entropy loss and accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Position 0 of every sequence is the [CLS] token that bert_encode prepends;
    # its vector is used as the representation of the whole input.
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
# Load only the columns that are used: text + label for training, text alone for test.
train = pd.read_csv(
    "../input/nlp-with-disaster-tweets-cleaning-data/train_data_cleaning.csv",
    usecols=["text", "target"],
)
test = pd.read_csv(
    "../input/nlp-with-disaster-tweets-cleaning-data/test_data_cleaning.csv",
    usecols=["text"],
)


In [None]:
# Preview the first five training rows.
train.head(n=5)

In [None]:
# Preview the first five test rows.
test.head(n=5)

In [None]:
# Turn both splits into fixed-length (160) BERT inputs and pull out the training labels.
train_input = bert_encode(train["text"].values, tokenizer=tokenizer, max_len=160)
test_input = bert_encode(test["text"].values, tokenizer=tokenizer, max_len=160)
train_labels = train["target"].values

In [None]:
# Inspect the padding mask (second array returned by bert_encode) of the second training row.
train_input[1][1, :]

In [None]:
# Build the classifier with the same sequence length used for encoding, then show its layers.
model = build_model(bert_layer=bert_layer, max_len=160)
model.summary()

In [None]:
# Fine-tune for 3 epochs in batches of 16, holding out 20% of the training data for validation.
train_history = model.fit(
    x=train_input,
    y=train_labels,
    batch_size=16,
    epochs=3,
    validation_split=0.2,
)

# Write the trained model to disk in HDF5 format.
model.save(filepath='model.h5')

In [None]:
# Sigmoid outputs of the model for every encoded test row.
test_pred = model.predict(x=test_input)


In [None]:
# Convert the sigmoid outputs to hard labels: 1 when strictly above 0.5, otherwise 0.
prediction = (test_pred > 0.5).astype(int)

In [None]:
# `prediction` comes from the model's single output unit (one value per row in its own
# column); flatten it to 1-D explicitly instead of relying on pandas' implicit handling
# of a 2-D array when assigning to a single column.
test['prediction'] = np.ravel(prediction)


In [None]:
# Show the test rows whose predicted label is 1.
test.loc[test["prediction"] == 1]