In [2]:
!wget https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/tools/tokenization.py

--2023-10-03 11:21:48--  https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/tools/tokenization.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16591 (16K) [text/plain]
Saving to: ‘tokenization.py’


2023-10-03 11:21:48 (23.8 MB/s) - ‘tokenization.py’ saved [16591/16591]



In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

import tokenization

In [4]:
def bert_encode(texts, tokenizer, max_len=512):
    """Encode raw strings into the three fixed-length integer inputs fed to BERT.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped as
    ``[CLS] ... [SEP]``, converted to vocabulary ids and right-padded with 0
    up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object providing ``tokenize(text)`` (returns a list of tokens)
            and ``convert_tokens_to_ids(tokens)`` (returns a list of ints).
        max_len: Length of every output row, counting the two special tokens.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of
        shape ``(number of texts, max_len)``. ``masks`` is 1 over real tokens
        and 0 over padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: If ``max_len`` is below 2, which leaves no room for the
            ``[CLS]`` and ``[SEP]`` tokens.
    """
    # With max_len < 2 the pad length would go negative and the rows would come
    # out longer than max_len, so reject it up front.
    if max_len < 2:
        raise ValueError(
            "max_len must be at least 2 to hold [CLS] and [SEP], got %r" % (max_len,)
        )

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Reserve two positions for the special tokens added below.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        # Copy into a new list so the tokenizer's returned object is never mutated.
        tokens = list(tokenizer.convert_tokens_to_ids(input_sequence)) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len

        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)

    def _as_matrix(rows):
        # The reshape is a no-op for non-empty input; for empty input it yields
        # shape (0, max_len) instead of a float array of shape (0,).
        return np.array(rows, dtype=int).reshape(-1, max_len)

    return _as_matrix(all_tokens), _as_matrix(all_masks), _as_matrix(all_segments)

In [5]:
def build_model(bert_layer, max_len=512, learning_rate=1e-5):
    """Build and compile a binary classifier on top of a BERT Keras layer.

    Args:
        bert_layer: Callable Keras layer that takes
            ``[input_word_ids, input_mask, segment_ids]`` and returns two
            outputs; only the second (the per-token sequence output) is used.
        max_len: Sequence length of the three integer inputs. It must match
            the ``max_len`` used when encoding the texts.
        learning_rate: Step size passed to the Adam optimizer.

    Returns:
        A compiled ``Model`` mapping the three inputs to one sigmoid output,
        trained with binary cross-entropy and reporting accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # The first output is discarded; the classifier reads the sequence output.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Position 0 is where bert_encode places the [CLS] token.
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

# Load and Preprocess

- Load BERT from TensorFlow Hub
- Load the CSV files containing the training and test data
- Build the tokenizer from the BERT layer's vocabulary file and lower-casing flag
- Encode the text into token ids, padding masks, and segment ids

In [6]:
%%time
# Load the pretrained BERT encoder from TensorFlow Hub and wrap it as a Keras layer.
# The URL names the uncased English checkpoint tagged L-24 / H-1024 / A-16
# (presumably 24 layers, hidden size 1024, 16 attention heads; confirm on the module page).
# trainable=True so the encoder weights are fine-tuned rather than frozen.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 1min 19s, sys: 11.2 s, total: 1min 30s
Wall time: 1min 33s


In [7]:
# Read the competition CSVs from the Kaggle input mount.
# train: its `text` and `target` columns are used for fitting.
# test: its `text` column is scored after training.
# submission: template frame whose `target` column is overwritten with the predictions.
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

In [8]:
# Build the tokenizer from assets stored on the hub module itself, so the vocabulary
# and casing match the loaded checkpoint. `.numpy()` turns the stored tensors into
# plain Python values (the vocabulary file path and the lower-casing flag).
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [9]:
# Labels for the training rows.
train_labels = train["target"].values

# Encode both splits the same way: each result is a (token ids, masks, segment ids)
# tuple of arrays with rows of length 160, the same max_len passed to build_model.
train_input, test_input = (
    bert_encode(frame["text"].values, tokenizer, max_len=160)
    for frame in (train, test)
)

# Model: Build, Train, Predict, Submit

In [10]:
# Assemble the classifier around the BERT layer. max_len must equal the length used in
# the bert_encode calls (160), because the Input layers are created with shape (max_len,).
model = build_model(bert_layer, max_len=160)
# Print the layer-by-layer summary, including parameter counts.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [11]:
# Write the model to model.h5 only when the validation loss improves on the best seen so far.
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)

# Fine-tune for one epoch. validation_split=0.2 holds out 20% of the training rows
# (1523 of 7613 in the recorded run) to produce the val_loss the checkpoint monitors.
# NOTE(review): Keras documents the held-out slice as the last rows, taken before any
# shuffling; confirm for the TensorFlow version in use.
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=1,
    callbacks=[checkpoint],
    batch_size=16
)

Train on 6090 samples, validate on 1523 samples


In [None]:
# Restore the weights saved by the ModelCheckpoint callback, then score the test set.
# test_pred holds the sigmoid outputs of the final Dense(1) layer
# (presumably shape (n_test, 1); confirm).
model.load_weights('model.h5')
test_pred = model.predict(test_input)

In [None]:
# Round the predicted probabilities to the nearest integer label and store them
# in the submission template.
submission['target'] = test_pred.round().astype(int)
# index=False keeps the DataFrame row index out of the CSV.
submission.to_csv('submission.csv', index=False)

In [71]:
def ex(examples):
    """Classify each text in `examples` with the trained model and print the verdicts.

    The texts are encoded with the notebook's `tokenizer` at max_len=160 and scored by
    the notebook's `model`. Each text is then printed under a `True` heading when its
    rounded prediction is non-zero, and under `Fake` otherwise.
    """
    encoded = bert_encode(examples, tokenizer, max_len=160)
    rounded = model.predict(encoded).round()
    for text, score in zip(examples, rounded):
        if score.item():
            label = 'True'
        else:
            label = 'Fake'
        print('* ', label, ':\n', text, sep='', end='\n\n')

In [72]:
# Hand-written texts for a quick qualitative check of the trained model.
# The commented-out entries are alternative single-line inputs left disabled.
# The active entries are multi-line strings: their embedded newlines and trailing spaces
# are part of the text that gets tokenized, so keep them exactly as written.
examples = [
#     'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
#     'Forest fire near La Ronge Sask. Canada',
#     '13,000 people receive #wildfires evacuation orders in California',
#     'Crying out for more! Set me ablaze',
#     'The BBC Monitoring Forced Catastrophe: El Salvador sends aid packages to displaced people',
#     'I added this cake to my cake list to help prepare for the wedding',
    """MTV crews on motorcycles hooting and a truck in a tree fall off a cliff..??
So you feel guilty when you don't get by because you didn't give your all?""",
    """Obama Declares Disaster for Typhoon-Devastated Saipan""",
    """The Best Moments After Hiroshima: The Great Con Game 
It all started when a young Japanese man named Kenji Higashikawa escaped captivity at Amatsu... via YouTube
Photos: The Sex Workers Forced To Be Hunted The World Over""",
    """A dog attack dog attack that injured a school bus last month has been caught on tape. FOXNewYork
Accident fatality rate ratio ratio ratio bicyclist fatalities percent 
'He is so fast.' I asked what he was"""
]

# Print the predicted label above each example text.
ex(examples)

* Fake:
MTV crews on motorcycles hooting and a truck in a tree fall off a cliff..??
So you feel guilty when you don't get by because you didn't give your all?

* True:
Obama Declares Disaster for Typhoon-Devastated Saipan

* True:
The Best Moments After Hiroshima: The Great Con Game 
It all started when a young Japanese man named Kenji Higashikawa escaped captivity at Amatsu... via YouTube
Photos: The Sex Workers Forced To Be Hunted The World Over

* Fake:
A dog attack dog attack that injured a school bus last month has been caught on tape. FOXNewYork
Accident fatality rate ratio ratio ratio bicyclist fatalities percent 
'He is so fast.' I asked what he was

