In [3]:
!pip3 install tensorflow_hub

Collecting tensorflow_hub
  Using cached tensorflow_hub-0.9.0-py2.py3-none-any.whl (103 kB)
Installing collected packages: tensorflow-hub
Successfully installed tensorflow-hub-0.9.0


In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import bert
from tensorflow.keras.models import  Model
from tqdm import tqdm
import numpy as np
from collections import namedtuple
print("TensorFlow Version:",tf.__version__)
print("Hub version: ",hub.__version__)

TensorFlow Version: 2.2.0
Hub version:  0.9.0


In [2]:
bert_layer=hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",trainable=True)

In [3]:
MAX_SEQ_LEN=70
input_word_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,
                                    name="segment_ids")

In [4]:
def get_masks(tokens, max_seq_length):
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))
 
def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

In [5]:
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

In [6]:
FullTokenizer= bert.bert_tokenization.FullTokenizer

vocab_file=bert_layer.resolved_object.vocab_file.asset_path.numpy()

do_lower_case=bert_layer.resolved_object.do_lower_case.numpy()

tokenizer=FullTokenizer(vocab_file,do_lower_case)

def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens,)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

In [7]:
def create_single_input(sentence,MAX_LEN):
  
  stokens = tokenizer.tokenize(sentence)
  
  stokens = stokens[:MAX_LEN]
  
  stokens = ["[CLS]"] + stokens + ["[SEP]"]
 
  ids = get_ids(stokens, tokenizer, MAX_SEQ_LEN)
  masks = get_masks(stokens, MAX_SEQ_LEN)
  segments = get_segments(stokens, MAX_SEQ_LEN)
 
  return ids,masks,segments
 
def create_input_array(sentences):
 
  input_ids, input_masks, input_segments = [], [], []
 
  for sentence in tqdm(sentences,position=0, leave=True):
  
    ids,masks,segments=create_single_input(sentence,MAX_SEQ_LEN-2)
 
    input_ids.append(ids)
    input_masks.append(masks)
    input_segments.append(segments)
 
  return [np.asarray(input_ids, dtype=np.int32), 
            np.asarray(input_masks, dtype=np.int32), 
            np.asarray(input_segments, dtype=np.int32)]

In [8]:
print("Vocab size:", len(tokenizer.vocab))

Vocab size: 30522


In [16]:
x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
x = tf.keras.layers.Dropout(0.2)(x)
out = tf.keras.layers.Dense(1, activation="sigmoid", name="dense_output")(x)
 
model = tf.keras.models.Model(
      inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
opt = tf.keras.optimizers.Adam(learning_rate=0.03368973499)
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

In [17]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 70)]         0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 70)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 70)]         0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]           

In [18]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
df = pd.read_json('/home/jovyan/data/train.jsonl', lines=True)
df_dev = pd.read_json('/home/jovyan/data/dev.jsonl', lines=True)
df_test_f = pd.read_json('/home/jovyan/data/test.jsonl', lines=True)
df = df[["label","text"]]
df_dev = df_dev[["label","text"]]
#df = pd.concat([df, df_dev], ignore_index=True)
df[df.label == 1].head(2)

Unnamed: 0,label,text
10,1,jew mad? get fuhrerious!
12,1,brother... a day without a blast is a day wasted


In [19]:
x_train = df["text"]
y_train = df["label"]

x_test = df_dev["text"]
y_test = df_dev["label"]

In [20]:
df.count()
print(x_train.count(), x_test.count())

8500 500


In [21]:
inputs=create_input_array(x_train)

100%|██████████| 8500/8500 [00:03<00:00, 2700.29it/s]


In [22]:
inputs[0].shape

(8500, 70)

In [24]:
model.fit(inputs,y_train,epochs=1, batch_size=64, validation_split=.2, steps_per_epoch=300)





KeyboardInterrupt: 

In [None]:
# Calling `save('my_model.h5')` creates a h5 file `my_model.h5`.
#model.save("bert_h5_model.h5")

# It can be used to reconstruct the model identically.
# model = keras.models.load_model("bert_h5_model.h5")

In [25]:
test_inputs=create_input_array(x_test)
y_pred = model.predict(test_inputs).ravel()

100%|██████████| 500/500 [00:01<00:00, 324.22it/s]


In [27]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_test, y_pred)
auc_keras = auc(fpr_keras, tpr_keras)
print(auc_keras)

0.4963040000000001


In [None]:
# final test data
xtest_f = df_test_f["text"]
xtestf= create_input_array(xtest_f)

y_test_f_pred = model.predict(xtestf).ravel()

100%|██████████| 1000/1000 [00:00<00:00, 3507.05it/s]


In [None]:
from datetime import datetime
df_to_submit = df_test_f[["id"]]
df_to_submit["proba"] = y_test_f_pred
df_to_submit["label"] = df_to_submit['proba'].map(lambda x: 1 if x > 0.5 else 0)
df_to_submit.head()

#print(df_test_f[df_test_f.label == 1])
df_to_submit.to_csv('/home/jovyan/data/csv_to_submit' + datetime.now().strftime("%Y%m%d-%H%M%S") +'.csv', index = False)
