In [1]:
import sys
sys.path.append("../") 
import numpy as np
import pandas as pd 
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    Dense,
    Input,
    LSTM,
    Embedding,
    Dropout,
    Activation,
    SpatialDropout1D
)
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from ast import literal_eval
from sklearn.model_selection import train_test_split
from tensorflow.python.keras import backend as K


# Create one explicit TF1-style session and register it with the Keras backend,
# so that the `sess.run(...)` initializer calls made before training operate on
# the same session Keras uses.
# `tf.compat.v1.Session` is the non-deprecated spelling of `tf.Session`.
sess = tf.compat.v1.Session()
K.set_session(sess)

In [2]:
# Read the prepared tweet table; the first CSV column becomes the index.
df = pd.read_csv('../data/pandas_data_frame.csv', index_col=0)

# Replace every missing cell with an empty string, then parse the stringified
# `hashtag` cells back into Python literals.
all_data = df.where(df.notnull(), '')
all_data['hashtag'] = all_data['hashtag'].apply(literal_eval)

# Keep only the rows whose label is the string '1.0' or '0.0'.
is_labelled = (all_data['label'] == '1.0') | (all_data['label'] == '0.0')
full_text = all_data.loc[is_labelled, 'tidy_tweet']
y = all_data.loc[is_labelled, 'label']

In [3]:
# Maximum number of whitespace-separated words kept per tweet.
max_len = 120

# Truncate each tweet to its first `max_len` words and stack the results into
# an (n, 1) object array.
texts = np.array(
    [' '.join(tweet.split()[:max_len]) for tweet in full_text.tolist()],
    dtype=object,
).reshape(-1, 1)

# 80/20 train/validation split with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    texts,
    y,
    test_size=0.2,
    random_state=1992,
)

print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)

(25569, 1) (25569,)
(6393, 1) (6393,)


In [4]:
from personal_library.NLP.tokenizers.bert_token import (
    create_tokenizer_from_hub_module,
    convert_text_to_examples,
    convert_examples_to_features
)

# Instantiate the tokenizer through the project helper.
tokenizer = create_tokenizer_from_hub_module()

# Training split: wrap text/labels as InputExample objects, then convert them
# to the four arrays (ids, masks, segment ids, labels) used for training.
train_examples = convert_text_to_examples(x_train, y_train)
(
    train_input_ids,
    train_input_masks,
    train_segment_ids,
    train_labels,
) = convert_examples_to_features(
    tokenizer,
    train_examples,
    max_seq_length=max_len,
)

# Validation split: same two steps. The `test_*` names are kept because the
# training and evaluation cells refer to them.
test_examples = convert_text_to_examples(x_val, y_val)
(
    test_input_ids,
    test_input_masks,
    test_segment_ids,
    test_labels,
) = convert_examples_to_features(
    tokenizer,
    test_examples,
    max_seq_length=max_len,
)

W0709 16:28:12.116470 140736058794880 deprecation_wrapper.py:119] From ../personal_library/NLP/tokenizers/bert_token.py:16: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

W0709 16:28:20.006681 140736058794880 deprecation_wrapper.py:119] From ../personal_library/NLP/bert/tokenization.py:126: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.



HBox(children=(IntProgress(value=0, description='Converting examples to features', max=25569, style=ProgressSt…




HBox(children=(IntProgress(value=0, description='Converting examples to features', max=6393, style=ProgressSty…




In [7]:
from personal_library.sce_keras.loss_functions import f1_loss
from personal_library.sce_keras.metrics_functions import f1
from personal_library.sce_keras.layers.bert import BertLayer
from personal_library.sce_keras.callbacks import (
    LearningRateDecay,
    WarmUpCosineDecayScheduler
)


# --- Hyper-parameters ---------------------------------------------------------
num_classes = 1
batch_size = 32
epochs = 2
learnRate = 0.001
lstm_out = 200
warmup_epoch = 1

# --- Callbacks ----------------------------------------------------------------
# Learning-rate schedule driven by the project's warm-up/cosine-decay callback.
warm_up_lr = WarmUpCosineDecayScheduler(
    learning_rate_base=learnRate,
    warmup_learning_rate=0,
    warmup_epoch=warmup_epoch,
    hold_base_rate_steps=5,
    verbose=0,
)

# Two checkpoints watching `val_loss`: the first is only overwritten when the
# loss improves, the second is written on every save.
checkpoint_path = "../model_wehigts/9_w.hdf5"
checkpoint_path1 = "../model_wehigts/9_ch.hdf5"
checkpointer = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    mode='min',
    verbose=2,
    save_best_only=True,
)
checkpointer1 = ModelCheckpoint(
    filepath=checkpoint_path1,
    monitor='val_loss',
    mode='min',
    verbose=2,
    save_best_only=False,
)

# --- Model --------------------------------------------------------------------
# Three inputs of length `max_len`: token ids, attention masks, segment ids.
bert_inputs = [
    Input(shape=(max_len,), name=input_name)
    for input_name in ("input_ids", "input_masks", "segment_ids")
]
in_id, in_mask, in_segment = bert_inputs

# BERT layer -> dropout -> 256-unit ReLU layer -> single sigmoid output.
x = BertLayer(n_fine_tune_layers=3, pooling="first")(bert_inputs)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dense(num_classes, activation="sigmoid")(x)

model = Model(inputs=bert_inputs, outputs=x)
model.summary()

# Binary cross-entropy is the active loss; `f1_loss` is imported as an
# alternative but not used here.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', f1],
)

# Run the variable and table initializers in the shared session before fitting.
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

# --- Training -----------------------------------------------------------------
train_inputs = [train_input_ids, train_input_masks, train_segment_ids]
val_inputs = [test_input_ids, test_input_masks, test_segment_ids]

history = model.fit(
    train_inputs,
    train_labels,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(val_inputs, test_labels),
    callbacks=[checkpointer, checkpointer1, warm_up_lr],
)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 120)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 120)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 120)]        0                                            
__________________________________________________________________________________________________
bert_layer_2 (BertLayer)        (None, 768)          110104890   input_ids[0][0]                  
                                                                 input_masks[0][0]          


KeyboardInterrupt



In [None]:
from sklearn.metrics import f1_score, accuracy_score

# Load best model: restore the weights written by the save-best-only checkpoint.
model.load_weights(checkpoint_path)

# The model was built with three inputs (ids, masks, segment ids), so it must be
# fed the tokenised validation arrays; the raw `x_val` text array does not match
# that signature.
val_inputs = [test_input_ids, test_input_masks, test_segment_ids]
y_pred = model.predict(val_inputs, batch_size=batch_size)

# Threshold the sigmoid output at 0.5 and flatten the (n, 1) predictions to
# (n,) so they line up with the 1-D label vector.
y_pred = np.where(y_pred > 0.5, 1, 0).ravel()

# Labels are cast to float so they are numeric when compared with the 0/1 predictions.
y_true = y_val.astype(float)

print("Own emmbeding f1_sklearn: {}".format(f1_score(y_true, y_pred)))
print("Own emmbeding accuracy: {}".format(accuracy_score(y_true, y_pred)))