# Train an LSTM tweet classifier on Numberbatch or GloVe embeddings

In [1]:
import numpy as np
import pandas as pd 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    Dense,
    Input,
    LSTM,
    Embedding,
    Dropout,
    Activation,
    SpatialDropout1D
)
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from ast import literal_eval
from sklearn.model_selection import train_test_split

  return f(*args, **kwds)


In [2]:
# Load the pre-processed tweet table; the first CSV column holds the saved index.
df = pd.read_csv('../data/pandas_data_frame.csv', index_col=0)
# Replace every missing value with '' so text columns never contain NaN.
# Side effect: any numeric column that had NaN (e.g. 'label' for unlabelled
# rows) becomes object dtype after this step.
all_data = df.where((pd.notnull(df)), '')
# 'hashtag' was serialised to the CSV as a Python literal; parse it back.
# (presumably a list of hashtags per tweet -- confirm against the CSV)
all_data['hashtag'] = all_data['hashtag'].apply(literal_eval)

# Keep only the rows that carry a binary label (0.0 or 1.0); everything else
# (now '') is treated as unlabelled and excluded from training.
is_labelled = (all_data['label'] == 1.0) | (all_data['label'] == 0.0)

full_text = all_data.loc[is_labelled, 'tidy_tweet']
# Cast back to a numeric dtype: after the '' fill above the column may be
# object dtype, which scikit-learn metrics classify as an "unknown" target type
# and which TensorFlow cannot always convert to a tensor.
y = all_data.loc[is_labelled, 'label'].astype('float32')

In [3]:
# Fit a word-level tokenizer on the labelled tweets. An empty `filters` string
# means no characters are stripped; text is only lower-cased.
tk = Tokenizer(filters='', lower=True)
tk.fit_on_texts(full_text)

# Every tweet becomes a fixed-length row of `max_len` word indices
# (pad_sequences pads/truncates with its default settings).
max_len = 19
train_tokenized = tk.texts_to_sequences(full_text)
X = pad_sequences(train_tokenized, maxlen=max_len)

# Hold out 20% of the labelled tweets for validation; the seed is fixed so the
# split is reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=1992
)

# Report the (features, targets) shapes of each split.
for split_features, split_targets in ((x_train, y_train), (x_val, y_val)):
    print(split_features.shape, split_targets.shape)

(25569, 19) (25569,)
(6393, 19) (6393,)


In [4]:
import pickle

# Persist the fitted tokenizer so the exact same word -> index mapping can be
# reused when the trained model is applied to new text.
tokenizer_path = '../model_wehigts/tokenizer.pickle'
with open(tokenizer_path, 'wb') as tokenizer_file:
    pickle.dump(tk, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

# To restore it later (only unpickle files you created yourself -- pickle can
# execute arbitrary code on load):
# with open('../model_wehigts/tokenizer.pickle', 'rb') as handle:
#     tokenizer = pickle.load(handle)

In [5]:
# Pre-trained word-vector file to load. Swap the two assignments below to
# switch between the GloVe Twitter vectors and ConceptNet Numberbatch.
# embedding_path = '../data/vectors/glove.twitter.27B/glove.twitter.27B.100d.txt'
embedding_path = '../data/vectors/numberbatch-17.06.txt'
# Number of components per word vector; used as the column count of the
# embedding matrix and as the output dimension of the Embedding layer.
# NOTE(review): this must equal the dimension of the vectors stored in
# `embedding_path` -- confirm for the Numberbatch file.
embed_size = 100

# Upper bound on vocabulary size: words whose tokenizer index is >= this value
# get no row in the embedding matrix.
max_features = 30000

def get_coefs(word, *arr):
    """Turn one parsed embedding line into a ``(word, vector)`` pair.

    Parameters
    ----------
    word : str
        First field of the line (the token).
    *arr : str
        Remaining fields, each the text of one vector component.

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a 1-D float32 NumPy array with
        one element per component (empty when no components are given).
    """
    vector = np.array(arr, dtype='float32')
    return word, vector

def _read_embedding_index(path):
    """Read a space-separated word-vector text file into a dict.

    Each line is expected to look like ``<word> <v1> <v2> ...``. The file is
    opened as UTF-8 (instead of the platform default) and closed as soon as it
    has been read. If the first line is a word2vec-style ``<count> <dim>``
    header (two integers), it is skipped; otherwise it would be stored as a
    one-element "vector" that NumPy silently broadcasts across a whole
    embedding row if a token ever matched it.

    Parameters
    ----------
    path : str
        Location of the vector file.

    Returns
    -------
    dict
        Maps each word to the float32 array produced by ``get_coefs``. When a
        word occurs more than once, the last occurrence wins.
    """
    index = {}
    with open(path, encoding='utf-8') as vector_file:
        for line_number, line in enumerate(vector_file):
            fields = line.strip().split(" ")
            is_header = (
                line_number == 0
                and len(fields) == 2
                and fields[0].isdigit()
                and fields[1].isdigit()
            )
            if is_header:
                continue
            word, vector = get_coefs(*fields)
            index[word] = vector
    return index


embedding_index = _read_embedding_index(embedding_path)

In [6]:
# Build the initial weight matrix for the Embedding layer: row `k` holds the
# pre-trained vector of the word whose tokenizer index is `k`.
word_index = tk.word_index
nb_words = min(max_features, len(word_index))
# One extra row because the matrix is indexed directly by tokenizer index
# (presumably indices start at 1, leaving row 0 for padding -- confirm).
embedding_matrix = np.zeros((nb_words + 1, embed_size))

# Words beyond the vocabulary cap, or missing from the pre-trained vectors,
# keep their all-zero row.
# NOTE(review): the tokenizer was created without `num_words`, so sequences can
# contain indices above `nb_words` if the vocabulary ever exceeds
# `max_features`; confirm the vocabulary stays below the cap.
for token, row in word_index.items():
    if row < max_features and token in embedding_index:
        embedding_matrix[row] = embedding_index[token]

In [None]:
import sys
sys.path.append("../") 
from personal_library.sce_keras.loss_functions import f1_loss
from personal_library.sce_keras.metrics_functions import f1
from personal_library.sce_keras.callbacks import (
    LearningRateDecay,
    WarmUpCosineDecayScheduler
)


# --- Hyper-parameters -------------------------------------------------------
num_classes = 1      # not referenced by the visible cells (the output layer hard-codes 1 unit)
batch_size = 32
epochs = 200
learnRate = 0.001    # base learning rate handed to the schedule helpers below
lstm_out = 200       # number of LSTM units
warmup_epoch = 20

# --- Learning-rate schedules (project-local helpers) -------------------------
# `step_decay` is passed to `fit` as a callback further down.
lrate_decay = LearningRateDecay(epochs, learnRate).step_decay
# Constructed here but not included in the callbacks list given to `fit`.
warm_up_lr = WarmUpCosineDecayScheduler(
    learning_rate_base=learnRate,
    warmup_learning_rate=0,
    warmup_epoch=warmup_epoch,
    hold_base_rate_steps=5,
    verbose=0,
)

# --- Checkpoints ------------------------------------------------------------
# `checkpoint_path`  : written only when val_loss improves (best weights).
# `checkpoint_path1` : written after every epoch (latest weights).
checkpoint_path = "../model_wehigts/6_g_w.hdf5"
checkpoint_path1 = "../model_wehigts/6_g_ch.hdf5"
shared_checkpoint_options = dict(monitor='val_loss', verbose=2, mode='min')
checkpointer = ModelCheckpoint(
    filepath=checkpoint_path,
    save_best_only=True,
    **shared_checkpoint_options
)
checkpointer1 = ModelCheckpoint(
    filepath=checkpoint_path1,
    save_best_only=False,
    **shared_checkpoint_options
)

# Network: frozen pre-trained embeddings -> spatial dropout -> LSTM -> a single
# sigmoid unit producing the probability of the positive class.
inp = Input(shape=(max_len,))

layer_stack = [
    # Embedding weights come from `embedding_matrix` and are not trained.
    Embedding(nb_words + 1, embed_size, weights=[embedding_matrix], trainable=False),
    SpatialDropout1D(0.3),
    LSTM(lstm_out, dropout=0.5, recurrent_dropout=0.5),
    Dense(1, activation="sigmoid"),
]

# Apply the layers one after another, starting from the input tensor.
x = inp
for layer in layer_stack:
    x = layer(x)

model = Model(inputs=inp, outputs=x)
model.summary()

# The model is optimised directly on the project's F1-based loss
# (alternative loss kept for reference: 'binary_crossentropy').
# The optimizer is given by name, so `learnRate` reaches training only through
# the learning-rate callback -- TODO confirm that is the intent.
model.compile(
    optimizer='adam',
    loss=f1_loss,
    metrics=['accuracy', f1],
)

# Best-weights checkpoint, every-epoch checkpoint, and the step-decay schedule.
training_callbacks = [checkpointer, checkpointer1, lrate_decay]

history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=training_callbacks,
    verbose=1,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 19)                0         
_________________________________________________________________
embedding (Embedding)        (None, 19, 100)           1516700   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 19, 100)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 200)               240800    
_________________________________________________________________
dense (Dense)                (None, 1)                 201       
Total params: 1,757,701
Trainable params: 241,001
Non-trainable params: 1,516,700
_________________________________________________________________
Train on 25569 samples, validate on 6393 samples
Epoch 1/200

Epoch 00001: val_loss improved from inf to 0.87493, saving model

In [None]:
from sklearn.metrics import f1_score

# Load the best weights (lowest validation loss) saved during training.
model.load_weights(checkpoint_path)

# Predict in batches: batch_size=1 ran one forward pass per validation sample,
# which is needlessly slow for the same predictions.
y_pred = model.predict(x_val, batch_size=batch_size)
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
y_pred = np.where(y_pred > 0.5, 1, 0)

# Force the targets to a plain integer array: an object-dtype label Series is
# classified as an "unknown" target type by scikit-learn and makes f1_score
# raise instead of computing the metric.
y_true = np.asarray(y_val, dtype='int32')

print("Neural Network f1_sklearn: {}".format(f1_score(y_true, y_pred)))