# Train deep model + own embeddings

In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import (
    Dense,
    LSTM,
    Embedding,
    SpatialDropout1D,
)
from tensorflow.keras.models import (
    Model,
    load_model,
    Sequential
)
from tensorflow.keras.callbacks import ModelCheckpoint
from ast import literal_eval
from sklearn.model_selection import train_test_split

  return f(*args, **kwds)


In [2]:
# Load the pre-processed tweets; the first CSV column is used as the index.
df = pd.read_csv('../data/pandas_data_frame.csv', index_col=0)

# Replace every missing value with '' and turn the text stored in the
# 'hashtag' column back into Python objects via literal_eval.
all_data = df.where(df.notnull(), '')
all_data['hashtag'] = all_data['hashtag'].apply(literal_eval)

# Keep only the rows whose label is the string '1.0' or '0.0'; any other
# value (including the '' that stands in for a missing label) is dropped.
is_labelled = (all_data['label'] == '1.0') | (all_data['label'] == '0.0')
full_text = all_data.loc[is_labelled, 'tidy_tweet']
y = all_data.loc[is_labelled, 'label']

In [3]:
# Build the vocabulary from the labelled tweets. filters='' means the
# tokenizer strips no characters itself; text is only lower-cased.
tk = Tokenizer(filters='', lower=True)
tk.fit_on_texts(full_text)

# Fixed sequence length: the maximum found in the dataset,
# see 1.data_process.ipynb.
max_len = 120
X = pad_sequences(tk.texts_to_sequences(full_text), maxlen=max_len)

# Hold out 20% of the rows for validation with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=1992
)

# Sanity check: features and labels must have matching row counts.
print(*(part.shape for part in (x_train, y_train)))
print(*(part.shape for part in (x_val, y_val)))

(25569, 120) (25569,)
(6393, 120) (6393,)


In [4]:
import pickle

# Persist the fitted tokenizer next to the model weights
# (presumably so the same word index can be reused later — confirm with the
# code that loads it).
tokenizer_path = '../model_wehigts/tokenizer.pickle'
with open(tokenizer_path, 'wb') as tokenizer_file:
    pickle.dump(tk, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
import sys
sys.path.append("../")
from personal_library.sce_keras.loss_functions import f1_loss
from personal_library.sce_keras.metrics_functions import f1
from personal_library.sce_keras.callbacks import (
    LearningRateDecay,
    WarmUpCosineDecayScheduler
)


# --- Training schedule -----------------------------------------------------
epochs = 200
batch_size = 128
learnRate = 0.001
warmup_epoch = 20

# --- Network dimensions ----------------------------------------------------
embed_dim = 150
lstm_out = 200
# X holds the padded token-id sequences, so the largest id + 1 is the number
# of rows the Embedding layer needs.
max_fatures = X.max() + 1

# --- Learning-rate callbacks -----------------------------------------------
# NOTE(review): lrate_decay is created here but is not among the callbacks
# passed to model.fit in this cell — confirm whether it is still needed.
lrate_decay = LearningRateDecay(epochs, learnRate).step_decay

# Project-local scheduler; argument meanings are defined in
# personal_library.sce_keras.callbacks.
warm_up_lr = WarmUpCosineDecayScheduler(
    learning_rate_base=learnRate,
    warmup_learning_rate=0,
    warmup_epoch=warmup_epoch,
    hold_base_rate_steps=5,
    verbose=0,
)

# Options shared by both checkpoint callbacks: watch the validation loss,
# where lower is better.
_checkpoint_common = dict(monitor='val_loss', verbose=2, mode='min')

# NOTE(review): the saved output of this cell mentions 5_w_cos.hdf5 and
# 5_ch_cos.hdf5, so it was produced with different paths than the ones below.

# Best model so far: only written when val_loss improves.
checkpoint_path = "../model_wehigts/5_w.hdf5"
checkpointer = ModelCheckpoint(
    filepath=checkpoint_path,
    save_best_only=True,
    **_checkpoint_common
)

# Rolling checkpoint: written after every epoch.
checkpoint_path1 = "../model_wehigts/5_ch.hdf5"
checkpointer1 = ModelCheckpoint(
    filepath=checkpoint_path1,
    save_best_only=False,
    **_checkpoint_common
)

# Architecture: trainable embedding -> spatial dropout -> LSTM -> one sigmoid
# unit producing the positive-class probability.
model = Sequential([
    # X.shape[1] is the padded sequence length produced by pad_sequences.
    Embedding(max_fatures, embed_dim, input_length=X.shape[1]),
    SpatialDropout1D(0.3),
    LSTM(lstm_out, dropout=0.5, recurrent_dropout=0.5),
    Dense(1, activation='sigmoid'),
])

# f1_loss and f1 are the project-local implementations imported from
# personal_library.sce_keras.
model.compile(
    optimizer='adam',
    loss=f1_loss,
    metrics=['accuracy', f1],
)
model.summary()

# Both checkpoint writers plus the warm-up LR scheduler run during training.
training_callbacks = [checkpointer, checkpointer1, warm_up_lr]

# Train on the 80% split and evaluate on the held-out 20% after each epoch.
history = model.fit(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(x_val, y_val),
    callbacks=training_callbacks,
    verbose=1,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 150)          2994000   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 120, 150)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 200)               280800    
_________________________________________________________________
dense (Dense)                (None, 1)                 201       
Total params: 3,275,001
Trainable params: 3,275,001
Non-trainable params: 0
_________________________________________________________________


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25569 samples, validate on 6393 samples
Epoch 1/200

Epoch 00001: val_loss improved from inf to 0.87591, saving model to ../model_wehigts/5_w_cos.hdf5

Epoch 00001: saving model to ../model_wehigts/5_ch_cos.hdf5
Epoch 2/200

Epoch 00002: val_loss improved from 0.87591 to 0.87288, saving model to ../model_wehigts/5_w_cos.hdf5

Epoch 00002: saving model to ../model_wehigts/5_ch_cos.hdf5
Epoch 3/200

Epoch 00003: val_loss improved from 0.87288 to 0.87241, saving model to ../model_wehigts/5_w_cos.hdf5

Epoch 00003: saving model to ../model_wehigts/5_ch_cos.hdf5
Epoch 4/200

Epoch 00004: val_loss improved from 0.87241 to 0.87198, saving model to ../model_wehigts/5_w_cos.hdf5

Epoch 00004: saving model to ../model_wehigts/5_ch_cos.hdf5
Epoch 5/200

Epoch 00005: val_loss improved from 0.87198 to 0.85390, saving model to ../model_wehigts/5_w_cos.hdf5

Epoch 00005: saving model to ../model_wehigts/5_ch_cos.hdf5
Epoch 6/200

Epoch 00006: val_loss improved from 0.85390 to 0.82827, saving

In [None]:
from sklearn.metrics import f1_score, accuracy_score

# Restore the weights written by the save_best_only checkpoint
# (lowest val_loss seen during training).
model.load_weights(checkpoint_path)

# Predict with the training batch size. The previous batch_size=1 ran one
# forward pass per validation row, which is needlessly slow.
y_prob = model.predict(x_val, batch_size=batch_size)

# Threshold the sigmoid output at 0.5 to get hard 0/1 predictions.
# Kept separate from y_prob so re-running the cell never thresholds twice.
y_pred = np.where(y_prob > 0.5, 1, 0)

# Labels are cast to float so they are comparable with the 0/1 predictions.
y_true = y_val.astype(float)

print("Own emmbeding f1_sklearn: {}".format(f1_score(y_true, y_pred)))
print("Own emmbeding accuracy: {}".format(accuracy_score(y_true, y_pred)))