# Train deep model + own embeddings

In [3]:
import os
import gc
import csv
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import (
    Dense,
    LSTM,
    Embedding,
    SpatialDropout1D,
)
from tensorflow.keras.models import (
    Model,
    load_model,
    Sequential
)
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint, Callback
from ast import literal_eval
from sklearn.model_selection import train_test_split

In [4]:
# Read the tweet table; the first CSV column is used as the row index.
df = pd.read_csv('../data/pandas_data_frame.csv', index_col=0)

# Swap every missing value for an empty string, then turn the serialised
# 'hashtag' cells back into Python objects.
all_data = df.where(df.notnull(), '')
all_data['hashtag'] = all_data['hashtag'].apply(literal_eval)

# Keep only the rows that carry a binary label (0.0 or 1.0); rows whose label
# was missing now hold '' and are excluded by this mask.
has_binary_label = (all_data['label'] == 1.0) | (all_data['label'] == 0.0)
full_text = all_data.loc[has_binary_label, 'tidy_tweet']
y = all_data.loc[has_binary_label, 'label']

In [5]:
# Fit the vocabulary on the labelled tweets. Text is lower-cased and, because
# `filters` is empty, no characters are stripped before splitting.
tk = Tokenizer(filters='', lower=True)
tk.fit_on_texts(full_text)

# Encode each tweet as a list of token ids and bring all of them to a fixed
# length of `max_len` using pad_sequences' default padding/truncation.
max_len = 100
train_tokenized = tk.texts_to_sequences(full_text)
X = pad_sequences(train_tokenized, maxlen=max_len)

# Hold out 20% of the samples for validation; the seed keeps the split stable.
x_train, x_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=1992
)

# Show the shapes of both splits as a quick sanity check.
for split_x, split_y in ((x_train, y_train), (x_val, y_val)):
    print(split_x.shape, split_y.shape)

(25569, 100) (25569,)
(6393, 100) (6393,)


In [13]:
import sys
sys.path.append("../")  # make the project-local `personal_library` package importable
from personal_library.sce_keras.loss_functions import f1_loss
from personal_library.sce_keras.metrics_functions import f1
from personal_library.sce_keras.callbacks import LearningRateDecay


# --- Hyper-parameters -------------------------------------------------------
epochs = 200
batch_size = 128
embed_dim = 150   # width of each learned word vector
lstm_out = 200    # number of LSTM units
# Embedding input size: largest token id in the padded matrix + 1.
max_fatures = X.max() + 1
# Only handed to LearningRateDecay; the optimizer below is created from the
# string 'adam' and therefore starts from its own default settings.
learnRate = 0.001

# NOTE(review): `step_decay` comes from the project-local helper and is passed
# straight into `callbacks`; presumably it is a Keras callback that lowers the
# learning rate during the run -- confirm against personal_library.
lrate_decay = LearningRateDecay(epochs, learnRate).step_decay

# --- Checkpointing ----------------------------------------------------------
checkpoint_path = "../model_wehigts/5_w.hdf5"
checkpoint_path1 = "../model_wehigts/5_ch.hdf5"

# Make sure the target directory exists up front. Without this, a fresh
# checkout only fails when the first checkpoint is written, i.e. after a whole
# epoch of training has already been spent.
for ckpt_file in (checkpoint_path, checkpoint_path1):
    os.makedirs(os.path.dirname(ckpt_file), exist_ok=True)

# Keeps only the weights with the lowest validation loss seen so far.
checkpointer = ModelCheckpoint(filepath=checkpoint_path,
                               monitor='val_loss', verbose=2,
                               save_best_only=True, mode='min')
# Overwrites its file after every epoch, so it always holds the latest state.
checkpointer1 = ModelCheckpoint(filepath=checkpoint_path1,
                               monitor='val_loss', verbose=2,
                               save_best_only=False, mode='min')

# --- Model: trainable embedding -> LSTM -> sigmoid output --------------------
model = Sequential()
model.add(Embedding(max_fatures, embed_dim, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(lstm_out, dropout=0.5, recurrent_dropout=0.5))
model.add(Dense(1,activation='sigmoid'))


# The project-local f1_loss is used as the training objective in place of
# 'binary_crossentropy'; accuracy and the project-local f1 are reported too.
model.compile(loss = f1_loss, #'binary_crossentropy',
              optimizer='adam',
              metrics = ['accuracy', f1])
model.summary()

history = model.fit(x_train, y_train, 
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_val, y_val),
                    callbacks=[checkpointer, checkpointer1, lrate_decay])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 150)          2275050   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 100, 150)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 200)               280800    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 201       
Total params: 2,556,051
Trainable params: 2,556,051
Non-trainable params: 0
_________________________________________________________________
Train on 25569 samples, validate on 6393 samples
Epoch 1/200

Epoch 00001: val_loss improved from inf to 0.84793, saving model to ../model_wehigts/5_w.hdf5

Epoch 00001: saving model to ../model_wehigts/5_ch.hdf5
Epoch 2/200

Epoch 00002: val_loss improved from 0.

In [14]:
from sklearn.metrics import f1_score

# Restore the weights saved by the best-only checkpoint (lowest val_loss).
model.load_weights(checkpoint_path)

# Predict in regular mini-batches: batch_size=1 pushed every validation sample
# through the LSTM on its own, which is needlessly slow and gives no benefit.
y_prob = model.predict(x_val, batch_size=batch_size)

# Threshold the sigmoid outputs at 0.5 and flatten the (N, 1) column into the
# 1-D label vector that f1_score expects.
y_pred = (y_prob > 0.5).astype(int).ravel()

print("Own emmbeding f1_sklearn: {}".format(f1_score(y_val, y_pred)))

Own emmbeding f1_sklearn: 0.5480427046263345
