# Train Numberbatch or Glove

In [1]:
import os
import gc
import csv
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU, CuDNNGRU, CuDNNLSTM, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten, SpatialDropout1D
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, TensorBoard, Callback, EarlyStopping
from ast import literal_eval
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
df = pd.read_csv('../data/pandas_data_frame.csv', index_col=0)
all_data = df.where((pd.notnull(df)), '')
all_data['hashtag'] = all_data['hashtag'].apply(literal_eval)

full_text = all_data['tidy_tweet'][(all_data['label']==1.0) | (all_data['label']==0.0)]
y = all_data['label'][(all_data['label']==1.0) | (all_data['label']==0.0)]

In [3]:
tk = Tokenizer(lower=True, filters='')
tk.fit_on_texts(full_text)

train_tokenized = tk.texts_to_sequences(full_text)
max_len = 19
X = pad_sequences(train_tokenized, maxlen=max_len)

x_train, x_val, y_train, y_val = train_test_split(X, y, random_state=1992, test_size=0.2)

print(x_train.shape,y_train.shape)
print(x_val.shape,y_val.shape)

(25569, 19) (25569,)
(6393, 19) (6393,)


In [6]:
embedding_path = '../data/vectors/glove.twitter.27B/glove.twitter.27B.50d.txt'
embed_size = 50

max_features = 30000

def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embedding_index = dict(get_coefs(*o.strip().split(" ")) for o in open(embedding_path))

In [7]:
word_index = tk.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, embed_size))

for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [9]:
import sys
sys.path.append("../") 
from personal_library.sce_keras.loss_functions import f1_loss
from personal_library.sce_keras.metrics_functions import f1


num_classes = 1
batch_size = 32
epochs = 100
learnRate = 0.00001
checkpoint_path = "../model_wehigts/lstm_glove_w.hdf5"
checkpointer = ModelCheckpoint(filepath=checkpoint_path,
                               monitor='val_loss', verbose=1,
                               save_best_only=True, mode='min')
units = 128
kernel_size1 = 4
spatial_dr = 0.5
kernel_size2 = 4
dense_units = 64 
dr=0.2
conv_size=32


inp = Input(shape = (max_len,))
x = Embedding(nb_words+1, embed_size, weights = [embedding_matrix], trainable=False)(inp)
x1 = SpatialDropout1D(spatial_dr)(x)

x_gru = Bidirectional(GRU(units, return_sequences=True))(x1)
x1 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(x_gru)
avg_pool1_gru = GlobalAveragePooling1D()(x1)
max_pool1_gru = GlobalMaxPooling1D()(x1)

x3 = Conv1D(conv_size, kernel_size=kernel_size2, padding='valid', kernel_initializer='he_uniform')(x_gru)
avg_pool3_gru = GlobalAveragePooling1D()(x3)
max_pool3_gru = GlobalMaxPooling1D()(x3)

x_lstm = Bidirectional(LSTM(units, return_sequences=True))(x1)
x1 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(x_lstm)
avg_pool1_lstm = GlobalAveragePooling1D()(x1)
max_pool1_lstm = GlobalMaxPooling1D()(x1)

x3 = Conv1D(conv_size, kernel_size=kernel_size2, padding='valid', kernel_initializer='he_uniform')(x_lstm)
avg_pool3_lstm = GlobalAveragePooling1D()(x3)
max_pool3_lstm = GlobalMaxPooling1D()(x3)


x = concatenate([avg_pool1_gru, max_pool1_gru, avg_pool3_gru, max_pool3_gru,
                avg_pool1_lstm, max_pool1_lstm, avg_pool3_lstm, max_pool3_lstm])
x = BatchNormalization()(x)
x = Dropout(dr)(Dense(dense_units, activation='relu') (x))
x = BatchNormalization()(x)
x = Dropout(dr)(Dense(int(dense_units / 2), activation='relu') (x))
x = Dense(1, activation = "sigmoid")(x)

model = Model(inputs=inp, outputs=x)
model.summary()
adam = Adam(lr=learnRate, beta_1=0.9, beta_2=0.999,
            epsilon=None, decay=1e-6, amsgrad=False)

# 'binary_crossentropy'
model.compile(loss=f1_loss, 
              optimizer='adam', 
              metrics=['accuracy', f1]) 

history = model.fit(x_train, y_train, 
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_val, y_val),
                    callbacks=[checkpointer])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 19)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 19, 50)       758350      input_3[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_2 (SpatialDro (None, 19, 50)       0           embedding_2[0][0]                
__________________________________________________________________________________________________
bidirectional_3 (Bidirectional) (None, 19, 256)      137472      spatial_dropout1d_2[0][0]        
__________________________________________________________________________________________________
conv1d_5 (


Epoch 00009: val_loss did not improve from 0.69094
Epoch 10/10

Epoch 00010: val_loss improved from 0.69094 to 0.68517, saving model to ../model_wehigts/lstm_glove_w.hdf5
