In [1]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pickle
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers



In [7]:
# Restore the three artefacts pickled by an earlier preprocessing step: the
# model inputs, their labels, and the pretrained Word2Vec embedding matrix.
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
with open('Word2Vec_X_data.pkl', 'rb') as x_file:
    X_data = pickle.load(x_file)

with open('Word2Vec_y_data.pkl', 'rb') as y_file:
    y_data = pickle.load(y_file)

with open('Word2Vec_embedding_matrix.pkl', 'rb') as matrix_file:
    embedding_matrix = pickle.load(matrix_file)

In [8]:
# Hold out 20% of the samples for validation. The fixed random_state makes the
# shuffle reproducible, so a Restart & Run All trains and validates on the same
# partition every time instead of a fresh random one.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, random_state=42)

In [9]:
def plot_history(history):
    """Draw training/validation accuracy and loss curves side by side.

    Adapted from
    https://realpython.com/python-keras-text-classification/#what-is-a-word-embedding

    Args:
        history: object exposing a ``history`` dict of per-epoch metric lists
            (e.g. the value returned by ``model.fit``). It must contain
            'loss' and 'val_loss', plus the accuracy series under either the
            short names ('acc' / 'val_acc') or the long names
            ('accuracy' / 'val_accuracy').

    Raises:
        KeyError: if none of the accepted names for a series is present.
    """
    metrics = history.history

    def _series(*candidates):
        # Return the first series found under any of the accepted key names.
        for key in candidates:
            if key in metrics:
                return metrics[key]
        raise KeyError(candidates[0])

    # The accuracy key depends on the Keras version / metric spelling, so
    # accept both instead of failing with KeyError on 'acc'.
    acc = _series('acc', 'accuracy')
    val_acc = _series('val_acc', 'val_accuracy')
    loss = _series('loss')
    val_loss = _series('val_loss')

    # Epochs are numbered from 1 on the x axis.
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))

    # Left panel: accuracy.
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epoch')
    plt.legend()

    # Right panel: loss.
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epoch')
    plt.legend()

In [10]:
# Hyper-parameters for the network defined in the following cell.
# vocab_size and embedding_dim are the two size arguments of the Embedding
# layer; maxlen is passed to it as input_length.
vocab_size, embedding_dim = 28938, 150
# num_filters is not referenced by any cell visible in this notebook section.
maxlen, num_filters = 261741, 32

In [14]:
# Binary classifier on top of the pretrained embedding matrix:
# frozen Embedding -> LSTM(50) -> Dense(50, relu) -> Dense(1, sigmoid).
model_with_embedding = Sequential([
    # trainable=False keeps the pretrained vectors fixed during training.
    layers.Embedding(
        vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=False,
    ),
    layers.LSTM(50),
    # Disabled alternative kept from the original cell:
    # layers.GlobalMaxPooling1D(),
    layers.Dense(50, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Single sigmoid output, so train with binary cross-entropy.
model_with_embedding.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model_with_embedding.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 261741, 150)       4340700   
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                40200     
_________________________________________________________________
dense (Dense)                (None, 50)                2550      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
Total params: 4,383,501
Trainable params: 42,801
Non-trainable params: 4,340,700
_________________________________________________________________


In [None]:
# Train for 5 epochs, one sample per gradient step, and evaluate on the
# held-out split after every epoch. The returned History object is kept so the
# learning curves can be plotted afterwards.
history = model_with_embedding.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=1,
    verbose=1,
)

Train on 623 samples, validate on 156 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/5


In [None]:
# Visualise the accuracy and loss curves recorded during training.
plot_history(history=history)