In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [15]:
import random as python_random
import json
import argparse
import time
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers import Embedding, LSTM, Input, Dropout, BatchNormalization, concatenate, SpatialDropout1D, Reshape
from keras.layers import Dropout, Conv1D, MaxPooling1D, LSTM,Concatenate, Dense, GlobalMaxPooling1D,GlobalAveragePooling1D, Lambda, Bidirectional, GRU 
from keras.initializers import Constant
from sklearn.metrics import accuracy_score,classification_report
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.optimizers import Adam
from keras.models import Model
from tensorflow.keras.layers import TextVectorization
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import tensorflow as tf
from gensim.models import FastText
import warnings
warnings.filterwarnings("ignore")
# Make reproducible as much as possible
np.random.seed(1234)
tf.random.set_seed(1234)
python_random.seed(1234)

In [14]:
tf.test.gpu_device_name()

'/device:GPU:0'

In [51]:

#Read the glove embeddings
def read_embeddings(glove_vec):
    embeddings_index = {}
    with open(glove_vec, 'r', encoding='UTF-8') as f:
        for line in f:
          values = line.split();
          word = values[0]
          coefs = np.asarray(values[1:], dtype='float32')
          embeddings_index[word] = coefs

    print("Found %s word vectors." % len(embeddings_index))

    return embeddings_index


#Create embedding layer
def get_emb_matrix(emb,voc, maxLen,dim):
    '''Get embedding matrix given vocab and the embeddings'''
    num_tokens = len(voc)+1
    # Prepare embedding matrix to the correct size
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in voc.items():
      embedding_vector = emb.get(word)
      if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    embedding_layer = Embedding(input_dim=num_tokens, output_dim=dim, input_length=maxLen,
                                weights=[embedding_matrix], trainable=True)
    # Final matrix with pretrained embeddings that we can feed to embedding layer
    return embedding_layer

In [38]:
def lstm_model(input_shape, embedding_layer):
    adam = Adam(learning_rate=5e-5)

    X_indices = Input(input_shape)

    x = embedding_layer(X_indices)
    x = Bidirectional(LSTM(200, return_sequences=False)) (x)
    x = Dense(1, activation = "sigmoid")(x)

    model = Model(inputs=X_indices, outputs=x)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

In [28]:
def train_model(model, X_train_indices, Y_train_bin, X_dev_indices, Y_dev_bin, epoch_size,
                          batch_size, encoder, output_file):
    verbose = 1
    batch_size = batch_size
    epochs = epoch_size
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', restore_best_weights='True', patience=6)

    model.fit(X_train_indices, Y_train_bin,
              verbose=verbose,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=[callback],
              validation_data=(X_dev_indices, Y_dev_bin))

       
    test_set_predict(model, X_dev_indices, Y_dev_bin, "val", encoder, output_file)
    return model

In [36]:
def test_set_predict(model, X_dev_indices, Y_dev_bin, ident , encoder, output_file):
    '''Do predictions and measure accuracy on our own test set (that we split off train)'''
    # Get predictions using the trained model
    Y_pred = model.predict(X_dev_indices)
    # Finally, convert to numerical labels to get scores with sklearn
    Y_pred = Y_pred>0.5
  
    if output_file:
      pd.DataFrame(Y_pred).to_csv('/content/gdrive/MyDrive/Data/output_lstmdev.csv')
    print(classification_report(Y_dev_bin, Y_pred,target_names= ["OFF",'NOT']))

    print('Accuracy on own {1} set: {0}'.format(round(accuracy_score(Y_dev_bin, Y_pred), 3), ident))

In [55]:
if __name__ == "__main__":

    # Read in the data and embeddings
    
    train = pd.read_csv('/content/gdrive/MyDrive/Data/preprocessed data/processed_train.csv')
    val = pd.read_csv('/content/gdrive/MyDrive/Data/preprocessed data/processed_val.csv')
    test = pd.read_csv('/content/gdrive/MyDrive/Data/preprocessed data/processed_test.csv')
    embeddings = read_embeddings('/content/gdrive/MyDrive/Data/glove.twitter.27B.100d.txt')


    X_test, Y_test = test['preprocessed'], test['task']
    X_train, Y_train = train['preprocessed'], train['task']
    X_dev, Y_dev = val['preprocessed'], val['task']
  
  #declare and initialise values
    tokenizer=Tokenizer()
    epoch_size = 20
    batch_size = 32
    maxLen = 100
    embedding_dim = 100
    lstm_pretrained = True
    output_file = False
    val_set= False
    test_file = True
    
   #converting to index and create embedding matrix
    voc = tokenizer.word_index
    embedding_layer = get_emb_matrix(embeddings,voc, maxLen,embedding_dim)

    #Convert classes to one-hot encoding
    encoder = LabelBinarizer()
    encoder = encoder.fit(Y_train.tolist())
    Y_train_bin = encoder.transform(Y_train.tolist())
    Y_dev_bin = encoder.transform(Y_dev.tolist())
    Y_test_bin = encoder.fit_transform(Y_test)
   

    #convert the X_train, X_dev and X_test to sequences
    X_train_indices = tokenizer.texts_to_sequences(X_train)
    X_train_indices = pad_sequences(X_train_indices, maxlen=maxLen, padding='post')
   
    X_dev_indices = tokenizer.texts_to_sequences(X_dev)
    X_dev_indices = pad_sequences(X_dev_indices, maxlen=maxLen, padding='post')

    X_test_indices = tokenizer.texts_to_sequences(X_test)
    X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')


    filename = "/content/gdrive/MyDrive/Data/lstm"
    if lstm_pretrained:
      model = tf.keras.models.load_model(filename)
      if val_set:
        test_set_predict(model, X_dev_indices, Y_dev_bin, "val", encoder, output_file)
      else:
        test_set_predict(model, X_test_indices, Y_test_bin, "test", encoder, output_file)
    else:
      model = lstm_model(maxLen, embedding_layer)
      model = train_model(model, X_train_indices, Y_train_bin, X_dev_indices, Y_dev_bin, epoch_size,
                          batch_size, encoder, output_file)
      model.save(filename)

    
      if test_file:
        test_set_predict(model, X_test_indices, Y_test_bin, "test",encoder,output_file)


Found 1193514 word vectors.
              precision    recall  f1-score   support

         OFF       0.72      1.00      0.84       620
         NOT       0.00      0.00      0.00       239

    accuracy                           0.72       859
   macro avg       0.36      0.50      0.42       859
weighted avg       0.52      0.72      0.61       859

Accuracy on own test set: 0.722
