This is the code and setup used to run the Single Task Models with GloVe Embeddings

In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional 
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from utils.utils import glove_embeddings_whole_training

In [2]:
# Read in the data
olid_training = pd.read_csv('~/Desktop/ANLY590_Final/dataset/olid-training-processed.tsv', sep = '\t')
olid_test_A = pd.read_csv('~/Desktop/ANLY590_Final/dataset/testset-levela-processed.tsv', sep = '\t')
olid_test_B = pd.read_csv('~/Desktop/ANLY590_Final/dataset/testset-levelb-processed.tsv', sep = '\t')
olid_labels_A = pd.read_csv('~/Desktop/ANLY590_Final/dataset/labels-levela.csv', header = None); olid_labels_A.columns = ['id', 'subtask_a']
olid_labels_B = pd.read_csv('~/Desktop/ANLY590_Final/dataset/labels-levelb.csv', header = None); olid_labels_B.columns = ['id', 'subtask_b']

In [4]:
tweets = olid_training['tweet']

X_train = glove_embeddings_whole_training(tweets, './GLOVE/glove.6B.300d.txt')

y_train_a = olid_training['subtask_a']                          # Get Labels
y_train_a = np.where(y_train_a == "OFF", 1, 0)                  # Transform to Binary Labels

tweets = olid_test_A['tweet']
X_valid = glove_embeddings_whole_training(tweets, './GLOVE/glove.6B.300d.txt')

y_valid_a = olid_labels_A['subtask_a']
y_valid_a = np.where(y_valid_a == "OFF", 1, 0)

In [5]:
# Create the models
model_task_A = Sequential()
model_task_A.add(Bidirectional(LSTM(100, return_sequences = True), input_shape = (None, 300)))
model_task_A.add(Dropout(.2))
model_task_A.add(Bidirectional(LSTM(100)))
model_task_A.add(Dropout(.2))
model_task_A.add(Dense(32, activation = 'relu'))
model_task_A.add(Dropout(.2))
model_task_A.add(Dense(16, activation = 'relu'))
model_task_A.add(Dropout(.2))
model_task_A.add(Dense(4, activation = 'relu'))
model_task_A.add(Dropout(.2))
model_task_A.add(Dense(1, activation = 'sigmoid'))

model_task_A.compile(loss = 'binary_crossentropy', optimizer = "adam", metrics = ['accuracy'])
model_task_A.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, None, 200)         320800    
_________________________________________________________________
dropout (Dropout)            (None, None, 200)         0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               240800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                6432      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                5

In [7]:
model_task_A.fit(X_train, y_train_a, 
                 epochs = 1, 
                 batch_size = 32, 
                 validation_data = [X_valid, y_valid_a])

Train on 13240 samples, validate on 860 samples


<tensorflow.python.keras.callbacks.History at 0x7f50147e7c50>

In [8]:
y_pred_a = model_task_A.predict_classes(X_valid)
print(classification_report(y_pred_a, y_valid_a))
print('Task A finished.')

              precision    recall  f1-score   support

           0       0.69      0.84      0.76       503
           1       0.68      0.45      0.54       357

    accuracy                           0.68       860
   macro avg       0.68      0.65      0.65       860
weighted avg       0.68      0.68      0.67       860

Task A finished.


## Note:
These numbers aren't exactly the same as the ones from the poster / paper as this is being reformatted / rerun after the poster was submitted.

In [9]:
# Get the data ready
filter = (olid_training['subtask_b'] == "UNT") | (olid_training['subtask_b'] == "TIN")
task_B_data = olid_training[filter]

tweets = task_B_data['tweet']
X_train = glove_embeddings_whole_training(tweets, './GLOVE/glove.6B.300d.txt')

y_train_b = task_B_data['subtask_b']                            # Get Labels
y_train_b = np.where(y_train_b == "UNT", 1, 0)                  # Transform to Binary Labels

tweets = olid_test_B['tweet']
X_valid = glove_embeddings_whole_training(tweets, './GLOVE/glove.6B.300d.txt')

y_valid_b = olid_labels_B['subtask_b']
y_valid_b = np.where(y_valid_b == "UNT", 1, 0)

In [10]:
# Create the Model
model_task_B = Sequential()
model_task_B.add(Bidirectional(LSTM(100, return_sequences = True), input_shape = (None, 300)))
model_task_B.add(Dropout(.2))
model_task_B.add(Bidirectional(LSTM(100)))
model_task_B.add(Dropout(.2))
model_task_B.add(Dense(32, activation = 'relu'))
model_task_B.add(Dropout(.2))
model_task_B.add(Dense(16, activation = 'relu'))
model_task_B.add(Dropout(.2))
model_task_B.add(Dense(4, activation = 'relu'))
model_task_B.add(Dropout(.2))
model_task_B.add(Dense(1, activation = 'sigmoid'))

model_task_B.compile(loss = 'binary_crossentropy', optimizer = "adam", metrics = ['accuracy'])
model_task_B.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_2 (Bidirection (None, None, 200)         320800    
_________________________________________________________________
dropout_5 (Dropout)          (None, None, 200)         0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 200)               240800    
_________________________________________________________________
dropout_6 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)                6432      
_________________________________________________________________
dropout_7 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 16)               

In [11]:
model_task_B.fit(X_train, y_train_b, 
                 epochs = 5, 
                 batch_size = 32, 
                 validation_data = [X_valid, y_valid_b])

Train on 4400 samples, validate on 240 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f516fea9250>

In [12]:
y_pred_b = model_task_B.predict_classes(X_valid)
print(classification_report(y_pred_b, y_valid_b))
print('Task B finished.')

              precision    recall  f1-score   support

           0       0.96      0.89      0.93       230
           1       0.07      0.20      0.11        10

    accuracy                           0.86       240
   macro avg       0.52      0.55      0.52       240
weighted avg       0.93      0.86      0.89       240

Task B finished.
