In [0]:
import numpy as np
import torch

# Fix every random number generator used in this notebook so runs are repeatable.
seed = 1337
np.random.seed(seed)               # NumPy global RNG
torch.manual_seed(seed)            # PyTorch default generator
torch.cuda.manual_seed_all(seed)   # PyTorch generators on all CUDA devices

# The notebook expects PyTorch 1.0.0 or newer.
print(torch.__version__)

1.5.0+cu101


### Functions for data preparation
Adapted from the FNN-with-embeddings solution script.

Added: sequence padding.

In [0]:
# Functions for data preparation

def sequence_padding(X_data, max_len):
    """
    Pad and truncate the sequences to a fixed length.

    Every sequence in ``X_data`` is brought to exactly ``max_len`` items:
    shorter sequences are filled with 0 at the end, longer sequences are cut
    at the end (``padding="post"``, ``truncating="post"``).

    Args:
        X_data: iterable of sequences of word indices.
        max_len: target length of every returned sequence.

    Returns:
        Whatever ``keras`` ``pad_sequences`` returns for these arguments
        (requested with ``dtype="long"``); expected to be an array with one
        row of length ``max_len`` per input sequence.
    """
    # Imported lazily so Keras is only loaded when padding is actually needed.
    from keras.preprocessing.sequence import pad_sequences

    sentence_sequences = pad_sequences(
        X_data,
        maxlen=max_len,
        dtype="long",
        padding="post",
        truncating="post",
        value=0,
    )
    return sentence_sequences

def get_index(word, word2idx, freeze=False):
    """
    Look up the index of ``word``, optionally growing the vocabulary.

    Known words return their stored index. An unknown word is either added to
    ``word2idx`` with the next free index (``freeze=False``) or mapped to the
    index stored under the special OOV key ``"_UNK"`` (``freeze=True``).
    ``word2idx`` is modified in place when a word is added.
    """
    if word not in word2idx:
        if freeze:
            # Vocabulary is closed: fall back to the out-of-vocabulary entry.
            return word2idx["_UNK"]
        # Vocabulary is open: the current size is the next unused index.
        word2idx[word] = len(word2idx)
    return word2idx[word]


def convert_to_n_hot(X, vocab_size):
    """
    Encode each instance of word indices as an n-hot vector.

    For every instance in ``X`` a zero vector of length ``vocab_size`` is
    created and the positions named by the instance's indices are set to 1.
    Returns the vectors stacked with ``np.array``.
    """
    def _encode(word_indices):
        vector = np.zeros(vocab_size)
        for position in word_indices:
            vector[position] = 1
        return vector

    return np.array([_encode(word_indices) for word_indices in X])


def convert_to_one_hot(Y, label2idx, label_size):
    """
    Encode each label in ``Y`` as a one-hot integer vector.

    ``label2idx`` maps a label to the position that is set to 1 in a zero
    vector of length ``label_size``. Returns the vectors stacked with
    ``np.array``.
    """
    def _encode(label):
        vector = np.zeros(label_size, dtype=int)
        vector[label2idx[label]] = 1
        return vector

    return np.array([_encode(label) for label in Y])


# Format required by PyTorch's cross-entropy loss
def convert_to_index(Y, label2idx, label_size):
    """
    Map each label in ``Y`` to its index in ``label2idx``.

    ``label_size`` is accepted for signature parity with
    ``convert_to_one_hot`` but is not used. Returns the indices as a NumPy
    array.
    """
    return np.array([label2idx[label] for label in Y])


# Format required by PyTorch's nn.Embedding
def convert_to_indices(X, seq_len):
    """
    Copy each instance of word indices into a zero-padded integer row.

    Args:
        X: iterable of sequences of word indices; each must have at most
            ``seq_len`` items (a longer instance raises ``ValueError`` on
            assignment, as before).
        seq_len: length of every output row.

    Returns:
        The rows stacked with ``np.array``; positions past the end of an
        instance hold 0.
    """
    out = []
    for instance in X:
        # Builtin ``int`` replaces ``np.int``: the alias was deprecated in
        # NumPy 1.20 and removed in 1.24, and always meant the builtin int.
        indices = np.zeros(seq_len, dtype=int)
        # The row starts out as zeros, so only the leading part needs writing.
        indices[:len(instance)] = instance
        out.append(indices)
    return np.array(out)
    


def load_data(trainfile, devfile, testfile, embed_dim):
    """
    Load the train/dev/test files and convert them to model-ready arrays.

    Steps: read sentences and labels for each split, build the word
    vocabulary from the training split only, map words to indices, pad or
    truncate every sequence to the longest training sequence, and map labels
    to integer indices.

    Args:
        trainfile: path of the training file.
        devfile: path of the development file.
        testfile: path of the test file.
        embed_dim: not used by this function; kept so existing calls keep
            working.

    Returns:
        ``(X_train, y_train, X_dev, y_dev, X_test, y_test, word2idx, label2idx)``
        where the ``X_*`` values come from ``convert_to_indices``, the ``y_*``
        values from ``convert_to_index``, and the two dicts are the word and
        label vocabularies.

    Raises:
        KeyError: if the dev or test split contains a label that does not
            occur in the training split.
    """
    ### load data
    train_sents, train_y = load_animacy_sentences_and_labels(trainfile)
    dev_sents, dev_y = load_animacy_sentences_and_labels(devfile)
    test_sents, test_y = load_animacy_sentences_and_labels(testfile)

    ### create mapping word to indices
    # NOTE(review): index 0 is shared by the OOV token and by the padding
    # value used in sequence_padding, so unknown dev/test words are
    # indistinguishable from padding downstream.
    word2idx = {"_UNK": 0}  # reserve 0 for OOV

    ### convert training etc data to indices
    # Only the training split may add words; dev/test look up with a frozen vocabulary.
    X_train = [[get_index(w, word2idx) for w in x] for x in train_sents]
    freeze = True
    X_dev = [[get_index(w, word2idx, freeze) for w in x] for x in dev_sents]
    X_test = [[get_index(w, word2idx, freeze) for w in x] for x in test_sents]

    # Get maximum length (taken from the training split only)
    max_len = max([len(sequence) for sequence in X_train])
    print("max sequence length: {}".format(max_len))

    # Pad the sequences
    print("padding sequences..")
    X_train = sequence_padding(X_train, max_len)
    X_dev = sequence_padding(X_dev, max_len)
    X_test = sequence_padding(X_test, max_len)

    print("after word2idx, padding {}".format(X_train[0]))

    print("#vocabulary size: {}".format(len(word2idx)))

    X_train = convert_to_indices(X_train, max_len)
    X_dev = convert_to_indices(X_dev, max_len)
    X_test = convert_to_indices(X_test, max_len)

    print("after conversion {}".format(X_train[0]))

    ### convert labels to indices
    # Sort the label set: iteration order of a set of strings depends on
    # string hashing and can differ between interpreter runs, which would
    # silently swap the label -> index assignment from one run to the next.
    label2idx = {label: i for i, label in enumerate(sorted(set(train_y)))}
    num_labels = len(label2idx.keys())
    print("#Categories: {}, {}".format(label2idx.keys(), label2idx.values()))
    y_train = convert_to_index(train_y, label2idx, num_labels)
    print(train_y[:4], y_train[:4], len(y_train)) # sanity check
    y_dev = convert_to_index(dev_y, label2idx, num_labels)
    y_test = convert_to_index(test_y, label2idx, num_labels)
    print(dev_y[:4], y_dev[:4], len(y_dev)) # sanity check
    print(test_y[0], y_test[0], len(y_test)) # sanity check

    return X_train, y_train, X_dev, y_dev, X_test, y_test, word2idx, label2idx


def load_animacy_sentences_and_labels(datafile, labels_file='data/offenseval2019/labels-levela.csv'):
    """
    Load whitespace-tokenised sentences and their string labels from a data file.

    Despite its name, this function reads the two formats used in this
    notebook:

    * If ``datafile`` contains ``'offenseval'`` in its path, the texts are
      read from ``datafile`` (tab-separated, first row treated as a header,
      columns named id/text) and the labels from ``labels_file``
      (comma-separated, no header, columns named id/label). The label
      ``'OFF'`` becomes ``'1'``, everything else ``'0'``.
    * Otherwise the file is read as tab-separated text with a header line;
      the second and third fields of each row are used as sentence and label.

    Args:
        datafile: path of the file holding the texts.
        labels_file: path of the OffensEval label file; only read in the
            ``'offenseval'`` branch. Defaults to the previously hard-coded path.

    Returns:
        ``(sentences, labels)``: a list of token lists and a list of label
        strings.

    Raises:
        ValueError: in the non-OffensEval branch, if a data row does not
            provide both a second and a third tab-separated field.
    """
    if 'offenseval' in datafile:
        # pandas is only needed for this branch, so import it here.
        import pandas as pd

        # load offenseval2019 testset
        X_data = pd.read_csv(datafile, sep="\t", header=0, names=['id', 'text'])
        # load offenseval2019 labels
        # NOTE(review): texts and labels are paired by row order only; the
        # id columns are not compared.
        data = pd.read_csv(labels_file, sep=",", header=None, names=['id', 'label'])
        sentences = [sentence.split() for sentence in X_data['text']]
        labels = ['1' if label == 'OFF' else '0' for label in data['label']]
    else:
        # load hateval2019 data
        # The context manager closes the file; UTF-8 is requested explicitly
        # so decoding does not depend on the platform default (pandas'
        # read_csv above also defaults to UTF-8).
        with open(datafile, encoding="utf-8") as handle:
            rows = [line.strip().split("\t")[1:3] for line in handle]
        # rows[0] is the header line.
        sentences = [sentence.split() for sentence, label in rows[1:]]
        labels = [label for sentence, label in rows[1:]]

    return sentences, labels

### Read input data
Load the train, dev, and test sets.

In [0]:
## read input data
print("load data..")
embed_dim = 100

# Train and dev splits come from HatEval 2019 (English).
train = 'data/hateval2019/train/public_development_en/train_en.tsv'
dev = 'data/hateval2019/train/public_development_en/dev_en.tsv'

# The test split comes from OffensEval 2019 (level A); the HatEval test set is
# kept below as a commented-out alternative.
test = 'data/offenseval2019/testset-levela.tsv'
# test = 'data/hateval2019/test/reference_test_en/en.tsv'

(X_train, y_train,
 X_dev, y_dev,
 X_test, y_test,
 word2idx, tag2idx) = load_data(train, dev, test, embed_dim)

print("#train instances: {}\n#dev instances: {}\n#test instances: {}".format(len(X_train),len(X_dev), len(X_test)))

# Every split must have exactly one label per instance.
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)
assert len(X_dev) == len(y_dev)

# Sizes needed to build the model.
vocabulary_size = len(word2idx)
num_classes = len(tag2idx)
input_size = len(X_train[0])
print(input_size) # sanity check

load data..
max sequence length: 63
padding sequences..


Using TensorFlow backend.


after word2idx, padding [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
#vocabulary size: 40951
after conversion [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
#Categories: dict_keys(['0', '1']), dict_values([0, 1])
['1', '1', '1', '0'] [1 1 1 0] 9000
['0', '0', '1', '1'] [0 0 1 1] 1000
1 1 860
#train instances: 9000
#dev instances: 1000
#test instances: 860
63


### (minimal) GRU
input -> embedding layer -> ReLU -> GRU (final hidden state) -> ReLU -> output layer -> output (one logit)

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GRU(nn.Module):
    """
    Minimal single-layer, unidirectional GRU classifier producing one logit.

    Pipeline: embedding -> ReLU -> GRU -> ReLU on the final hidden state ->
    linear layer with a single output.

    Args:
        input_dim: number of entries in the embedding table (vocabulary size).
        seq_len: stored on the instance; not used by the layers or ``forward``.
        embed_dim: embedding size, which is also the GRU input size.
        hidden_size: GRU hidden size (default 32, the value that used to be
            hard-coded).
    """

    def __init__(self,
                 input_dim,
                 seq_len,
                 embed_dim,
                 hidden_size=32
                 ):
        # Initialise nn.Module first so attribute registration is fully set up.
        super(GRU, self).__init__()

        self.input_dim = input_dim
        self.embed_dim = embed_dim
        self.seq_len = seq_len
        self.hidden_size = hidden_size

        ### BEGIN Addition of an embedding layer to the network
        # padding_idx=0: index 0 is treated as padding by the embedding layer.
        self.embedding = nn.Embedding(num_embeddings=input_dim,
                                      embedding_dim=embed_dim,
                                      padding_idx=0
                                      )
        ### END
        self.rnn = nn.GRU(
                          input_size=embed_dim,  # one embedding vector per time step
                          hidden_size=hidden_size,
                          num_layers=1,
                          batch_first=True,
                          bidirectional=False,
                          )

        # One layer, one direction: the classifier input is a single hidden state.
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """
        Compute one logit per input sequence.

        Args:
            x: integer tensor of word indices, one row per sequence
                (the GRU is built with ``batch_first=True``).

        Returns:
            The output of the final linear layer (no sigmoid applied).
        """
        ### BEGIN Addition of an embedding layer to the network
        x = self.embedding(x)  # [batch_size, num_tokens_input, emb_dim]

        x = F.relu(x)
        ### END

        # NOTE(review): the GRU runs over every position of the input,
        # including trailing padding, so the final hidden state is taken after
        # the padding steps. Consider packing the sequences - TODO confirm.
        out, x = self.rnn(x)  # x is now the final hidden state
        x = x.squeeze(0)      # drop the leading layers*directions axis (size 1 here)
        x = F.relu(x)
        x = self.fc2(x)

        return x

### (minimal) LSTM
input -> embedding layer -> ReLU -> LSTM (final hidden state) -> ReLU -> output layer -> output (one logit)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTM(nn.Module):
    """
    Minimal single-layer, unidirectional LSTM classifier producing one logit.

    Pipeline: embedding -> ReLU -> LSTM -> ReLU on the final hidden state ->
    linear layer with a single output.

    Args:
        input_dim: number of entries in the embedding table (vocabulary size).
        seq_len: stored on the instance; not used by the layers or ``forward``.
        embed_dim: embedding size, which is also the LSTM input size.
        hidden_size: LSTM hidden size (default 32, the value that used to be
            hard-coded).
    """

    def __init__(self,
                 input_dim,
                 seq_len,
                 embed_dim,
                 hidden_size=32
                 ):
        # Initialise nn.Module first so attribute registration is fully set up.
        super(LSTM, self).__init__()

        self.input_dim = input_dim
        self.embed_dim = embed_dim
        self.seq_len = seq_len
        self.hidden_size = hidden_size

        ### BEGIN Addition of an embedding layer to the network
        # padding_idx=0: index 0 is treated as padding by the embedding layer.
        self.embedding = nn.Embedding(num_embeddings=input_dim,
                                      embedding_dim=embed_dim,
                                      padding_idx=0
                                      )
        ### END
        self.rnn = nn.LSTM(
                          input_size=embed_dim,  # one embedding vector per time step
                          hidden_size=hidden_size,
                          num_layers=1,
                          batch_first=True,
                          bidirectional=False,
                          )

        # One layer, one direction: the classifier input is a single hidden state.
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """
        Compute one logit per input sequence.

        Args:
            x: integer tensor of word indices, one row per sequence
                (the LSTM is built with ``batch_first=True``).

        Returns:
            The output of the final linear layer (no sigmoid applied).
        """
        ### BEGIN Addition of an embedding layer to the network
        x = self.embedding(x)  # [batch_size, num_tokens_input, emb_dim]

        x = F.relu(x)
        ### END

        # NOTE(review): the LSTM runs over every position of the input,
        # including trailing padding, so the final hidden state is taken after
        # the padding steps. Consider packing the sequences - TODO confirm.
        out, (x, c) = self.rnn(x)  # x: final hidden state, c: final cell state (unused)
        x = x.squeeze(0)           # drop the leading layers*directions axis (size 1 here)
        x = F.relu(x)
        x = self.fc2(x)

        return x

### Different accuracy function
for binary labels

In [0]:
def binary_acc(y_pred, y_test):
    """
    Fraction of predictions that match the binary targets.

    ``y_pred`` holds raw logits: they are passed through a sigmoid and
    rounded to 0/1 before being compared element-wise with ``y_test``. The
    number of matches is divided by the size of ``y_test``'s first dimension,
    so both tensors are expected to have the same shape.
    """
    predicted_labels = torch.sigmoid(y_pred).round()
    num_correct = predicted_labels.eq(y_test).sum().float()
    return num_correct / y_test.shape[0]

### Building the model
setting the optimizer, loss function, device

In [0]:
import torch
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, f1_score
import time

print("#build model")
# Swap the comment between the next two lines to train the GRU instead of the LSTM.
# model = GRU(input_dim=vocabulary_size, seq_len=input_size, embed_dim=embed_dim)
model = LSTM(input_dim=vocabulary_size, seq_len=input_size, embed_dim=embed_dim)
print("#Model: {}".format(model))

# The model emits one raw logit per instance, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Device: {}".format(device))

model.to(device)
criterion.to(device)

#build model
#Model: LSTM(
  (embedding): Embedding(40951, 100, padding_idx=0)
  (rnn): LSTM(100, 32, batch_first=True)
  (fc2): Linear(in_features=32, out_features=1, bias=True)
)
Device: cuda


BCEWithLogitsLoss()

### Training the model
setting the number of epochs and the batch size

shuffling the training data each epoch

with validation set accuracy

In [0]:
# Train with mini-batches, evaluate on the dev set after every epoch, keep the
# best model on disk ('model.pt') and stop early when dev accuracy stalls.
print("#Training..")
num_epochs = 100
size_batch = 100
patience = 25       # epochs without dev-accuracy improvement before stopping
early_stopping = 0  # epochs since the last improvement

# Ceiling division: a trailing partial batch is processed instead of being
# dropped (round() could silently skip up to half a batch of samples, which on
# the dev set also made the prediction list shorter than y_dev).
num_batches = (len(X_train) + size_batch - 1) // size_batch
num_batches_dev = (len(X_dev) + size_batch - 1) // size_batch
print("#Batch size: {}, num batches: {}".format(size_batch, num_batches))

# 'epoch_acc' starts below any achievable accuracy so the first epoch always
# becomes the initial best.
best_epoch_acc = {'epoch': '', 'epoch_loss': '', 'epoch_acc': -1.0, 'y_pred': []}

# Work on array views and shuffle through an index permutation, so X_train and
# y_train themselves are never rebound and this cell can be re-run safely.
X_train_arr = np.asarray(X_train)
y_train_arr = np.asarray(y_train)
num_train = len(X_train_arr)

for epoch in range(num_epochs):
    model.train()  # validation below switches to eval mode, so switch back every epoch
    order = np.random.permutation(num_train)  # new sample order each epoch
    start = time.time()
    epoch_loss = 0
    epoch_acc_train = 0
    for batch in range(num_batches):
        batch_idx = order[batch*size_batch:(batch+1)*size_batch]

        X_tensor = torch.tensor(X_train_arr[batch_idx], dtype=torch.long)
        y_tensor = torch.tensor(y_train_arr[batch_idx], dtype=torch.float32)
        X_tensor, y_tensor = X_tensor.to(device), y_tensor.to(device)

        optimizer.zero_grad()

        y_pred = model(X_tensor)
        # Targets get a trailing axis so they match the model's one-logit-per-row output.
        loss = criterion(y_pred, y_tensor.unsqueeze(1))
        acc = binary_acc(y_pred, y_tensor.unsqueeze(1))

        loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller last batch does not skew the epoch average.
        epoch_loss += loss.item() * len(batch_idx)
        epoch_acc_train += acc.item() * len(batch_idx)

    avg_loss = epoch_loss / num_train
    avg_acc_train = epoch_acc_train / num_train
    print("  End epoch {}. Average loss {:.3f}. Average acc {:.3f}. Time {:.2f} s".format(epoch+1, avg_loss,
                                                                                          avg_acc_train, time.time()-start))

    print("  Validation")
    model.eval()
    epoch_acc = 0
    y_pred_list = []
    print("    #num batches dev: {}".format(num_batches_dev))
    with torch.no_grad():
        for batch in range(num_batches_dev):
            # Slicing past the end is safe and yields the shorter final batch.
            X_data_dev = X_dev[batch*size_batch:(batch+1)*size_batch]
            y_data_dev = y_dev[batch*size_batch:(batch+1)*size_batch]
            X_tensor_dev = torch.tensor(X_data_dev, dtype=torch.long)
            y_tensor_dev = torch.tensor(y_data_dev, dtype=torch.int64)
            X_tensor_dev, y_tensor_dev = X_tensor_dev.to(device), y_tensor_dev.to(device)

            y_pred_dev = model(X_tensor_dev)
            acc = binary_acc(y_pred_dev, y_tensor_dev.unsqueeze(1))
            # Logits -> probabilities -> hard 0/1 labels for the report.
            y_pred_dev = torch.round(torch.sigmoid(y_pred_dev))

            epoch_acc += acc.item() * len(X_data_dev)
            y_pred_list.append(y_pred_dev.cpu().numpy())

    dev_acc = epoch_acc / len(X_dev)
    print("    {:.3f}".format(dev_acc))
    # Strict '>' so a plateau counts towards early stopping, matching the
    # "no increase" message below (with '>=' ties reset the counter forever).
    if dev_acc > best_epoch_acc['epoch_acc']:
        # NOTE(review): this pickles the whole model object, which is what the
        # test cell's torch.load expects; saving a state_dict would be more portable.
        torch.save(model, 'model.pt')
        best_epoch_acc['epoch'] = epoch+1
        best_epoch_acc['epoch_loss'] = avg_loss
        best_epoch_acc['epoch_acc'] = dev_acc
        best_epoch_acc['y_pred'] = y_pred_list
        early_stopping = 0
    else:
        early_stopping += 1
        if early_stopping == patience:
            print("[!]Early stopping, no increase in last {} epochs".format(str(early_stopping)))
            break

print("#Highest acc..")
print("  Epoch {}. Average loss {:.3f}. Average acc {:.3f}".format(best_epoch_acc['epoch'], best_epoch_acc['epoch_loss'], best_epoch_acc['epoch_acc']))
print("    {:.3f}".format(best_epoch_acc['epoch_acc']))

print("  #Classification report dev..")
# Flatten the per-batch prediction arrays of the best epoch into one list of labels.
y_preds_dev = [y.squeeze().tolist() for y_batch in best_epoch_acc['y_pred'] for y in y_batch]
print(classification_report(y_dev, y_preds_dev))

#Training..
#Batch size: 100, num batches: 90
  End epoch 1. Average loss 0.684. Average acc 0.580. Time 0.49 s
  Validation
    #num batches dev: 10
    0.573


  "type " + obj.__name__ + ". It won't be checked "


  End epoch 2. Average loss 0.681. Average acc 0.580. Time 0.49 s
  Validation
    #num batches dev: 10
    0.574
  End epoch 3. Average loss 0.681. Average acc 0.580. Time 0.49 s
  Validation
    #num batches dev: 10
    0.574
  End epoch 4. Average loss 0.680. Average acc 0.580. Time 0.48 s
  Validation
    #num batches dev: 10
    0.574
  End epoch 5. Average loss 0.680. Average acc 0.580. Time 0.48 s
  Validation
    #num batches dev: 10
    0.574
  End epoch 6. Average loss 0.680. Average acc 0.580. Time 0.49 s
  Validation
    #num batches dev: 10
    0.574
  End epoch 7. Average loss 0.680. Average acc 0.580. Time 0.48 s
  Validation
    #num batches dev: 10
    0.574
  End epoch 8. Average loss 0.678. Average acc 0.580. Time 0.48 s
  Validation
    #num batches dev: 10
    0.574
  End epoch 9. Average loss 0.677. Average acc 0.580. Time 0.48 s
  Validation
    #num batches dev: 10
    0.574
  End epoch 10. Average loss 0.676. Average acc 0.583. Time 0.48 s
  Validation
    #num

### Testing the model
setting batch size to 1

adding sigmoid and round to get label (RNN output -> 0 or 1)

In [0]:
# Evaluate a saved model on the test set and print a classification report.
import os

print("#Testing..")
# model = torch.load('/content/drive/My Drive/GRUmodel.pt')
drive_checkpoint = '/content/drive/My Drive/LSTMmodel.pt'
# Prefer the Drive copy when it exists; otherwise fall back to 'model.pt', the
# file the training cell saves, so the notebook also runs outside Colab.
model_path = drive_checkpoint if os.path.exists(drive_checkpoint) else 'model.pt'
print("  #loading model from: {}".format(model_path))
# NOTE(review): torch.load unpickles arbitrary objects - only load checkpoints
# you trust. Recent PyTorch releases may also require weights_only=False to
# load a fully pickled model - TODO confirm for the installed version.
# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
model = torch.load(model_path, map_location=device)
model.eval()
size_batch = 1
# Ceiling division so no test instance is skipped if size_batch is raised.
num_batches_test = (len(X_test) + size_batch - 1) // size_batch
print("  #num batches test: {}".format(num_batches_test))
y_pred_list_test = []
with torch.no_grad():
    for batch in range(num_batches_test):
        batch_test_begin = batch*size_batch
        batch_test_end = (batch+1)*(size_batch)
        X_data_test = X_test[batch_test_begin:batch_test_end]
        X_tensor_test = torch.tensor(X_data_test, dtype=torch.int64).to(device)

        # Logits -> probabilities -> hard 0/1 labels.
        y_pred_test = model(X_tensor_test)
        y_pred_test = torch.sigmoid(y_pred_test)
        y_pred_test = torch.round(y_pred_test)

        y_pred_list_test.append(y_pred_test.cpu().numpy())

print("  #Classification report test..")
# Flatten the per-batch prediction arrays into one list of labels.
y_preds_test = [y.squeeze().tolist() for y_batch in y_pred_list_test for y in y_batch]
print(classification_report(y_test, y_preds_test))

#Testing..
  #num batches test: 860
  #Classification report test..
              precision    recall  f1-score   support

           0       0.75      0.81      0.78       620
           1       0.39      0.31      0.34       240

    accuracy                           0.67       860
   macro avg       0.57      0.56      0.56       860
weighted avg       0.65      0.67      0.66       860

