In [13]:
import numpy as np
import tqdm
import torch
from torchtext.legacy.data import Field, LabelField, TabularDataset, BucketIterator
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset
import spacy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


In [14]:
BATCH_SIZE = 128
SEED = 10
dev = 'cuda'

spacy_en = spacy.load("en_core_web_sm")
tweets = []
classes = []

In [15]:
torch.cuda.is_available()

True

In [75]:
# Load OLID dataset split it and save it as csv files

tweets = []
classes = []
for line in open("data/offenseval-training-v1.tsv",'r',encoding='utf-8'):
    line = line.rstrip('\n').split('\t')
    tweets.append(line[1])
    classes.append(int(line[2]=='OFF'))

tweets = tweets[1:]
classes = classes[1:]
tweets_train, tweets_test, y_train, y_test = train_test_split(tweets, classes, test_size=0.2, random_state=42)

df_train = pd.DataFrame({'text': tweets_train, 'label': y_train})
df_test = pd.DataFrame({'text': tweets_test, 'label': y_test})

df_train.to_csv('data/offenseval_train.csv', index=False)
df_test.to_csv('data/offenseval_test.csv', index=False)

In [69]:
# Use Fields to create english vocab

def tokenizer(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

ENGLISH = Field(sequential = True, use_vocab = True, tokenize=tokenizer, lower=True)
LABEL =LabelField(dtype=torch.long, batch_first=True, sequential=False)
fields = [('text', ENGLISH), ('label', LABEL)]

In [70]:
train_data, test_data = TabularDataset.splits(
    path = '',
    train='data/offenseval_train.csv',
    test='data/offenseval_test.csv',
    format='csv',
    fields=fields,
    skip_header=True,
)

In [71]:
# build vocabularies using training set
ENGLISH.build_vocab(train_data, max_size=10000, min_freq=2)
LABEL.build_vocab(train_data)

In [72]:
#Create train and test iterators to use during the training loop
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    shuffle=True,
    device=dev,
    batch_sizes=(128, 64),
    sort = False,
    )

In [73]:
class BasicLSTM(nn.Module):
    def __init__(self, dim_emb = 300, num_words = ENGLISH.vocab.__len__(), hidden_dim = 128, num_layers = 2):
        super(BasicLSTM, self).__init__()
        self.emb = nn.Embedding(num_words, dim_emb, padding_idx=1 )
        self.lstm = nn.LSTM(dim_emb, hidden_dim, num_layers, batch_first = False)
        self.lin = nn.Linear(hidden_dim, 1)

    def forward(self, x):

        X_mask = (torch.where(x==1,0,1))

        emb = self.emb(x)

        X_packed = pack_padded_sequence(emb, X_mask.sum(axis=0).tolist(), batch_first=False,enforce_sorted=False)
        
        lstm_hidden = self.lstm(X_packed)[1][0][-1]

        return torch.sigmoid(self.lin(torch.tanh(lstm_hidden))).view(-1)


def train(model, train_iterator, test_iterator, num_epochs=10):

    ct = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    for epoch in tqdm.notebook.trange(num_epochs, desc="training", unit="epoch"):
        
        
        with tqdm.notebook.tqdm(
                train_iterator,
                desc="epoch {}".format(epoch + 1),
                unit="batch",
                total=len(train_iterator)) as batch_iterator:
            model.train()
            total_loss = 0.0
            
            for i, batch in enumerate(batch_iterator, start=1):

                X = batch.text.cuda()
                y = batch.label.float().cuda()

                model.zero_grad()

                preds = model.forward(X)
                loss = ct(preds,y)
                total_loss += loss.item()
                loss.backward()
                optimizer.step()

                batch_iterator.set_postfix(mean_loss=total_loss / i, current_loss=loss.item())
            
        train_preds = []
        train_y = []
        test_preds = []
        test_y = []

        for batch in train_iterator:
            preds = model.forward(batch.text)
            preds = torch.where(preds >0.5, 1, 0).tolist()
            train_preds += preds 
            train_y += batch.label.tolist()

        for batch in test_iterator:
            preds = model.forward(batch.text)
            #print(preds)
            preds = torch.where(preds >0.5, 1, 0).tolist()
            test_preds += preds 
            test_y += batch.label.tolist()


        train_acc = f1_score(train_y, train_preds)
        print(len(train_y))
        print(len(test_y))
        test_acc = f1_score(test_y, test_preds)  
        print('epoch loss:',total_loss)
        print('epoch train f1:',train_acc)
        print('epoch test f1:',test_acc)



In [74]:
model = BasicLSTM().cuda()
train(model, train_iterator, test_iterator, num_epochs=10)

training:   0%|          | 0/10 [00:00<?, ?epoch/s]

epoch 1:   0%|          | 0/83 [00:00<?, ?batch/s]

10592
2648
epoch loss: 50.48093780875206
epoch train f1: 0.3592924419940271
epoch test f1: 0.2872628726287263


epoch 2:   0%|          | 0/83 [00:00<?, ?batch/s]

10592
2648
epoch loss: 38.54526227712631
epoch train f1: 0.8145432325084021
epoch test f1: 0.5899454875832829


epoch 3:   0%|          | 0/83 [00:00<?, ?batch/s]

10592
2648
epoch loss: 25.509101763367653
epoch train f1: 0.9122302158273381
epoch test f1: 0.6035983749274522


epoch 4:   0%|          | 0/83 [00:00<?, ?batch/s]

10592
2648
epoch loss: 15.615582935512066
epoch train f1: 0.9470110701107011
epoch test f1: 0.5606060606060607


epoch 5:   0%|          | 0/83 [00:00<?, ?batch/s]

10592
2648
epoch loss: 10.36654094234109
epoch train f1: 0.9739455880236073
epoch test f1: 0.5876288659793815


epoch 6:   0%|          | 0/83 [00:00<?, ?batch/s]

10592
2648
epoch loss: 5.607175320386887
epoch train f1: 0.9816234987700766
epoch test f1: 0.5567632850241546


epoch 7:   0%|          | 0/83 [00:00<?, ?batch/s]

10592
2648
epoch loss: 4.970087114721537
epoch train f1: 0.982561463693539
epoch test f1: 0.5717488789237668


epoch 8:   0%|          | 0/83 [00:00<?, ?batch/s]

10592
2648
epoch loss: 4.695627027191222
epoch train f1: 0.9877961234745154
epoch test f1: 0.5610328638497653


epoch 9:   0%|          | 0/83 [00:00<?, ?batch/s]

10592
2648
epoch loss: 2.948254620656371
epoch train f1: 0.9943941354031911
epoch test f1: 0.5524926686217009


epoch 10:   0%|          | 0/83 [00:00<?, ?batch/s]

10592
2648
epoch loss: 1.7566568199545145
epoch train f1: 0.9954022988505747
epoch test f1: 0.5663716814159292
