In [59]:
import numpy as np
import torch
from torchtext.legacy.data import Field, LabelField, TabularDataset, BucketIterator
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset
import spacy
import pandas as pd
from sklearn.model_selection import train_test_split


In [70]:
BATCH_SIZE = 32
SEED = 10
dev = 'cpu'

spacy_en = spacy.load("en_core_web_sm")
tweets = []
classes = []

In [78]:
# Load OLID dataset split it and save it as csv files

tweets = []
classes = []
for line in open("offenseval-training-v1.tsv",'r',encoding='utf-8'):
    line = line.rstrip('\n').split('\t')
    tweets.append(line[1])
    classes.append(int(line[2]=='OFF'))

tweets = tweets[1:]
classes = classes[1:]
tweets_train, tweets_test, y_train, y_test = train_test_split(tweets, classes, test_size=0.2, random_state=42)

df_train = pd.DataFrame({'text': tweets_train, 'label': y_train})
df_test = pd.DataFrame({'text': tweets_test, 'label': y_test})

df_train.to_csv('offenseval_train.csv', index=False)
df_train.to_csv('offenseval_test.csv', index=False)

In [64]:
# Use Fields to create english vocab

def tokenizer(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

ENGLISH = Field(sequential = True, use_vocab = True, tokenize=tokenizer, lower=True)
LABEL =LabelField(dtype=torch.long, batch_first=True, sequential=False)
fields = [('text', ENGLISH), ('label', LABEL)]

In [65]:
train_data, test_data = TabularDataset.splits(
    path = '',
    train='offenseval_train.csv',
    test='offenseval_test.csv',
    format='csv',
    fields=fields,
    skip_header=True,
)

In [79]:
# build vocabularies using training set
ENGLISH.build_vocab(train_data, max_size=10000, min_freq=2)
LABEL.build_vocab(train_data)

In [80]:
#Create train and test iterators to use during the training loop
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    shuffle=True,
    device=dev
    )

In [81]:
#smol test
for batch_no, batch in enumerate(train_iterator):
    text = batch.text
    print(text)
    print(batch.label)

tensor([[   2,    2,    2,  ...,  103,    2,    2],
        [   2,   71,    2,  ...,    2,    2,  202],
        [   2, 1366,    2,  ...,    2,  246,   25],
        ...,
        [   1,    1,    3,  ...,   10,    1,    1],
        [   1,    1,   98,  ...,   51,    1,    1],
        [   1,    1,   24,  ...,    1,    1,    1]])
tensor([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
        1, 0, 0, 1, 1, 1, 0, 1])
tensor([[   2,  284,   10,  ...,    2,    2, 1351],
        [  15,    3, 5132,  ...,    2,    2,    0],
        [3525,   46,  100,  ...,   26,    2,  686],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])
tensor([1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 0, 1, 0, 0, 0])
tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [161, 222,   2,  ...,   2,  45, 268],
        [  6,  34,  18,  ..., 