In [77]:
import re
import datasets
import itertools
import time
import math
import torch
import torch.nn.functional as F
from torch import nn, Tensor
from nltk import ngrams
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data.dataset import random_split

In [2]:
dataset = datasets.load_dataset('wili_2018')

Reusing dataset wili_2018 (/home/rashed/.cache/huggingface/datasets/wili_2018/WiLI-2018 dataset/1.1.0/78d7fe4a9d0a01168e45657f302c776ee0afc0978d44e2c3759f4c4975b845f5)


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 117500
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 117500
    })
})

In [4]:
dataset.shape

{'train': (117500, 2), 'test': (117500, 2)}

In [5]:
dataset.column_names

{'train': ['sentence', 'label'], 'test': ['sentence', 'label']}

In [7]:
train_dataset = dataset["train"]

In [8]:
train_dataset[0]

{'sentence': 'Klement Gottwaldi surnukeha palsameeriti ning paigutati mausoleumi. Surnukeha oli aga liiga hilja ja oskamatult palsameeritud ning hakkas ilmutama lagunemise tundemärke. 1962. aastal viidi ta surnukeha mausoleumist ära ja kremeeriti. Zlíni linn kandis aastatel 1949–1989 nime Gottwaldov. Ukrainas Harkivi oblastis kandis Zmiivi linn aastatel 1976–1990 nime Gotvald.',
 'label': 112}

In [9]:
# Clean text, tokenize, and create vocabulary from text data
punctuations = [
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '.',
    '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_',
    '`', '{', '|', '}', '~', '»', '«', '“', '”', "-",
]

def clean_text(text):
    text = re.sub(r"[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+", "", text)
    text = re.sub(r"https?://[-_.?&~;+=/#0-9A-Za-z]+", "", text)
    text = re.sub(r"[\+\d]?(\d{2,3}[-\.\s]??\d{2,3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})", "", text)
    text = re.sub(r'[~^0-9]', '', text)
    text = re.sub(r"([" + re.escape("".join(punctuations)) + "])", "", text)
    text = re.sub(r' +', ' ', text)
    return text.strip()

def tokenizer(text, ngram = 3):
    text = clean_text(text)
    return ["".join(k1) for k1 in list(ngrams(text, n=ngram))]

def tokenize_data(example, tokenizer):
    tokens = {'tokens': tokenizer(example['sentence'])}
    return tokens

In [10]:
tokenized_dataset = dataset.map(tokenize_data, remove_columns=['sentence'], fn_kwargs={'tokenizer': tokenizer})

  0%|          | 0/117500 [00:00<?, ?ex/s]

  0%|          | 0/117500 [00:00<?, ?ex/s]

In [12]:
#tokenized_dataset['train'][0]

In [13]:
vocab = build_vocab_from_iterator(tokenized_dataset['train']['tokens'], min_freq=3, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

PAD_IDX = vocab['<pad>']
BOS_IDX = vocab['<bos>']
EOS_IDX = vocab['<eos>']
UNK_IDX = vocab['<unk>']

vocab.set_default_index(UNK_IDX)

In [14]:
len(vocab)

378076

In [15]:
vocab.get_itos()[:10]

['<unk>', '<pad>', '<bos>', '<eos>', 'an ', ' de', 'en ', 'de ', 'na ', 'ng ']

In [16]:
'hello' in vocab

False

In [17]:
'the' in vocab

True

In [None]:
s = '123here `is` https:www.hello.com\ the [55 an] example rashed091nsu@gmail.com'
print(clean_text(s))

here is httpswwwhellocom the an example


In [None]:
tokenizer('123here is the an example')

['her',
 'ere',
 're ',
 'e i',
 ' is',
 'is ',
 's t',
 ' th',
 'the',
 'he ',
 'e a',
 ' an',
 'an ',
 'n e',
 ' ex',
 'exa',
 'xam',
 'amp',
 'mpl',
 'ple']

In [None]:
text_pipeline = lambda text: vocab(list(itertools.chain(['<bos>'], tokenizer(text), ['<eos>'])))

In [None]:
text_pipeline('123here is the an example')

[2,
 699,
 169,
 45,
 170,
 185,
 39,
 460,
 152,
 217,
 52,
 66,
 34,
 4,
 167,
 2051,
 6986,
 7861,
 892,
 2219,
 1806,
 3]

In [57]:
def generate_batch(data):
    batch = []
    labels = []
    seq_list = []
    for example in data:
        tokens = list(itertools.chain(['<bos>'], example['tokens'], ['<eos>']))
        data_tensor = torch.tensor([vocab[token] for token in tokens], dtype=torch.long)
        labels.append(example['label'])
        seq_list.append(len(example['tokens']))
        batch.append(data_tensor)
    batch = pad_sequence(batch, batch_first=True, padding_value=PAD_IDX)
    labels = torch.tensor(labels, dtype=torch.long)
    seq_list = torch.tensor(seq_list, dtype=torch.long)
    return batch, labels, seq_list

In [58]:
train_dataset = tokenized_dataset['train']
test_dataset = tokenized_dataset['test']

In [59]:
num_train = int(len(train_dataset) * 0.75)
split_train, split_valid = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

In [60]:
# Hyperparameters
EPOCHS = 10 # epoch
LR = 5  # learning rate
BATCH_SIZE = 128 # batch size for training
VOCAB_SIZE = len(vocab) # number of vocabulary
EMB_SIZE = 512 # Embedding size
HID_DIM = 512 # Hidden dimension
NUM_LAYERS = 2 # Number of layers
NUM_CLASS = 235 # Number of output classess
DROP_RATE = 0.3

In [61]:
train_dataloader = DataLoader(split_train, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)
valid_dataloader = DataLoader(split_valid, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)

# Model

In [62]:
source, target, seq = next(iter(train_dataloader))

In [63]:
source

tensor([[     2, 103974,  16661,  ...,      1,      1,      1],
        [     2,   5570,  11789,  ...,      1,      1,      1],
        [     2,   2931,    363,  ...,      1,      1,      1],
        ...,
        [     2,  59677,    171,  ...,      1,      1,      1],
        [     2,  37383,  15621,  ...,      1,      1,      1],
        [     2,    826,    350,  ...,      1,      1,      1]])

In [65]:
source.shape

torch.Size([128, 2363])

In [66]:
target

tensor([231, 103, 158, 215,  58, 198,  23,  49,  32, 142, 194,   0,  34, 199,
         96,  32, 148, 124, 230,  43,  99, 156, 229, 174, 209,  46,  97, 227,
        192,  43, 148, 203, 232, 139,  27,  22, 172, 181, 193, 225, 134, 147,
        164, 230, 129, 139, 149, 188, 101, 152, 148,  77,  15,  62, 152,  54,
        222,  80, 225, 126,   1, 226, 163, 201, 129, 218,  46, 107,  21, 203,
        205, 212, 194, 119, 234, 149,  23, 233,  39, 165,  42, 186,  67, 120,
         68, 103, 121, 150,   1, 111,  69, 207, 158, 161,  49, 180,  76, 233,
         12, 114,  85,  48, 145,  27, 194,  29,  78,  67, 118, 234, 181,  53,
        185,  62,   6, 161, 148,  86, 153, 174,  88,  38, 118,  75,  47, 155,
         50, 157])

In [67]:
target.shape

torch.Size([128])

In [68]:
seq

tensor([ 182,  364,  147,  149,  220,  848,  280,  175,  212,  153,  662,  161,
         169,  189,  417,  151,  158,  402,  164,  144,  345,  121,  134,  184,
         144,  448,  270, 1257,  137,  247,  388,  164,  241,  297,  205,  194,
         182,  200,  182,  172,  355, 1067,  508,  145,  238,  263,  154,  521,
         160,  316,  202,  202,  219,  270,  321,  417,  388,  201,  250,  822,
         214,  402,  466,  226,  411, 1080,  171,  321,  603,  887,  301,  146,
         219,  152,  223,  177,  409,  899,  645,  347,  338,  500,  123,  200,
         182, 1764,  661,  139,  150,  208,  344,  599,  379,  271,  381,  125,
         124,  172,  350,  266,  211,  160,  124,  136,  176, 1064,  852,  149,
         500,  267,  164,  650,  154,  201, 2361,  204,  310,  241,  303,  540,
         130,  198,  233,  361,  165,  439,  213,  711])

In [69]:
seq.shape

torch.Size([128])

In [71]:
embed = nn.Embedding(VOCAB_SIZE, EMB_SIZE)
lstm = nn.LSTM(EMB_SIZE, HID_DIM, num_layers=NUM_LAYERS, dropout=DROP_RATE, batch_first=True)
fc = nn.Linear(HID_DIM, NUM_CLASS)
dropout = nn.Dropout(0.3)

In [72]:
embedded_seq_tensor = embed(source)

In [73]:
embedded_seq_tensor

tensor([[[ 0.4033, -0.0790,  1.5855,  ...,  0.1375,  0.7096, -0.6033],
         [-0.6409,  0.7285,  1.3829,  ...,  2.6079, -0.3562, -0.4943],
         [ 1.7493, -1.6336,  2.9278,  ..., -0.2729, -0.2803, -1.6111],
         ...,
         [ 0.6097, -0.0311,  0.3657,  ..., -0.9129,  1.0052, -2.4916],
         [ 0.6097, -0.0311,  0.3657,  ..., -0.9129,  1.0052, -2.4916],
         [ 0.6097, -0.0311,  0.3657,  ..., -0.9129,  1.0052, -2.4916]],

        [[ 0.4033, -0.0790,  1.5855,  ...,  0.1375,  0.7096, -0.6033],
         [-0.5635, -0.6191, -0.2675,  ...,  1.0145, -1.6781,  0.9973],
         [ 0.2728, -0.8690,  0.1779,  ...,  1.3779,  1.5374, -0.6691],
         ...,
         [ 0.6097, -0.0311,  0.3657,  ..., -0.9129,  1.0052, -2.4916],
         [ 0.6097, -0.0311,  0.3657,  ..., -0.9129,  1.0052, -2.4916],
         [ 0.6097, -0.0311,  0.3657,  ..., -0.9129,  1.0052, -2.4916]],

        [[ 0.4033, -0.0790,  1.5855,  ...,  0.1375,  0.7096, -0.6033],
         [ 0.1665, -2.2465,  0.4283,  ...,  1

In [75]:
embedded_seq_tensor.shape

torch.Size([128, 2363, 512])

In [78]:
packed_input = pack_padded_sequence(embedded_seq_tensor, seq, batch_first=True, enforce_sorted=False)

In [79]:
packed_input.data

tensor([[ 0.4033, -0.0790,  1.5855,  ...,  0.1375,  0.7096, -0.6033],
        [ 0.4033, -0.0790,  1.5855,  ...,  0.1375,  0.7096, -0.6033],
        [ 0.4033, -0.0790,  1.5855,  ...,  0.1375,  0.7096, -0.6033],
        ...,
        [-0.4241, -0.0841, -1.9274,  ..., -1.4556,  1.2830,  1.3497],
        [ 0.9698, -0.0978, -0.1560,  ..., -0.3862, -0.4384, -0.9821],
        [-0.0101,  2.1456, -0.3731,  ...,  0.5410, -0.5722,  2.7503]],
       grad_fn=<PackPaddedSequenceBackward0>)

In [81]:
packed_input.batch_sizes

tensor([128, 128, 128,  ...,   1,   1,   1])

In [94]:
lstm_output, (hidden, cell) = lstm(packed_input)

In [100]:
lstm_output.data

tensor([[-0.0130,  0.0170,  0.0042,  ..., -0.0211, -0.0070,  0.0016],
        [-0.0279, -0.0022,  0.0054,  ..., -0.0298, -0.0022,  0.0237],
        [-0.0143, -0.0044, -0.0220,  ..., -0.0179,  0.0070,  0.0218],
        ...,
        [-0.0629, -0.0128,  0.0307,  ..., -0.0618, -0.0635, -0.0100],
        [-0.0399, -0.0089,  0.0237,  ..., -0.0756, -0.0470,  0.0027],
        [-0.0161, -0.0304, -0.0073,  ..., -0.0800, -0.0491,  0.0042]],
       grad_fn=<CatBackward0>)

In [96]:
lstm_output.batch_sizes

tensor([128, 128, 128,  ...,   1,   1,   1])

In [105]:
prediction = fc(hidden[-1])

In [106]:
prediction.shape

torch.Size([128, 235])

In [99]:
prediction.data

tensor([[ 0.0159, -0.0030,  0.0556,  ...,  0.0695, -0.0131,  0.0396],
        [-0.0236, -0.0388,  0.0204,  ...,  0.0830, -0.0170,  0.0089],
        [ 0.0070, -0.0023,  0.0450,  ...,  0.0462,  0.0096,  0.0339],
        ...,
        [-0.0078, -0.0404,  0.0363,  ...,  0.0278,  0.0106,  0.0179],
        [-0.0136, -0.0186,  0.0362,  ...,  0.0460,  0.0274,  0.0020],
        [-0.0267, -0.0139,  0.0236,  ...,  0.0239, -0.0156,  0.0163]])

In [110]:
F.cross_entropy(prediction, target)

tensor(5.4586, grad_fn=<NllLossBackward0>)

In [111]:
prediction.argmax(1)

tensor([128, 190,  60, 232,  37, 202, 148, 210, 130, 177,  47, 228, 155, 142,
        232,  37, 232, 190, 112,  37,  37,  73, 166, 148, 127, 191, 107, 232,
        202,  37, 166,  37, 173,  11, 185,  89, 155, 228, 215, 228,  37, 148,
        168, 152, 202,  43,  63, 191, 148,  32, 232, 130,  29, 148,   6, 228,
         46, 107, 232, 107, 228, 215, 228, 170,  37, 232, 148, 210, 228, 202,
        219,  73, 202, 148, 144, 190, 127,  24,  24,  37,  55,  32,  37,  66,
         37, 228,  37, 232,  37,  37,  37, 145,  37,  37,  37,  11,  37,  11,
        155, 120, 155, 228,  37, 144,  89,  73,  55, 203, 210,  37, 148, 148,
         11, 219,  37, 190,  32,  24, 190, 120, 202, 220, 145, 232, 130,  37,
         46, 155])

In [112]:
(prediction.argmax(1) == target)

tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False])