In [184]:
import io
import re
import torch
import datasets
import itertools 
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import torchtext
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data.dataset import random_split
from collections import Counter
from nltk import ngrams

In [153]:
dataset = datasets.load_dataset('wili_2018')



  0%|          | 0/2 [00:00<?, ?it/s]

In [154]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 117500
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 117500
    })
})

In [155]:
dataset.shape

{'train': (117500, 2), 'test': (117500, 2)}

In [156]:
dataset.column_names

{'train': ['sentence', 'label'], 'test': ['sentence', 'label']}

In [181]:
train_dataset = dataset["train"]

In [158]:
train_dataset[0]

{'sentence': 'Klement Gottwaldi surnukeha palsameeriti ning paigutati mausoleumi. Surnukeha oli aga liiga hilja ja oskamatult palsameeritud ning hakkas ilmutama lagunemise tundemärke. 1962. aastal viidi ta surnukeha mausoleumist ära ja kremeeriti. Zlíni linn kandis aastatel 1949–1989 nime Gottwaldov. Ukrainas Harkivi oblastis kandis Zmiivi linn aastatel 1976–1990 nime Gotvald.',
 'label': 112}

In [159]:
def clean_text(text):
    text = re.sub(r"[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+", "", text)
    text = re.sub(r"https?://[-_.?&~;+=/#0-9A-Za-z]+", "", text)
    text = re.sub(r"[\+\d]?(\d{2,3}[-\.\s]??\d{2,3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})", "", text)
    text = re.sub(r'[~^0-9]', '', text)
    text = re.sub(r"[-()`\"#$!/@%;:\\<>{}`+=~<=>\[\]|.!?,~_]", "", text)
    text = re.sub(r' +', ' ', text)
    return text

def tokenizer(text, ngram = 3):
    text = clean_text(text)
    return ["".join(k1) for k1 in list(ngrams(text, n=ngram))]

def tokenize_data(example, tokenizer):
    tokens = {'tokens': tokenizer(example['sentence'])}
    return tokens

In [160]:
tokenized_dataset = dataset.map(tokenize_data, remove_columns=['sentence'], fn_kwargs={'tokenizer': tokenizer})

  0%|          | 0/117500 [00:00<?, ?ex/s]

  0%|          | 0/117500 [00:00<?, ?ex/s]

In [121]:
# tokenized_dataset['train'][0]

In [161]:
vocab = build_vocab_from_iterator(tokenized_dataset['train']['tokens'], min_freq=3, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

PAD_IDX = vocab['<pad>']
BOS_IDX = vocab['<bos>']
EOS_IDX = vocab['<eos>']
UNK_IDX = vocab['<unk>']

vocab.set_default_index(UNK_IDX)

In [162]:
len(vocab)

384583

In [163]:
vocab.get_itos()[:10]

['<unk>', '<pad>', '<bos>', '<eos>', 'an ', ' de', 'en ', 'de ', 'na ', 'ng ']

In [164]:
'hello' in vocab

False

In [165]:
'the' in vocab

True

In [166]:
s = '123here `is` https:www.hello.com\ the [55 an] example rashed091nsu@gmail.com'
print(clean_text(s))

here is httpswwwhellocom the an example 


In [167]:
tokenizer('123here is the an example')

['her',
 'ere',
 're ',
 'e i',
 ' is',
 'is ',
 's t',
 ' th',
 'the',
 'he ',
 'e a',
 ' an',
 'an ',
 'n e',
 ' ex',
 'exa',
 'xam',
 'amp',
 'mpl',
 'ple']

In [168]:
text_pipeline = lambda text: vocab(list(itertools.chain(['<bos>'], tokenizer(text), ['<eos>'])))

In [169]:
text_pipeline('123here is the an example')

[2,
 700,
 166,
 46,
 170,
 183,
 39,
 496,
 150,
 214,
 50,
 68,
 35,
 4,
 172,
 2042,
 7007,
 7895,
 887,
 2208,
 1802,
 3]

In [185]:
def generate_batch(data):
    batch = []
    labels = []
    for example in data:
        tokens = list(itertools.chain(['<bos>'], example['tokens'], ['<eos>']))
        data_tensor = torch.tensor([vocab[token] for token in tokens], dtype=torch.long)
        labels.append(example['label'])
        batch.append(data_tensor)
    batch = pad_sequence(batch, padding_value=PAD_IDX)
    labels = torch.tensor(labels, dtype=torch.long)
    return batch, labels

In [189]:
train_dataset = tokenized_dataset['train']
test_dataset = tokenized_dataset['test']

In [190]:
num_train = int(len(train_dataset) * 0.75)
split_train, split_valid = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

In [191]:
train_dataloader = DataLoader(split_train, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)
valid_dataloader = DataLoader(split_valid, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)

In [192]:
for _, (source, target) in enumerate(train_dataloader):
    print(source.shape)
    print(len(target))
    break

torch.Size([2142, 128])
128


In [193]:
valid_dataloader[0]

TypeError: 'DataLoader' object is not subscriptable

In [None]:
train_dataloader = DataLoader(tokenized_dataset['train'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)