In [292]:
!pip install datasets evaluate --upgrade
!python -m spacy download de_core_news_sm

Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [293]:
import torch
import torch.nn as nn
#import torch.optim as optim
import random
import numpy as np
import spacy
import datasets # Hugging face datasets
from datasets import Dataset as h_dataset
import torchtext
from tqdm import tqdm
from torch.utils.data import DataLoader
seed = 1234
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# Load dataset from Huggingface

In [294]:
 ## load dataset
dataset = datasets.load_dataset("bentrevett/multi30k")

In [295]:
#print('train data length is {}')
print(len(dataset["train"].data['en']))
print(dataset["train"]['en'][:10])
print(dataset["train"]['de'][:10])

29000
['Two young, White males are outside near many bushes.', 'Several men in hard hats are operating a giant pulley system.', 'A little girl climbing into a wooden playhouse.', 'A man in a blue shirt is standing on a ladder cleaning a window.', 'Two men are at the stove preparing food.', 'A man in green holds a guitar while the other man observes his shirt.', 'A man is smiling at a stuffed lion', 'A trendy girl talking on her cellphone while gliding slowly down the street.', 'A woman with a large purse is walking by a gate.', 'Boys dancing on poles in the middle of the night.']
['Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.', 'Mehrere Männer mit Schutzhelmen bedienen ein Antriebsradsystem.', 'Ein kleines Mädchen klettert in ein Spielhaus aus Holz.', 'Ein Mann in einem blauen Hemd steht auf einer Leiter und putzt ein Fenster.', 'Zwei Männer stehen am Herd und bereiten Essen zu.', 'Ein Mann in grün hält eine Gitarre, während der andere Mann sein Hemd ansieht.', 'Ei

In [296]:
    ## load English and German languate with spacy ##
    ### spacy is used to process the whole sentence
    en_nlp = spacy.load("en_core_web_sm")
    de_nlp = spacy.load("de_core_news_sm")

In [297]:
#https://realpython.com/natural-language-processing-spacy-python/
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = en_nlp(text)
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'tell', 'shake', 'turn', 'talk', 'say']


# Tokenizing Process


In [298]:
en_tokens = [token.text for token in en_nlp.tokenizer(str(dataset["train"]["en"][0]))][:max_length]
#en_nlp.tokenizer(str(train_data.data["en"][0]))
print(en_tokens)

['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


In [299]:
def tokenize_sentences(data, en_nlp, de_nlp, lower_ch = True, max_length=1000, sos_token='<sos>', eos_token = '<eos>'):
  new_data_with_tokens = {'en_tokens':[], 'de_tokens':[]}

  for data_en_i, data_de_i in tqdm(zip(data['en'], data['de']), total = len(data['en']), ascii=True, desc = 'number of sentences'):

    en_tokens = [token.text for token in en_nlp.tokenizer(str(data_en_i))][:max_length]
    de_tokens = [token.text for token in de_nlp.tokenizer(str(data_de_i))][:max_length]
    if lower_ch is True:
        en_tokens = [token.lower() for token in en_tokens]
        de_tokens = [token.lower() for token in de_tokens]

    en_tokens = [sos_token] + en_tokens + [eos_token] # append start and end tokens
    de_tokens = [sos_token] + de_tokens + [eos_token]

    new_data_with_tokens['en_tokens'].append(en_tokens)
    new_data_with_tokens['de_tokens'].append(de_tokens)

  return new_data_with_tokens

In [300]:
train_data, test_data, valid_data = dataset["train"], dataset["validation"], dataset['test']
#### update train, test, and valid data with tokenizing sentences ###
nlp_process_tools = {'en_nlp':en_nlp, 'de_nlp':de_nlp} # NLP processing spacy tools
train_data = tokenize_sentences(train_data, **nlp_process_tools)
test_data = tokenize_sentences(test_data, **nlp_process_tools)
valid_data = tokenize_sentences(valid_data, **nlp_process_tools)

number of sentences: 100%|##########| 29000/29000 [00:06<00:00, 4510.63it/s]
number of sentences: 100%|##########| 1014/1014 [00:00<00:00, 3409.88it/s]
number of sentences: 100%|##########| 1000/1000 [00:00<00:00, 3633.18it/s]


In [301]:
print(train_data["en_tokens"][:100])

[['<sos>', 'two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.', '<eos>'], ['<sos>', 'several', 'men', 'in', 'hard', 'hats', 'are', 'operating', 'a', 'giant', 'pulley', 'system', '.', '<eos>'], ['<sos>', 'a', 'little', 'girl', 'climbing', 'into', 'a', 'wooden', 'playhouse', '.', '<eos>'], ['<sos>', 'a', 'man', 'in', 'a', 'blue', 'shirt', 'is', 'standing', 'on', 'a', 'ladder', 'cleaning', 'a', 'window', '.', '<eos>'], ['<sos>', 'two', 'men', 'are', 'at', 'the', 'stove', 'preparing', 'food', '.', '<eos>'], ['<sos>', 'a', 'man', 'in', 'green', 'holds', 'a', 'guitar', 'while', 'the', 'other', 'man', 'observes', 'his', 'shirt', '.', '<eos>'], ['<sos>', 'a', 'man', 'is', 'smiling', 'at', 'a', 'stuffed', 'lion', '<eos>'], ['<sos>', 'a', 'trendy', 'girl', 'talking', 'on', 'her', 'cellphone', 'while', 'gliding', 'slowly', 'down', 'the', 'street', '.', '<eos>'], ['<sos>', 'a', 'woman', 'with', 'a', 'large', 'purse', 'is', 'walking', 'by', 'a', 'gate', '.', '<eos

In [302]:
total_data = dict()
total_data['en_tokens'] = train_data_tokens['en_tokens'] + test_data_tokens['en_tokens'] + valid_data_tokens['en_tokens']
print(len( train_data_tokens['en_tokens']+test_data_tokens['en_tokens']))
print(len(total_data['en_tokens']))

30014
31014


# Create vocabulary set

In [303]:
min_freq = 2
unk_token = "<unk>" # unknown token, which would be used if we find any word that is out of vocabulary set.
pad_token = "<pad>" # pad token, to make the length of each setence the same

special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]

# https://pytorch.org/text/stable/vocab.html
  # torchtext.vocab.build_vocab_from_iterator
  # Build a Vocab from an iterator
  # iterator – Iterator used to build Vocab. Must yield list or iterator of tokens.
  #min_freq – The minimum frequency needed to include a token in the vocabulary.
  #specials – Special symbols to add. The order of supplied tokens will be preserved.
  #special_first – Indicates whether to insert symbols at the beginning or at the end.
  #max_tokens – If provided, creates the vocab from the max_tokens - len(specials) most frequent

  ## return : A Vocab object

en_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["en_tokens"], # list
    min_freq=min_freq,
    specials=special_tokens,
)

de_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["de_tokens"], # list
    min_freq=min_freq,
    specials=special_tokens,
)

assert en_vocab[unk_token] == de_vocab[unk_token]
assert en_vocab[pad_token] == de_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

### torchtext.vocab.set_default_index
## Value of default index. This index will be returned when OOV token is queried.
## OOV: Out of Vocabulary tokens
en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)


### Let's investigate en_vocab and de_vocab together

---



In [304]:
# for all the attributes and functions of the vocab class, look at the below link
#https://pytorch.org/text/stable/vocab.html
# for example, let's find the indices of 'like' and 'man' from 'en-vocab' class
en_vocab.lookup_indices(['like', 'man'])

[340, 9]

In [305]:
# map tokens of sentences to the corresponding indices in vocabulary dictionary
def map_tokens_to_indices(vocab, en_vocab_dict, de_vocab_dict):

  vocab_indices = dict(en_ids = [], de_ids = [])

  for en_vocab, de_vocab in tqdm(zip(vocab['en_tokens'], vocab['de_tokens']), desc = 'number of setences', ascii= True, total = len(vocab['en_tokens'])):
    en_ids = en_vocab_dict.lookup_indices(en_vocab)
    de_ids = de_vocab_dict.lookup_indices(de_vocab)

    vocab_indices['en_ids'].append(en_ids)
    vocab_indices['de_ids'].append(de_ids)

  return vocab_indices

In [306]:
if 1:
  train_data_setences, test_data_setences, valid_data_setences = dataset["train"], dataset["validation"], dataset['test']
  #### update train, test, and valid data with tokenizing sentences ###
  nlp_process_tools = {'en_nlp':en_nlp, 'de_nlp':de_nlp} # NLP processing spacy tools
  train_data_tokens = tokenize_sentences(train_data_setences, **nlp_process_tools)
  test_data_tokens = tokenize_sentences(test_data_setences, **nlp_process_tools)
  valid_data_tokens = tokenize_sentences(valid_data_setences, **nlp_process_tools)

vocabulary_dictionaries = {'en_vocab_dict':en_vocab, 'de_vocab_dict':de_vocab}
## update train, test, valid data by mapping tokens to indices
train_data_ids = map_tokens_to_indices(train_data_tokens, **vocabulary_dictionaries)
test_data_ids = map_tokens_to_indices(test_data_tokens, **vocabulary_dictionaries)
valid_data_ids = map_tokens_to_indices(valid_data_tokens, **vocabulary_dictionaries)

number of sentences: 100%|##########| 29000/29000 [00:02<00:00, 14213.80it/s]
number of sentences: 100%|##########| 1014/1014 [00:00<00:00, 15156.62it/s]
number of sentences: 100%|##########| 1000/1000 [00:00<00:00, 14929.86it/s]
number of setences: 100%|##########| 29000/29000 [00:02<00:00, 13816.70it/s]
number of setences: 100%|##########| 1014/1014 [00:00<00:00, 62270.67it/s]
number of setences: 100%|##########| 1000/1000 [00:00<00:00, 67318.90it/s]


In [307]:
## print examples of token-to-indice mapped setences
print('################ original english and german sentences ###################')
print(train_data_setences['en'][0])
print(train_data_setences['de'][0])
print('############## after tokenizing the sentences ############')
print(train_data_tokens['en_tokens'][0])
print(train_data_tokens['de_tokens'][0])
print(' ################# after mapping tokens to indcies ########### ')
print(train_data_ids['en_ids'][0]) # indices by English dictionary
print(train_data_ids['de_ids'][0]) # indices by German dictionary

################ original english and german sentences ###################
Two young, White males are outside near many bushes.
Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.
############## after tokenizing the sentences ############
['<sos>', 'two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.', '<eos>']
['<sos>', 'zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.', '<eos>']
 ################# after mapping tokens to indcies ########### 
[2, 16, 24, 15, 25, 778, 17, 57, 80, 202, 1312, 5, 3]
[2, 18, 26, 253, 30, 84, 20, 88, 7, 15, 110, 7647, 3171, 4, 3]


In [308]:
data_type = "torch"
format_columns = ["en_ids", "de_ids"]
train_data_ids = h_dataset.from_dict(train_data_ids)
test_data_ids = h_dataset.from_dict(test_data_ids)
valid_data_ids = h_dataset.from_dict(valid_data_ids)

train_data_ids = train_data_ids.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

test_data_ids = test_data_ids.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

valid_data_ids = valid_data_ids.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)


## Padding to all the setences so that the length of each sentence is the same


In [309]:
########### get collate_function ####
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_en_ids = [example["en_ids"] for example in batch]
        batch_de_ids = [example["de_ids"] for example in batch]
        batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)
        batch_de_ids = nn.utils.rnn.pad_sequence(batch_de_ids, padding_value=pad_index)
        batch = {
            "en_ids": batch_en_ids,
            "de_ids": batch_de_ids,
        }

        return batch
    print(pad_index)
    return collate_fn

### create dataloader ####
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader

# Create DataLoader to train model

In [310]:
batch_size = 128
# create dataloader for train, test, valid data

train_data_loader = get_data_loader(train_data_ids, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data_ids, batch_size, pad_index)
test_data_loader = get_data_loader(test_data_ids, batch_size, pad_index)

1
1
1


# Now, ready to train NLP models

In [None]:
## check data with dataloader
print(len(tarin_dataloader))
for i, train_data in enumerate(train_data_loader):
  print(train_data['en_ids'].shape)


In [None]:

#### extra function ###########
def add_pads_to_setences(data, pad_value = 1):

  L = len(data['en_ids'])

  assert L == len(data['de_ids'])

  _en_ids = [torch.tensor(data['en_ids'][i]) for i in range(L)]
  _de_ids = [torch.tensor(data['de_ids'][i]) for i in range(L)]

  en_ids_with_pads = nn.utils.rnn.pad_sequence(_en_ids, padding_value = pad_value)
  de_ids_with_pads = nn.utils.rnn.pad_sequence(_de_ids, padding_value = pad_value)

  return {'en_ids':en_ids_with_pads.T, 'de_ids':de_ids_with_pads.T}

pad_index = en_vocab[pad_token]
## add pads to all the sentences so that the length of each setence is the same
train_data = add_pads_to_setences(train_data_ids, pad_value = pad_index)
test_data = add_pads_to_setences(test_data_ids, pad_value = pad_index)
valid_data = add_pads_to_setences(valid_data_ids, pad_value = pad_index)