In [1]:
import sys, os
# Put the parent of the current working directory at the front of sys.path,
# presumably so the project-local `src` package imported below resolves when
# the notebook is started from its own subfolder.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

import torch
import pandas as pd
from src.Vocab import Vocab
from collections import Counter

In [2]:
# Load the sentence pairs; the code below reads the "EN" and "FR" columns.
data = pd.read_csv("../data/rec05_small_en_fr.csv")

# Whitespace-token frequencies per language, handed to Vocab.build below.
en_counter = Counter(token for text in data["EN"] for token in text.split())
fr_counter = Counter(token for text in data["FR"] for token in text.split())

vocabulary_en, vocabulary_fr = Vocab(), Vocab()
vocabulary_en.build(en_counter)
vocabulary_fr.build(fr_counter)

# Sanity checks: every line is printed for the English vocabulary first,
# then for the French one.
vocab_pair = (vocabulary_en, vocabulary_fr)
for vocab in vocab_pair:
    print(f"Vocab-Size: {len(vocab)}")
for vocab in vocab_pair:
    print("Index of '<unk>': ", vocab.word2idx["<unk>"])
for vocab in vocab_pair:
    print("First Word (Index 0): ", vocab.idx2word[0])

# Show the entries stored at indices 5..9 of both vocabularies side by side.
for position in range(5, 10):
    print("English: ", vocabulary_en.idx2word[position])
    print("France: ", vocabulary_fr.idx2word[position])


Vocab-Size: 3760
Vocab-Size: 5390
Index of '<unk>':  3
Index of '<unk>':  3
First Word (Index 0):  <pad>
First Word (Index 0):  <pad>
English:  i
France:  je
English:  m
France:  suis
English:  you
France:  est
English:  re
France:  il
English:  he
France:  vous


In [3]:
# Display the English sentence stored under row label 0.
data.loc[0, "EN"]

'i m at a loss for words .'

In [4]:
# Round-trip check on the first English sentence: text -> indices -> tokens.
sentence = data["EN"][0]
# In the recorded output the index list starts with 1 and ends with 2, and
# these have no counterpart in the decoded tokens — presumably start/end
# markers added by sentence_to_idx; confirm in src.Vocab.
sentence_idx = vocabulary_en.sentence_to_idx(sentence)
print(sentence_idx)
# Decode the indices back to tokens to verify the mapping is reversible.
sentece_words = vocabulary_en.idx_to_sentence(sentence_idx)
print(sentece_words)



[1, 5, 6, 28, 11, 702, 25, 517, 4, 2]
['i', 'm', 'at', 'a', 'loss', 'for', 'words', '.']


In [5]:
from src.Translation_Data import Translation_Data, collate
from torch.utils.data import Dataset

# Wrap the sentence pairs and both vocabularies in the project dataset class.
translation_set = Translation_Data(data, vocabulary_en, vocabulary_fr)

# Take the first example and convert both tensors to plain Python lists once.
input_tensor, output_tensor = translation_set[0]
input_ids = input_tensor.tolist()
output_ids = output_tensor.tolist()

# Raw indices first ...
print(input_ids)
print(output_ids)

# ... then the same sequences decoded with the matching vocabulary.
print(vocabulary_en.idx_to_sentence(input_ids))
print(vocabulary_fr.idx_to_sentence(output_ids))

[1, 5, 6, 28, 11, 702, 25, 517, 4, 2]
[1, 29, 20, 1078, 105, 801, 4, 2]
['i', 'm', 'at', 'a', 'loss', 'for', 'words', '.']
['j', 'en', 'perds', 'mes', 'mots', '.']


In [6]:
# Hand-made batch of two (source, target) pairs with unequal lengths, used to
# inspect what `collate` returns for ragged input.
batch = [
    (torch.tensor(src_ids, dtype=torch.long), torch.tensor(tgt_ids, dtype=torch.long))
    for src_ids, tgt_ids in (
        ([1, 5, 6], [1, 7, 8, 2]),
        ([1, 3], [1, 4, 2]),
    )
]

pad_idx = 0

out = collate(batch, special_idx=pad_idx)
padded_input, padded_output = out["input"], out["output"]

# Shapes first, then the tensors themselves, then the accompanying masks.
print("input.shape:", padded_input.shape)
print("output.shape:", padded_output.shape)

print("input:\n", padded_input)
print("output:\n", padded_output)

print("input_mask:\n", out["input_mask"])
print("output_mask:\n", out["output_mask"])

input.shape: torch.Size([2, 3])
output.shape: torch.Size([2, 4])
input:
 tensor([[1, 5, 6],
        [1, 3, 0]])
output:
 tensor([[1, 7, 8, 2],
        [1, 4, 2, 0]])
input_mask:
 tensor([[ True,  True,  True],
        [ True,  True, False]])
output_mask:
 tensor([[ True,  True,  True,  True],
        [ True,  True,  True, False]])


In [11]:
from functools import partial

from torch.utils.data import DataLoader

# Prefer the MPS backend when PyTorch reports it as available, else use the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Device used: ", device)

# Resolve the padding index once, up front. The previous lambda repeated the
# dictionary lookup for every batch and read the global `vocabulary_en` at call
# time, so rebinding that name later would silently change the padding value.
en_pad_idx = vocabulary_en.word2idx["<pad>"]

# functools.partial binds the value now. Unlike a lambda defined in the
# notebook, a partial over an importable function can be pickled, which matters
# if num_workers > 0 is ever used on a platform that spawns worker processes.
# NOTE(review): shuffle=True with no seed set in the visible cells means the
# batch fetched below (and therefore the printed shapes) can differ per run.
train_loader = DataLoader(
    translation_set,
    batch_size=32,
    shuffle=True,
    collate_fn=partial(collate, special_idx=en_pad_idx),
)

# Smoke test: fetch one batch and move to device
batch = next(iter(train_loader))
input_batch = batch["input"].to(device)
output_batch = batch["output"].to(device)
input_mask  = batch["input_mask"].to(device)
output_mask  = batch["output_mask"].to(device)
print(f"Train batch shapes after moving to device: input={input_batch.shape}, output={output_batch.shape}")

Device used:  mps
Train batch shapes after moving to device: input=torch.Size([32, 11]), output=torch.Size([32, 11])
