<a href="https://colab.research.google.com/github/southstone0201/Transformer/blob/main/dataloader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook imports further down:
# `datasets` (for load_dataset) and `sentencepiece` (imported as spm).
# Versions are not pinned here.
!pip install datasets
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
from datasets import load_dataset

# Fetch the "iwslt2017-en-de" configuration of the IWSLT 2017 corpus.
# No split is requested, so every split the configuration provides is returned.
dataset = load_dataset("iwslt2017", "iwslt2017-en-de")

Downloading data:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/206112 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8079 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/888 [00:00<?, ? examples/s]

In [None]:
import sentencepiece as spm

In [None]:
# Show which splits were loaded, their features and their row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 206112
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 8079
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 888
    })
})


In [None]:
# Dump the training split into two line-aligned text files: line i of the
# English file and line i of the German file form one translation pair.
# The files are read back later in the notebook and are also the training
# input for the SentencePiece tokenizers.
def _one_line(text):
    """Replace embedded line breaks with spaces so a sentence fills exactly one line."""
    return text.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")


# UTF-8 is requested explicitly: the default encoding of open() depends on the
# platform locale, while the corpus contains non-ASCII characters and
# SentencePiece expects UTF-8 input.
with open("iwslt2017-en.txt", "w", encoding="utf-8") as f_en, \
        open("iwslt2017-de.txt", "w", encoding="utf-8") as f_de:
    for it in dataset["train"]:
        pair = it["translation"]
        # Without flattening, a sentence containing a line break on only one
        # side would shift every following pair out of alignment.
        f_en.write(_one_line(pair["en"]) + "\n")
        f_de.write(_one_line(pair["de"]) + "\n")

In [None]:
# Read the two corpus files back as parallel lists (one sentence per entry).
# UTF-8 is requested explicitly because the default encoding of open() depends
# on the platform locale. The line terminator is stripped so each entry holds
# the sentence text only, not a trailing "\n".
with open("iwslt2017-en.txt", "r", encoding="utf-8") as f:
    en = [line.rstrip("\n") for line in f]
with open("iwslt2017-de.txt", "r", encoding="utf-8") as f:
    de = [line.rstrip("\n") for line in f]

In [None]:
import pandas as pd
# Assemble the parallel corpus: one row per sentence pair, columns "en" and "de".
# The two lists must be line-aligned. A length mismatch means the corpus files
# are out of sync, so stop here instead of letting pandas align the columns by
# index (which would silently pad with NaN or drop the surplus rows).
if len(en) != len(de):
    raise ValueError(
        f"Parallel corpus is misaligned: {len(en)} English lines vs {len(de)} German lines"
    )
train_df = pd.DataFrame({"en": en, "de": de})

In [None]:
# Preview the sentence pairs (rendered as the cell's output).
train_df

Unnamed: 0,en,de
0,"Thank you so much, Chris.\n","Vielen Dank, Chris.\n"
1,And it's truly a great honor to have the oppor...,"Es ist mir wirklich eine Ehre, zweimal auf die..."
2,"I have been blown away by this conference, and...",Ich bin wirklich begeistert von dieser Konfere...
3,"And I say that sincerely, partly because I ne...","Das meine ich ernst, teilweise deshalb -- weil..."
4,Put yourselves in my position.\n,Versetzen Sie sich mal in meine Lage!\n
...,...,...
206107,The Ancient Greeks didn't just wake up one day...,Den alten Griechen fiel auch nicht eines Tages...
206108,"It took centuries, even, for humans to realize...","Wir Menschen brauchten Jahrhunderte, um Sachen..."
206109,And so we must continuously challenge our noti...,Deshalb müssen wir unser Verständnis von Norma...
206110,Characters: Thank you. Thank you. Thank you. T...,Figuren: Danke. Danke. Danke. Danke. Danke.\n


In [None]:
# Train a 10k-piece BPE tokenizer on the English corpus file.
# With model_prefix="en-sp" the trainer writes en-sp.model (loaded below) and
# en-sp.vocab.
#
# The special-token ids are set explicitly. SentencePiece's default layout is
# <unk>=0, <s>=1, </s>=2 with no padding token, but the padding step later in
# this notebook fills sequences with 0. Reserving id 0 for <pad> keeps padding
# distinguishable from unknown tokens.
spm.SentencePieceTrainer.train(
    input="iwslt2017-en.txt",
    model_prefix="en-sp",
    vocab_size=10000,
    model_type="bpe",
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
)

In [None]:
# Load the English tokenizer from the en-sp.model file in the working directory.
sp_en = spm.SentencePieceProcessor(model_file="./en-sp.model")

In [None]:
# Tokenize every English sentence; each cell of the new column holds the value
# returned by sp_en.encode for that sentence.
train_df["en_encoded"] = train_df["en"].apply(sp_en.encode)



In [None]:
# Train a 10k-piece BPE tokenizer on the German corpus file.
# With model_prefix="de-sp" the trainer writes de-sp.model (loaded below) and
# de-sp.vocab.
#
# The special-token ids are set explicitly. SentencePiece's default layout is
# <unk>=0, <s>=1, </s>=2 with no padding token, but the padding step later in
# this notebook fills sequences with 0. Reserving id 0 for <pad> keeps padding
# distinguishable from unknown tokens.
spm.SentencePieceTrainer.train(
    input="iwslt2017-de.txt",
    model_prefix="de-sp",
    model_type="bpe",
    vocab_size=10000,
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
)

In [None]:
# Load the German tokenizer from the de-sp.model file in the working directory.
sp_de = spm.SentencePieceProcessor(model_file="./de-sp.model")

In [None]:
# Tokenize every German sentence; each cell of the new column holds the value
# returned by sp_de.encode for that sentence.
train_df["de_encoded"] = train_df["de"].apply(sp_de.encode)

In [None]:
# Plain-text dump of the frame, now including the two *_encoded columns.
print(train_df)

                                                       en  ...                                         de_encoded
0                             Thank you so much, Chris.\n  ...                      [1269, 662, 9937, 3281, 9940]
1       And it's truly a great honor to have the oppor...  ...  [183, 61, 336, 402, 113, 8264, 9937, 6815, 116...
2       I have been blown away by this conference, and...  ...  [145, 502, 402, 5014, 109, 337, 4789, 9937, 35...
3       And I say that sincerely, partly because  I ne...  ...  [179, 408, 79, 3000, 9937, 7066, 2511, 287, 41...
4                        Put yourselves in my position.\n  ...     [218, 1310, 95, 138, 940, 56, 408, 1967, 9995]
...                                                   ...  ...                                                ...
206107  The Ancient Greeks didn't just wake up one day...  ...  [1273, 2742, 8731, 5195, 250, 104, 505, 2086, ...
206108  It took centuries, even, for humans to realize...  ...  [186, 278, 5417, 1624, 2

In [None]:
# Longest tokenized sentence on each side, then the larger of the two.
# max_len is the length every sequence is padded to in the next step.
max_en_len = max(map(len, train_df["en_encoded"]))
max_de_len = max(map(len, train_df["de_encoded"]))
max_len = max_en_len if max_en_len >= max_de_len else max_de_len

In [None]:
# Right-pad every id list IN PLACE with 0 until it is max_len long. The lists
# stored in train_df are mutated, so re-running this cell is a no-op.
# NOTE(review): 0 is SentencePiece's default <unk> id unless the tokenizer was
# trained with a different id layout (e.g. pad_id=0) - confirm that padding
# cannot be confused with unknown tokens.
for column in ("en_encoded", "de_encoded"):
    for ids in train_df[column]:
        # A non-positive difference yields an empty list, i.e. nothing is added.
        ids.extend([0] * (max_len - len(ids)))

In [None]:
from torch.utils.data import Dataset,DataLoader
import torch


In [None]:
class CustomDataset(Dataset):
  """Parallel corpus of id sequences served as ``(en, de)`` tensor pairs.

  Both sides are converted to a single tensor each when the dataset is built,
  so all sequences within one side must already have the same length.
  """

  def __init__(self, en, de):
    """Convert both sides to tensors and check that they line up.

    Args:
      en: Source-side sequences; anything ``torch.tensor`` accepts, or a
        pandas Series whose cells are equal-length lists.
      de: Target-side sequences in the same form, with as many entries as ``en``.

    Raises:
      ValueError: If ``en`` and ``de`` hold a different number of sequences.
    """
    self.en = self._to_tensor(en)
    self.de = self._to_tensor(de)
    if len(self.en) != len(self.de):
      raise ValueError(
          f"en and de must have the same number of sequences, "
          f"got {len(self.en)} and {len(self.de)}"
      )

  @staticmethod
  def _to_tensor(sequences):
    """Build a tensor from ``sequences``, unwrapping pandas objects first."""
    # pandas Series expose ``to_list``. Converting to a plain list first keeps
    # the conversion independent of the Series' index labels.
    if hasattr(sequences, "to_list"):
      sequences = sequences.to_list()
    return torch.tensor(sequences)

  def __len__(self):
    """Number of sentence pairs."""
    return len(self.en)

  def __getitem__(self, index):
    """Return the ``(en, de)`` pair stored at position ``index``."""
    return self.en[index], self.de[index]

In [None]:
# Wrap the two encoded columns in the CustomDataset defined above.
train_dataset = CustomDataset(train_df["en_encoded"], train_df["de_encoded"])

In [None]:
# Serve the training pairs in shuffled mini-batches of 16.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)