In [1]:
from pathlib import Path
from typing import Literal
import pandas as pd
import torch
from torch import nn, optim
from en_indic_transformer import Transformer, Tokenizer, Trainer, TranslationDataLoader, TranslationDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Ask PyTorch to release cached, currently unused GPU memory (e.g. left over from an
# earlier run in this kernel) before anything new is allocated.
torch.cuda.empty_cache()

Create the configuration values used throughout the rest of the notebook

Get the data from the data directory and create a dataframe

In [3]:
# Tokenizer built on the 'gpt2' base encoding, extended with one language-tag token per language.
# NOTE(review): the extra tokens are given as a set, which has no defined order; if Tokenizer
# assigns ids in iteration order they may vary between runs — confirm.
tokenizer = Tokenizer('gpt2', extend_base_encoder={'<|english|>','<|hindi|>', '<|kannada|>' }) # '<|kannada|>' is reserved for later use
# Language tags passed to the datasets for the source and target sentences.
src_prepend_value = '<|english|>'
target_prepend_value = '<|hindi|>'

batch_size = 16 # sentence pairs per batch
random_seed = 42 # for reproducibility
device: Literal['cpu', 'cuda'] = 'cuda' if torch.cuda.is_available() else 'cpu' # train on the GPU when one is available, otherwise on the CPU

# Seed the CPU and CUDA random number generators with the same value.
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

# Transformer hyper-parameters (all passed to the Transformer constructor further down).
context_length = 3000 # size of the positional embedding table in the printed model
vocab_size = tokenizer.n_vocab # vocabulary size reported by the extended tokenizer (50260 in the printed model)
emb_dim = 512 # embedding width
enc_layers = 3 # number of encoder layers
dec_layers = 3 # number of decoder layers
num_heads = 16 # attention heads
dropout = 0.1
bias = False # in the printed model the attention projections have no bias; the MLP layers still do

# Training hyper-parameters.
epochs = 10
lr = 1e-5 # learning rate; the author marked this value as one to change

# Number of rows taken from the corpus for each split.
train_len = 100_000
test_len = 20_000

In [4]:
# Project root, taken as the parent of the current working directory.
# NOTE(review): relies on the kernel's working directory being the notebook's own folder — confirm.
home_dir = Path().absolute().parent

In [5]:
# Location of the English-Hindi corpus CSV (despite the name, this is a file path, not a directory).
data_dir = home_dir / 'data/eng_hindi.csv'

In [6]:
# Load the whole corpus; later cells read its 'english_sentence' and 'hindi_sentence' columns.
df = pd.read_csv(data_dir)

In [7]:
# Display the frame (pandas abbreviates the rendering) to check its columns and size.
df

Unnamed: 0,english_sentence,hindi_sentence
0,"However, Paes, who was partnering Australia's ...",आस्ट्रेलिया के पाल हेनली के साथ जोड़ी बनाने वाल...
1,"Whosoever desires the reward of the world, wit...",और जो शख्स (अपने आमाल का) बदला दुनिया ही में च...
2,The value of insects in the biosphere is enorm...,"जैव-मंडल में कीड़ों का मूल्य बहुत है, क्योंकि ..."
3,Mithali To Anchor Indian Team Against Australi...,आस्ट्रेलिया के खिलाफ वनडे टीम की कमान मिताली को
4,After the assent of the Honble President on 8t...,"8 सितम्‍बर, 2016 को माननीय राष्‍ट्रपति की स्‍व..."
...,...,...
127700,Examples of art deco construction can be found...,आर्ट डेको शैली के निर्माण मैरीन ड्राइव और ओवल ...
127701,and put it in our cheeks.,और अपने गालों में डाल लेते हैं।
127702,"As for the other derivatives of sulphur , the ...","जहां तक गंधक के अन्य उत्पादों का प्रश्न है , द..."
127703,its complicated functioning is defined thus in...,Zरचना-प्रकिया को उसने एक पहेली में यों बांधा है .


There are 127705 rows in the dataset. Use the first `train_len` rows for training and the next `test_len` rows for validation. I am running on CPU for now and will use a GPU later.

In [8]:
# Positional split of the corpus: the first `train_len` rows form the training set and
# the `test_len` rows that follow form the evaluation set.
train_rows = slice(None, train_len)
test_rows = slice(train_len, train_len + test_len)

train_df = df.iloc[train_rows]
test_df = df.iloc[test_rows]

In [9]:
# Spot-check the first rows of the training split.
train_df.head()

Unnamed: 0,english_sentence,hindi_sentence
0,"However, Paes, who was partnering Australia's ...",आस्ट्रेलिया के पाल हेनली के साथ जोड़ी बनाने वाल...
1,"Whosoever desires the reward of the world, wit...",और जो शख्स (अपने आमाल का) बदला दुनिया ही में च...
2,The value of insects in the biosphere is enorm...,"जैव-मंडल में कीड़ों का मूल्य बहुत है, क्योंकि ..."
3,Mithali To Anchor Indian Team Against Australi...,आस्ट्रेलिया के खिलाफ वनडे टीम की कमान मिताली को
4,After the assent of the Honble President on 8t...,"8 सितम्‍बर, 2016 को माननीय राष्‍ट्रपति की स्‍व..."


In [10]:
# Spot-check the first rows of the evaluation split.
test_df.head()

Unnamed: 0,english_sentence,hindi_sentence
100000,and the amount of climate emissions,और उन मौसमी उत्सर्जनों के बीच
100001,"“ I told them , ' Never cover up for me , ' ” ...","जनरल कहते हैं , ' ' मैंने उनसे कहा , कभी मेरी ..."
100002,as much as the anger part of it.,जितना की उसके बाद का गुस्सा.
100003,Preventing Real Online Threats,प्रिवेंटिंग रियल ऑनलाइन थ्रेट्स
100004,"Besides, it is also believed that Man can not ...",साथ में यह भी माना जाता कि उसकी पूरी कल्पना मन...


Create lists of source and target sentences for training and validation sets

In [11]:
# train
source_train = train_df['english_sentence'].tolist()
target_train = train_df['hindi_sentence'].tolist()

# test
source_test = test_df['english_sentence'].tolist()
target_test = test_df['hindi_sentence'].tolist()

Create training and testing data loaders

In [12]:
# train dataset
train_dataset = TranslationDataset(src=source_train, target=target_train,tokenizer=tokenizer, src_prepend_value=src_prepend_value, target_prepend_value=target_prepend_value)

# test dataset
test_dataset = TranslationDataset(src=source_test, target=target_test,tokenizer=tokenizer, src_prepend_value=src_prepend_value, target_prepend_value=target_prepend_value)

In [13]:
# train dataloader
train_dataloader = TranslationDataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# test dataloader
test_dataloader = TranslationDataLoader(test_dataset, batch_size=batch_size, shuffle=True)

Set aside an input for inference later on.

In [14]:
# Take the first batch the evaluation loader yields; one example from it is reused
# later as a prediction sample.
data = iter(test_dataloader)
sample_batch = next(data)

In [15]:
# Decode the first source sequence of the batch back to text. The decoded string keeps the
# trailing '<|endoftext|>' tokens that are present in the batch (visible in the cell output).
inputs = tokenizer.decode(sample_batch[0][0])
# Decode only the first token of the matching target sequence (the '<|hindi|>' tag in the
# cell output); presumably this serves as the decoder's starting text during prediction.
target = tokenizer.decode(sample_batch[1][0][:1]) # only the starting token for now

# Show both strings.
inputs, target

('<|english|>It lasted throughout his life .<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>',
 '<|hindi|>')

Create the model for training

In [16]:
# Re-seed immediately before construction so the initial weights are the same on every run.
torch.manual_seed(random_seed)

# All architecture settings come from the configuration cell.
model_hparams = dict(
    vocab_size=vocab_size,
    context_length=context_length,
    emb_dim=emb_dim,
    enc_layers=enc_layers,
    dec_layers=dec_layers,
    num_heads=num_heads,
    dropout=dropout,
    bias=bias,
)
model = Transformer(**model_hparams)

# Move the parameters to the training device; the returned module is displayed as the cell output.
model.to(device)

Transformer(
  (encoder): Encoder(
    (token_embeddings): Embedding(50260, 512)
    (pos_embeddings): Embedding(3000, 512)
    (encoder_layers): ModuleList(
      (0-2): 3 x EncoderLayer(
        (mlp): MLP(
          (mlp): Sequential(
            (0): Linear(in_features=512, out_features=2048, bias=True)
            (1): GELU(approximate='none')
            (2): Linear(in_features=2048, out_features=512, bias=True)
          )
        )
        (attn): MultiHeadAttention(
          (wq): Linear(in_features=512, out_features=512, bias=False)
          (wk): Linear(in_features=512, out_features=512, bias=False)
          (wv): Linear(in_features=512, out_features=512, bias=False)
          (proj): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm1): LayerNorm()
        (norm2): LayerNorm()
      )
    )
  )
  (decoder): Decoder(
    (token_embeddings): Embedding(50260, 512)
    (pos_embeddings): Embedding(3

Create an optimizer and a loss function

Using Adam optimizer here.

In [17]:
# Adam over all model parameters, using the learning rate from the configuration cell.
optimizer = optim.Adam(model.parameters(), lr=lr)
# Cross-entropy loss with default settings.
# NOTE(review): no ignore_index is set, so padding positions presumably count towards the
# loss unless Trainer masks them — confirm.
loss_fn = nn.CrossEntropyLoss()

### Create the trainer instance for training the model

##### create a path to save model checkpoints

In [18]:
# Directory under the project root that holds model checkpoints.
model_checkpoint_dir = home_dir / 'models'

In [19]:
# Create the directory together with any missing parents; does nothing if it already exists.
model_checkpoint_dir.mkdir(parents=True, exist_ok=True)

In [20]:
# File inside the checkpoint directory that is handed to the trainer as its save path.
checkpoint_file = model_checkpoint_dir / 'transformer.pt'

# Bundle the model, loss, optimizer and tokenizer into a single Trainer.
trainer = Trainer(
    model=model,
    loss_fn=loss_fn,
    optimizer=optimizer,
    tokenizer=tokenizer,
    save_path=checkpoint_file,
)



In [21]:
# Run training for `epochs` epochs on `device`, passing the sample set aside earlier
# (`inputs` / `target`) as the prediction example.
trainer.train(
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    epochs=epochs,
    device=device,
    predict_input=inputs,
    predict_target=target,
)

  0%|          | 0/10 [00:00<?, ?it/s]

----- Epoch 0 -----


  3%|██                                                          | 217/6250 [00:29<13:37,  7.38it/s]
  0%|          | 0/10 [00:29<?, ?it/s]


KeyboardInterrupt: 