In [1]:
from pathlib import Path
from typing import Literal
import pandas as pd
import torch
from torch import nn, optim
from en_indic_transformer import Transformer, Tokenizer, Trainer, TranslationDataLoader, TranslationDataset

In [2]:
# Release any unoccupied cached GPU memory held by PyTorch before the model is built.
torch.cuda.empty_cache()

Specify the home_dir for future use.

In [3]:
# Project root: the parent of the current working directory.
home_dir = Path.cwd().parent
home_dir

PosixPath('/Users/sameergururajmathad/eng-indic-transformer')

Create the configuration values used throughout the rest of the notebook.

In [4]:
# --- tokenizer ---
# An earlier variant used the 'gpt2' tokenizer with the language tags as extra
# special tokens; a '<|kannada|>' tag is planned for later.
# The tokenizer is loaded from the SentencePiece model file under <home_dir>/tokenizer.
corpus_save_dir = home_dir.joinpath('tokenizer')
tokenizer = Tokenizer(str(corpus_save_dir.joinpath('tokenizer.model')))
# Language tags handed to the datasets as source/target prepend values.
src_prepend_value = '<|english|>'
target_prepend_value = '<|hindi|>'

# --- runtime ---
batch_size = 16
random_seed = 42  # fixed seed for reproducibility
# Train on the GPU when one is available, otherwise fall back to the CPU.
device: Literal['cpu', 'cuda']
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Seed the PyTorch random number generators.
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

# --- transformer hyper-parameters ---
context_length = 1024  # changed from 3000
vocab_size = tokenizer.n_vocab  # vocabulary size reported by the loaded tokenizer
emb_dim = 512
enc_layers = 2
dec_layers = 2
num_heads = 16
dropout = 0.1
bias = False

# --- training ---
epochs = 10
lr = 1e-5  # provisional value, expected to change

# --- data ---
train_frac = 0.90  # fraction of rows used for training; the rest is held out

In [5]:
# Directory holding the raw CSV data, located under the project root.
data_dir = home_dir.joinpath('data')

In [6]:
# Path of the English-Hindi parallel corpus.
en_hindi_csv = data_dir.joinpath('en_hindi.csv')

In [7]:
# Load the whole parallel corpus into memory.
en_hindi_df = pd.read_csv(filepath_or_buffer=en_hindi_csv)

In [8]:
# Preview the loaded frame; pandas truncates the display of large frames.
en_hindi_df

Unnamed: 0,english_sentence,hindi_sentence
0,When it is said to him: 'Fear Allah' egotism t...,"और जब उससे कहा जाता है, ""अल्लाह से डर"", तो अहं..."
1,This profile exists already.,यह प्रोफ़ाइल पहले से ही है.
2,Halo with Ornamental Borde,विवरण: एक पारंपरिक कमल के फूल के साथ पत्थर की ...
3,and the jinn We had created before from flamin...,और हम ही ने जिन्नात को आदमी से (भी) पहले वे धु...
4,"Ladies and Gentlemen, the Government of India ...",शहरीकरण की तेज गति के साथ अवसंरचना और सेवाओं क...
...,...,...
1780681,Gaja cyclone in Tamil Nadu on 16.11.2018,तमिलनाडू में गजा चक्रवात - 16.11.2018
1780682,PRESIDENT OF INDIA APPOINTS GOVERNORS,भारत के राष्ट्रपति ने राज्यपालों की नियुक्ति की
1780683,is a phenomenon that 's been promised,"एक ऐसी घटना है, जिसकी संभावना दशकों तक"
1780684,Move waste to stock,बेकार को भण्डार में ले जाएँ


There are 1786788 rows in the dataset. The first `train_len` rows (a `train_frac` share of the data) are used for training and the remaining rows for validation. I am running on the CPU for now and will use a GPU later.

In [9]:
# Number of leading rows assigned to the training split (truncated to an integer).
train_len = int(len(en_hindi_df) * train_frac)

In [10]:
# Sequential (unshuffled) split: the first train_len rows are the training set,
# everything after them is the held-out set.
train_df = en_hindi_df.iloc[:train_len]
test_df = en_hindi_df.iloc[train_len:]

In [11]:
# Sanity check: first rows of the training split.
train_df.head()

Unnamed: 0,english_sentence,hindi_sentence
0,When it is said to him: 'Fear Allah' egotism t...,"और जब उससे कहा जाता है, ""अल्लाह से डर"", तो अहं..."
1,This profile exists already.,यह प्रोफ़ाइल पहले से ही है.
2,Halo with Ornamental Borde,विवरण: एक पारंपरिक कमल के फूल के साथ पत्थर की ...
3,and the jinn We had created before from flamin...,और हम ही ने जिन्नात को आदमी से (भी) पहले वे धु...
4,"Ladies and Gentlemen, the Government of India ...",शहरीकरण की तेज गति के साथ अवसंरचना और सेवाओं क...


In [12]:
# Sanity check: first rows of the held-out split (starts right after the training rows).
test_df.head()

Unnamed: 0,english_sentence,hindi_sentence
1602617,I know you can 't read that.,मैं जानता हूँ कि तुम कि पढ़ा नहीं कर सकते।
1602618,& File name:,फ़ाइल नामः (F) cd track number
1602619,CLICK HERE to see the latest tender notice,नवीनतम निविदा सूचना देखने के लिए यहाँ क्लिक कर...
1602620,In this globalized and highly competitive worl...,इसीलिए वैश्वीकरण और स्पर्धा के इस दौर में भी ह...
1602621,"And if he is one of those on the right hand,","और यदि वह भाग्यशालियों में से है,"


Create lists of source and target sentences for training and validation sets

In [13]:
# train
source_train = train_df['english_sentence'].tolist()
target_train = train_df['hindi_sentence'].tolist()

# test
source_test = test_df['english_sentence'].tolist()
target_test = test_df['hindi_sentence'].tolist()

Create training and testing data loaders

In [14]:
# train dataset
train_dataset = TranslationDataset(src=source_train, target=target_train,tokenizer=tokenizer, src_prepend_value=src_prepend_value, target_prepend_value=target_prepend_value, max_length=context_length)

# test dataset
test_dataset = TranslationDataset(src=source_test, target=target_test,tokenizer=tokenizer, src_prepend_value=src_prepend_value, target_prepend_value=target_prepend_value, max_length=context_length)

In [15]:
# Id of the '<|endoftext|>' piece, handed to the loaders as the padding value.
pad_val = tokenizer.get_piece_id('<|endoftext|>')
# -100 is also the default ignore_index of nn.CrossEntropyLoss.
ignore_index = -100

# Settings shared by both loaders.
# NOTE(review): shuffle=True is also applied to the held-out loader, so the sample
# batch drawn from it depends on the RNG state — confirm this is intended.
_loader_kwargs = dict(
    batch_size=batch_size,
    shuffle=True,
    pad_val=pad_val,
    ignore_index=ignore_index,
)

# Training loader.
train_dataloader = TranslationDataLoader(train_dataset, **_loader_kwargs)

# Held-out loader.
test_dataloader = TranslationDataLoader(test_dataset, **_loader_kwargs)

Set aside an input for inference later on.

In [16]:
# Pull the first batch from the held-out loader; it supplies the example used for
# prediction during training.
data = iter(test_dataloader)
sample_batch = next(data)

In [17]:
# First example of the batch: element 0 holds the source ids, element 1 the target ids.
_sample_src_ids = sample_batch[0][0]
_sample_tgt_ids = sample_batch[1][0]

inputs = tokenizer.decode(_sample_src_ids)  # full source sentence, padding included
target = tokenizer.decode(_sample_tgt_ids[:1])  # only the starting token for now
actual_target = tokenizer.decode(_sample_tgt_ids)  # full reference translation, padding included

inputs, target, actual_target

('<|english|> socialisation of banks<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>',
 '<|hindi|>',
 '<|hindi|> बैंकों का समाजीकरण<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>')

Create the model for training

In [18]:
# Re-seed immediately before construction so the initial weights are reproducible.
torch.manual_seed(random_seed)
model = Transformer(
    vocab_size=vocab_size,
    context_length=context_length,
    emb_dim=emb_dim,
    enc_layers=enc_layers,
    dec_layers=dec_layers,
    num_heads=num_heads,
    dropout=dropout,
    bias=bias,
)
# Move the model to the selected device; the cell displays the module structure.
model.to(device)

Transformer(
  (encoder): Encoder(
    (token_embeddings): Embedding(50000, 512)
    (pos_embeddings): Embedding(1024, 512)
    (encoder_layers): ModuleList(
      (0-1): 2 x EncoderLayer(
        (mlp): MLP(
          (mlp): Sequential(
            (0): Linear(in_features=512, out_features=2048, bias=True)
            (1): GELU(approximate='none')
            (2): Linear(in_features=2048, out_features=512, bias=True)
          )
        )
        (attn): MultiHeadAttention(
          (wq): Linear(in_features=512, out_features=512, bias=False)
          (wk): Linear(in_features=512, out_features=512, bias=False)
          (wv): Linear(in_features=512, out_features=512, bias=False)
          (proj): Linear(in_features=512, out_features=512, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm1): LayerNorm()
        (norm2): LayerNorm()
      )
    )
  )
  (decoder): Decoder(
    (token_embeddings): Embedding(50000, 512)
    (pos_embeddings): Embedding(1

In [19]:
# Size reported by the model (presumably the parameter count — TODO confirm).
model.size

92549120

Create an optimizer and loss function

Using Adam optimizer here.

In [20]:
# Adam over all model parameters, using the learning rate from the config cell.
optimizer = optim.Adam(model.parameters(), lr=lr)
# Pass the same ignore_index the data loaders were built with, so the loss and the
# loaders cannot silently drift apart if that value is ever changed.
# -100 is also nn.CrossEntropyLoss's default, so current behaviour is unchanged.
loss_fn = nn.CrossEntropyLoss(ignore_index=ignore_index)

### Create the trainer instance for training the model

##### create a path to save model checkpoints

In [21]:
# Directory under the project root that holds the model checkpoints.
model_checkpoint_dir = home_dir.joinpath('models')

In [22]:
# Create the checkpoint directory (and any missing parents); a no-op if it already exists.
model_checkpoint_dir.mkdir(parents=True, exist_ok=True)

In [23]:
# Bundle model, loss, optimizer and tokenizer; save_path points at
# 'transformer.pt' inside the checkpoint directory.
trainer = Trainer(
    model=model,
    loss_fn=loss_fn,
    optimizer=optimizer,
    tokenizer=tokenizer,
    save_path=model_checkpoint_dir.joinpath('transformer.pt'),
)



The trainer invokes its predict method after every `batch_size_to_predict` training batches.

In [24]:
# Number of batches in the training loader.
train_batches = len(train_dataloader)
# Prediction interval: 10% of the training batches, truncated to an integer.
batch_size_to_predict = int(0.1 * train_batches)
batch_size_to_predict

10016

In [25]:
# trainer.train(train_dataloader=train_dataloader, test_dataloader=test_dataloader, epochs=epochs, device=device, predict_input=inputs, target_prefix=target, actual_target=actual_target, batch_size_to_predict=batch_size_to_predict)