In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import torch
import datasets
import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "t5-small"
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)  
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, config=config)


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer

dataset = load_dataset('findnitai/english-to-hinglish')

master = []
for line in dataset['train']['translation']:
    master.append(line['en'])
    master.append(line['hi_ng'])

def gen_training_data():
    return (master[i : i+500] for i in range(0, len(master), 500))

tokenizer_training_data = gen_training_data()
tokenizer = tokenizer.train_new_from_iterator(tokenizer_training_data, 32128) 




In [4]:
dataset['train']

Dataset({
    features: ['translation'],
    num_rows: 189102
})

In [5]:
source_prefix = "Translate English to Hinglish : "
source_lang = "en"
target_lang = "hi_ng"
max_source_length = 128 
max_target_length = 128 
padding = "max_length" 
num_epochs = 1

In [6]:
def preprocess(source_data):
    inputs = [sample[source_lang] for sample in source_data["translation"]]
    targets = [sample[target_lang] for sample in source_data["translation"]]
    inputs = [source_prefix + inp for inp in inputs]
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
    
   
    labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
    
   
    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
    ]
    
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs


In [7]:
train_dataset = dataset["train"]
train_dataset = train_dataset.map(preprocess, batched=True, remove_columns="translation")

  0%|          | 0/190 [00:00<?, ?ba/s]

In [8]:
from transformers import HfArgumentParser
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
trainer_args_in = {
    'output_dir': 't5-hinglish-translator',
    'overwrite_output_dir' : True,
    'do_train' : True,
   
    'per_device_train_batch_size' : 32,
    'num_train_epochs' : num_epochs,
    'report_to': 'none',
    'save_total_limit':1
}

parser = HfArgumentParser((Seq2SeqTrainingArguments))
training_args = parser.parse_dict(trainer_args_in)

trainer = Seq2SeqTrainer(model=model, args=training_args[0], train_dataset=train_dataset, tokenizer=tokenizer)

train_result = trainer.train(resume_from_checkpoint=None)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,5.5356
1000,4.634
1500,4.2494
2000,4.0077
2500,3.8236
3000,3.6997
3500,3.5945
4000,3.4893
4500,3.4637
5000,3.3722


In [11]:
device='cuda' if torch.cuda.is_available else 'cpu'
device

'cuda'

In [13]:
input_ids = tokenizer("translate English to Hinglish: what is your name brother?", return_tensors="pt").input_ids
outputs = model.generate(input_ids.to(device))
print("Test Output : " + tokenizer.decode(outputs[0], skip_special_tokens=True))

Test Output : your name ko hai
