# Checking GPU Details

In [None]:
!nvidia-smi

Tue May 10 07:40:09 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8    12W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Pre-processing the Data

### Installing and importing the necessary packages and functions

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
pip install indic-nlp-library

Collecting indic-nlp-library
  Downloading indic_nlp_library-0.81-py3-none-any.whl (40 kB)
[?25l[K     |████████▏                       | 10 kB 18.4 MB/s eta 0:00:01[K     |████████████████▍               | 20 kB 11.5 MB/s eta 0:00:01[K     |████████████████████████▌       | 30 kB 8.5 MB/s eta 0:00:01[K     |████████████████████████████████| 40 kB 3.1 MB/s 
Collecting morfessor
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Collecting sphinx-rtd-theme
  Downloading sphinx_rtd_theme-1.0.0-py2.py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 8.7 MB/s 
[?25hCollecting sphinx-argparse
  Downloading sphinx_argparse-0.3.1-py2.py3-none-any.whl (12 kB)
Installing collected packages: sphinx-rtd-theme, sphinx-argparse, morfessor, indic-nlp-library
Successfully installed indic-nlp-library-0.81 morfessor-2.0.6 sphinx-argparse-0.3.1 sphinx-rtd-theme-1.0.0


In [None]:
from indicnlp.normalize.indic_normalize import DevanagariNormalizer

In [None]:
pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-3.0.0-py3-none-any.whl (8.5 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.0.0


In [None]:
import jsonlines

### Getting the location of the datasets from Google Drive

In [None]:
# Directory on Google Drive that holds the two .jsonl datasets read below.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount cell is visible in this notebook.
path = '/content/drive/MyDrive/MTech_Project/MT_Datasets/'

### Creating a function for pre-processing both the datasets

In [None]:
def parse_jsonl(file_jsonl, data_id):
    """Read a parallel Hinglish/English corpus from a JSON Lines file.

    Every record must provide a 'Hinglish' list of token entries (the word is
    read from index 1 of each entry) and an 'English' list of word strings.

    Args:
        file_jsonl: Path of the .jsonl file to read.
        data_id: Label used only in the printed size summary.

    Returns:
        A tuple ``(en_data, hi_data)`` of equal-length lists. Each item is one
        space-joined, newline-terminated sentence; the Hinglish sentences have
        additionally been passed through ``DevanagariNormalizer.normalize``.
    """
    normalizer = DevanagariNormalizer()
    en_data = []
    hi_data = []
    with jsonlines.open(file_jsonl) as reader:
        for obj in reader:
            # The word sits at index 1 of every token entry, whatever the tag
            # at index 2 says, so no per-language branching is needed.
            hi_sentence = ' '.join(token[1] for token in obj['Hinglish']) + '\n'
            hi_data.append(normalizer.normalize(hi_sentence))
            en_data.append(' '.join(obj['English']) + '\n')
    # Sanity check: the two sides must stay aligned sentence by sentence.
    assert len(en_data) == len(hi_data)
    print(f'total size of {data_id} data is {len(en_data)}')
    return en_data, hi_data

### Creating a function for splitting into train, validation and test

In [None]:
def train_dev_test_split(en_data, hi_data, dev_size, test_size, random_state=42):
    """Split aligned English/Hinglish lists into train, validation and test parts.

    The held-out portion (``dev_size + test_size``) is carved off first and is
    then divided into validation and test, so both lists stay aligned.

    Args:
        en_data: English sentences.
        hi_data: Hinglish sentences, aligned index by index with ``en_data``.
        dev_size: Number of pairs for the validation set.
        test_size: Number of pairs for the test set.
        random_state: Seed passed to both ``train_test_split`` calls so the
            split is reproducible. Defaults to 42.

    Returns:
        ``(en_train, hi_train, en_val, hi_val, en_test, hi_test)``.

    Note:
        Sizes are meant to be absolute counts. With fractional sizes the second
        split would take ``test_size`` as a fraction of the held-out subset
        rather than of the full data.
    """
    held_out_size = dev_size + test_size
    en_train, en_held_out, hi_train, hi_held_out = train_test_split(
        en_data, hi_data, test_size=held_out_size, random_state=random_state
    )
    en_val, en_test, hi_val, hi_test = train_test_split(
        en_held_out, hi_held_out, test_size=test_size, random_state=random_state
    )
    return en_train, hi_train, en_val, hi_val, en_test, hi_test

### Pre-processing and splitting both the datasets into train, validation and test (separately)

In [None]:
# Parse each corpus and split it on its own: 604 validation + 604 test pairs for DHAR,
# 1374 validation + 1374 test pairs for PHINC; the remaining pairs form the training set.
dh_en_train, dh_hi_train, dh_en_val, dh_hi_val, dh_en_test, dh_hi_test = train_dev_test_split(*parse_jsonl(path+'mrinal_dhar.jsonl', 'DHAR'), 604, 604)
ph_en_train, ph_hi_train, ph_en_val, ph_hi_val, ph_en_test, ph_hi_test = train_dev_test_split(*parse_jsonl(path+'phinc.jsonl', 'PHINC'), 1374, 1374)

total size of DHAR data is 6041
total size of PHINC data is 13738


### Some examples from both the datasets

In [None]:
dh_en_train[:5]

["swaer to god salman khan if you do not talk to us then after today I will never see even one of your films why don't you come\n",
 'Please help me , bro .\n',
 'Please send the answer if you get them .\n',
 'I love love u love u love u love u so mach salman khan just once i want to say i love you face to face to you\n',
 'Salman brother . . My big brother .. I dont like anyone more than you . .. I consider you as my big brother ...\n']

In [None]:
dh_hi_train[:5]

['अल्लाह कसम salman khan अगर but नई आप ने हम से तो आज के बाद में खूबी आपकी एक film भी नई देखूंगा आते कि नै हो\n',
 'help कर दो मेरी pulses भाई\n',
 'please answers मिलें तो भेज देना .\n',
 'I love love u love u love u love u so match salman khan एक बार मुझे आपको i love you face तो पैक कहना है\n',
 'Salman भाईजान . . मेरे बड़े भाई .. मुझे आपसे अच्छा कोई नहीं लगता . .. मैं आपको अपना बड़ा भाई मानता हूँ ...\n']

In [None]:
ph_en_train[:5]

["RT @tushardave15 : Please follow @kuldpvys , his tweets about life are amazing and proceless . Look at some of the previous RT's . \x8f\n",
 '@narendramodi @arunjaitley if you would have filed GSTR 1 , then you would have understand how much good and simple gst is .\n',
 "you have had the phone for one week , you'd have put 20-25 themes on already\n",
 'Chitral : Eleven people buriend due to fall of an Iceberg in Susoom , corpses of two children were recovered , landsliding in independent kashmir destroyed hundreds of homes .\n',
 'Mind and heart should always be open .\n']

In [None]:
ph_hi_train[:5]

["RT @tushardave15 : Please follow @kuldpvys जिंदगी पर इसके tweets बहुत शानदार और लाजवाब होते हैं . पिछले कुछ RT'S देखें . ðŸ ' \x8f ðŸ ' \x8f ðŸ ' \x8f\n",
 '@narendramodi @arunjaitley अगर आप लोगों ने सिर्फ नील की GSTR 1 file की होती न तो आप खुद समझ जाते ghost कितना good and simple है . Disaster #gst\n',
 'तेरे पास एक week से phone आया है , तूने 20-25 themes तो डाल दी है\n',
 'Chitral : Susoom में बर्फानी टोडा गिरने से 11 अफ़राद दब गए , 2 बच्चों की लाशें बरामद + Azad Kashmir में Landsliding से सैंकड़ों घर तबाह\n',
 'दिल और दमाग हमेशा खुली होनी चाहिए .\n']

### Combining the train, validation and test of both datasets into one train, validation and test

In [None]:
# Train and validation pools concatenate DHAR followed by PHINC
# (Hinglish sentences are the inputs, English sentences the targets).
train_inputs, train_targets = dh_hi_train + ph_hi_train, dh_en_train + ph_en_train
val_inputs, val_targets = dh_hi_val + ph_hi_val, dh_en_val + ph_en_val

# The test sets are kept separate per corpus.
test_dh_inputs, test_dh_targets = dh_hi_test, dh_en_test
test_ph_inputs, test_ph_targets = ph_hi_test, ph_en_test

### Saving the train, validation and test to .csv files

In [None]:
#import pandas as pd

In [None]:
# train

In [None]:
#dict_train = {'hi': train_inputs, 'en': train_targets}

In [None]:
#df_train = pd.DataFrame(dict_train)

In [None]:
#df_train.head(10)

Unnamed: 0,hi,en
0,अल्लाह कसम salman khan अगर but नई आप ने हम से ...,swaer to god salman khan if you do not talk to...
1,help कर दो मेरी pulses भाई\n,"Please help me , bro .\n"
2,please answers मिलें तो भेज देना .\n,Please send the answer if you get them .\n
3,I love love u love u love u love u so match sa...,I love love u love u love u love u so mach sal...
4,Salman भाईजान . . मेरे बड़े भाई .. मुझे आपसे अ...,Salman brother . . My big brother .. I dont li...
5,तो sisters और brothers बहुत प्यार करते ?\n,Do your sisters and brothers love each other ?\n
6,आप को letter भी लिखे हैं शायद आप तक पहुँच जाये...,I have also written letters to you ... hopeful...
7,"भाई भाई somabhai bohemia ,\n",brother brother somabhai bohemia\n
8,हाँ दीदी\n,Yes sister\n
9,उसका suit ले कर जाना है तो\n,have to take her suit\n


In [None]:
#df_train.to_csv('train.csv', index=False)

In [None]:
# val

In [None]:
#dict_val = {'hi': val_inputs, 'en': val_targets}

In [None]:
#df_val = pd.DataFrame(dict_val)

In [None]:
#df_val.head()

Unnamed: 0,hi,en
0,भाई जान जब आवोगे मुझे reply करना plus am waiti...,brother when you come please reply to me am wa...
1,हाँ but so many only Hindi sentences हैं\n,yes but there are so many only hindi sentences\n
2,मेरे साथ तो 30 लोग type जा रहे हैं .\n,With me 30 type people are going\n
3,क्या हवा है और सिया breeze भाई !\n,What wind and sea breeze\n
4,V nice comment शालू भाई\n,V nice commet salu brother\n


In [None]:
#df_val.to_csv('valid.csv', index=False)

In [None]:
# test_dh

In [None]:
#dict_test_dh = {'hi': test_dh_inputs, 'en': test_dh_targets}

In [None]:
#df_test_dh = pd.DataFrame(dict_test_dh)

In [None]:
#df_test_dh.head()

Unnamed: 0,hi,en
0,I show your movie trailor i like your al movie...,I saw your movie trailer . I like all your mov...
1,क्या सच में salman khan हो मुझे जवाब दीजिये\n,"Are you really salman khan , please answer me .\n"
2,मैं मोटी हो गई न\n,"I became fat , yes ?\n"
3,Tamil तेरी याद्ध नहीं .. हर पल तेरी याद्ध याद्...,you don't rememeber Tamil .. every moment your...
4,Attendance sheet में तुम्हारा नाम भी है .\n,There is your name also in the presence sheet .\n


In [None]:
#df_test_dh.to_csv('test_dh.csv', index=False)

In [None]:
# test_ph

In [None]:
#dict_test_ph = {'hi': test_ph_inputs, 'en': test_ph_targets}

In [None]:
#df_test_ph = pd.DataFrame(dict_test_ph)

In [None]:
#df_test_ph.head()

Unnamed: 0,hi,en
0,@Jagat___Janani बस आपने इतना सोच हमारे बारे मे...,@Jagat___Janani Even if you think about me thi...
1,@RootKanal उसके बारे में कुछ नहीं बोलने का\n,@Rutknall to say nothing about it\n
2,"@someUSER if you happen to come online , know ...","@someUSER if you happen to come online , know ..."
3,salman khan you are my god father ... अगर आप न...,salman khan you are my godfather ... without y...
4,@dhaval241086 अभी गलती से block button दब गया ...,@dhaval241086 I clicked on the block button by...


In [None]:
#df_test_ph.to_csv('test_ph.csv', index=False)

# Using Hugging Face Datasets and Transformers Library

### Installing the necessary packages

In [None]:
! pip install datasets transformers sacrebleu

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[?25l[K     |█                               | 10 kB 37.7 MB/s eta 0:00:01[K     |██                              | 20 kB 41.0 MB/s eta 0:00:01[K     |███                             | 30 kB 24.5 MB/s eta 0:00:01[K     |████                            | 40 kB 13.2 MB/s eta 0:00:01[K     |█████                           | 51 kB 12.6 MB/s eta 0:00:01[K     |██████                          | 61 kB 14.6 MB/s eta 0:00:01[K     |███████                         | 71 kB 14.1 MB/s eta 0:00:01[K     |████████                        | 81 kB 12.0 MB/s eta 0:00:01[K     |█████████                       | 92 kB 13.2 MB/s eta 0:00:01[K     |██████████                      | 102 kB 14.2 MB/s eta 0:00:01[K     |███████████                     | 112 kB 14.2 MB/s eta 0:00:01[K     |████████████                    | 122 kB 14.2 MB/s eta 0:00:01[K     |█████████████                   | 133 kB 14.2 MB/s eta

### Logging into the Hugging Face Platform

In [None]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


### Importing Transformers Library

In [None]:
import transformers

print(transformers.__version__)

4.18.0


### Calling the mBART model checkpoint (as per the model's name on Hugging Face)

In [None]:
model_checkpoint = "facebook/mbart-large-cc25"

### Loading dataset and metric from the datasets library

In [None]:
from datasets import load_dataset, load_metric

# Load the train/validation/test splits ('hi' and 'en' columns) from the Hub repository.
# use_auth_token=True presumably relies on the token stored by notebook_login() above.
raw_datasets = load_dataset("rahulacj/dhar_phinc", use_auth_token=True)
# sacreBLEU is the translation metric used for evaluation throughout the notebook.
metric = load_metric("sacrebleu")

Using custom data configuration rahulacj--dhar_phinc-75519d656a5564c8


Downloading and preparing dataset csv/rahulacj--dhar_phinc to /root/.cache/huggingface/datasets/csv/rahulacj--dhar_phinc-75519d656a5564c8/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/3.01M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/95.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/286k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/380k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/rahulacj--dhar_phinc-75519d656a5564c8/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['hi', 'en'],
        num_rows: 15823
    })
    test: Dataset({
        features: ['hi', 'en'],
        num_rows: 1978
    })
    validation: Dataset({
        features: ['hi', 'en'],
        num_rows: 1978
    })
})

In [None]:
raw_datasets["train"][0]

{'en': "swaer to god salman khan if you do not talk to us then after today I will never see even one of your films why don't you come\n",
 'hi': 'अल्लाह कसम salman khan अगर but नई आप ने हम से तो आज के बाद में खूबी आपकी एक film भी नई देखूंगा आते कि नै हो\n'}

In [None]:
raw_datasets["test"][0]

{'en': 'I saw your movie trailer . I like all your movies .\n',
 'hi': 'I show your movie trailor i like your al movie .\n'}

In [None]:
raw_datasets["validation"][0]

{'en': 'brother when you come please reply to me am waiting \\\n',
 'hi': 'भाई जान जब आवोगे मुझे reply करना plus am waiting\n'}

### Understanding the metric

In [None]:
# Prediction identical to its reference; references are given as one list per prediction.
fake_preds = ["Please help me , bro .\n"]
fake_labels = [["Please help me , bro .\n"]]
metric.compute(predictions=fake_preds, references=fake_labels)

{'bp': 1.0,
 'counts': [6, 5, 4, 3],
 'precisions': [100.0, 100.0, 100.0, 100.0],
 'ref_len': 6,
 'score': 100.00000000000004,
 'sys_len': 6,
 'totals': [6, 5, 4, 3]}

In [None]:
# Prediction that only partially overlaps its reference, to see how the score drops.
fake_preds = ["Please help me , bro .\n"]
fake_labels = [["Can you please help me .\n"]]
metric.compute(predictions=fake_preds, references=fake_labels)

{'bp': 1.0,
 'counts': [3, 1, 0, 0],
 'precisions': [50.0, 20.0, 12.5, 8.333333333333334],
 'ref_len': 6,
 'score': 17.965205598154213,
 'sys_len': 6,
 'totals': [6, 5, 4, 3]}

### Importing AutoTokenizer from Transformers and calling mBART's tokenizer

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

### Setting the source and target languages of the tokenizer

In [None]:
# mBART tokenizers append a language-code token, so set Hindi as the source
# language and English as the target language before tokenizing.
if "mbart" in model_checkpoint:
    tokenizer.src_lang = "hi_IN"
    tokenizer.tgt_lang = "en_XX"

### Understanding the tokenizer

In [None]:
tokenizer("Please help me , bro .\n")

{'input_ids': [30607, 4358, 163, 6, 4, 7155, 6, 5, 2, 250010], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
tokenizer.convert_ids_to_tokens([30607, 4358, 163, 6, 4, 7155, 6, 5, 2, 250010])

['▁Please', '▁help', '▁me', '▁', ',', '▁bro', '▁', '.', '</s>', 'hi_IN']

In [None]:
tokenizer("help कर दो मेरी pulses भाई\n")

{'input_ids': [4358, 1896, 10850, 31500, 55111, 90, 53761, 2, 250010], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
tokenizer.convert_ids_to_tokens([4358, 1896, 10850, 31500, 55111, 90, 53761, 2, 250010])

['▁help', '▁कर', '▁दो', '▁मेरी', '▁puls', 'es', '▁भाई', '</s>', 'hi_IN']

In [None]:
with tokenizer.as_target_tokenizer():
    print(tokenizer(["help कर दो मेरी pulses भाई\n"]))

{'input_ids': [[4358, 1896, 10850, 31500, 55111, 90, 53761, 2, 250004]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [None]:
tokenizer.convert_ids_to_tokens([4358, 1896, 10850, 31500, 55111, 90, 53761, 2, 250004])

['▁help', '▁कर', '▁दो', '▁मेरी', '▁puls', 'es', '▁भाई', '</s>', 'en_XX']

In [None]:
tokenizer("help कर दो मेरी pulses भाई")

{'input_ids': [4358, 1896, 10850, 31500, 55111, 90, 53761, 2, 250010], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
tokenizer.convert_ids_to_tokens([4358, 1896, 10850, 31500, 55111, 90, 53761, 2, 250010])

['▁help', '▁कर', '▁दो', '▁मेरी', '▁puls', 'es', '▁भाई', '</s>', 'hi_IN']

In [None]:
with tokenizer.as_target_tokenizer():
    print(tokenizer(["Please help me , bro .\n"]))

{'input_ids': [[30607, 4358, 163, 6, 4, 7155, 6, 5, 2, 250004]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [None]:
tokenizer.convert_ids_to_tokens([30607, 4358, 163, 6, 4, 7155, 6, 5, 2, 250004])

['▁Please', '▁help', '▁me', '▁', ',', '▁bro', '▁', '.', '</s>', 'en_XX']

### Setting the max input and target length

In [None]:
# Token-count caps passed as max_length (with truncation) when tokenizing inputs and targets.
max_input_length = 128
max_target_length = 128

### Creating a function to tokenize the dataset

In [None]:
# Dataset column names that hold the source (Hinglish) and target (English) sentences.
source_lang = "hi"
target_lang = "en"

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        examples: Mapping that holds a list of source sentences under the
            ``source_lang`` key and a list of target sentences under the
            ``target_lang`` key.

    Returns:
        The tokenizer output for the source sentences, with an added
        ``"labels"`` entry containing the token ids of the target sentences.

    Raises:
        KeyError: If either language column is missing from ``examples``.
    """
    # Direct key lookup instead of filtering every column of the batch.
    inputs = examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Targets are tokenized in target mode so they end with the target language code.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
preprocess_function(raw_datasets['train'][:2])

{'input_ids': [[2139, 54693, 3849, 69671, 1920, 1552, 669, 70144, 24050, 1284, 30124, 4322, 1142, 10215, 646, 2073, 3264, 287, 6435, 421, 165340, 659, 44238, 967, 1346, 1780, 30124, 15392, 196426, 126658, 1682, 6425, 1253, 2, 250010], [4358, 1896, 10850, 31500, 55111, 90, 53761, 2, 250010]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[68062, 56, 47, 2355, 1552, 669, 70144, 2174, 398, 54, 959, 22120, 47, 1821, 7068, 7103, 18925, 87, 1221, 8306, 1957, 3853, 1632, 111, 935, 54180, 15400, 2301, 25, 18, 398, 1380, 2, 250004], [30607, 4358, 163, 6, 4, 7155, 6, 5, 2, 250004]]}

### Mapping the tokenize function to the dataset

In [None]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

### Downloading the mBART model from Hugging Face and importing other important functions

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

### Passing the Training Arguments

In [None]:
# One sequence per device step; with 4 accumulation steps the effective batch size is 4.
batch_size = 1
model_name = model_checkpoint.rpartition("/")[2]
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-{source_lang}-to-{target_lang}-v2",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    # Evaluate and checkpoint once per epoch; the best checkpoint by BLEU is reloaded at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    # Generate translations during evaluation so BLEU can be computed.
    predict_with_generate=True,
    push_to_hub=True,
)

### Calling the DataCollator function for padding

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

### Defining a function to compute the metric

In [None]:
import numpy as np

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and labels.

    Each label is additionally wrapped in a single-element list, giving one
    list of references per prediction.

    Returns:
        A tuple ``(stripped_preds, wrapped_labels)``.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels

In [None]:
def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays; ``preds`` may
            also be a tuple, in which case its first element is used.

    Returns:
        Dict with ``"bleu"`` (sacreBLEU score) and ``"gen_len"`` (mean number
        of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    preds = preds[0] if isinstance(preds, tuple) else preds
    pad_id = tokenizer.pad_token_id

    # -100 marks positions that cannot be decoded, so swap it for the pad id first.
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)["score"]
    gen_len = np.mean([np.count_nonzero(pred != pad_id) for pred in preds])

    return {"bleu": round(bleu, 4), "gen_len": round(gen_len, 4)}

### Passing the arguments to the Trainer

In [None]:
# Wire the model, arguments, tokenized splits, padding collator and metric into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/rahulacj/mbart-large-cc25-finetuned-hi-to-en-v2 into local empty directory.
Using amp half precision backend


### Training

In [None]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: hi, en. If hi, en are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15823
  Num Epochs = 10
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 39550


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
0,1.8971,1.60145,19.3557,43.7594
1,1.3266,1.491666,19.1404,35.3155
2,0.9906,1.53543,26.999,26.7497
3,0.6987,1.64571,31.9572,23.4565
4,0.5073,1.854372,34.1169,22.1507
5,0.3554,2.098537,34.0746,22.2396
6,0.2423,2.253406,33.2205,22.2184
7,0.1918,2.401406,32.2001,22.635
8,0.1423,2.506733,32.4074,22.8716
9,0.1105,2.561783,33.1965,22.5905


The following columns in the evaluation set  don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: hi, en. If hi, en are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1978
  Batch size = 1
Saving model checkpoint to mbart-large-cc25-finetuned-hi-to-en-v2/checkpoint-3955
Configuration saved in mbart-large-cc25-finetuned-hi-to-en-v2/checkpoint-3955/config.json
Model weights saved in mbart-large-cc25-finetuned-hi-to-en-v2/checkpoint-3955/pytorch_model.bin
tokenizer config file saved in mbart-large-cc25-finetuned-hi-to-en-v2/checkpoint-3955/tokenizer_config.json
Special tokens file saved in mbart-large-cc25-finetuned-hi-to-en-v2/checkpoint-3955/special_tokens_map.json
tokenizer config file saved in mbart-large-cc25-finetuned-hi-to-en-v2/tokenizer_config.json
Special tokens file saved in mbart-large-cc25-finetuned-hi-to-en-v2/special_tokens_map.json

TrainOutput(global_step=39550, training_loss=0.7448179870730857, metrics={'train_runtime': 53967.1542, 'train_samples_per_second': 2.932, 'train_steps_per_second': 0.733, 'total_flos': 7741635130392576.0, 'train_loss': 0.7448179870730857, 'epoch': 10.0})

In [None]:
def print_gpu_utilization():
    """Print the memory currently in use on GPU 0, as reported by nvidia-smi."""
    import subprocess

    # Defined here because no earlier cell of the notebook provides this helper.
    used_mib = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits", "--id=0"],
        text=True,
    ).strip()
    print(f"GPU memory occupied: {used_mib} MB.")


print_gpu_utilization()

GPU memory occupied: 14106 MB.


In [None]:
!nvidia-smi

Tue May 10 22:44:23 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    33W /  70W |  14106MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Evaluating Test

In [None]:
trainer.evaluate(tokenized_datasets["test"])

The following columns in the evaluation set  don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: hi, en. If hi, en are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1978
  Batch size = 1


{'epoch': 10.0,
 'eval_bleu': 33.4814,
 'eval_gen_len': 21.8974,
 'eval_loss': 1.802693486213684,
 'eval_runtime': 1216.383,
 'eval_samples_per_second': 1.626,
 'eval_steps_per_second': 1.626}

In [None]:
!nvidia-smi

Tue May 10 23:06:02 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    33W /  70W |  14106MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Uploading the model to the Hugging Face Platform

In [None]:
trainer.push_to_hub()

Saving model checkpoint to mbart-large-cc25-finetuned-hi-to-en-v2
Configuration saved in mbart-large-cc25-finetuned-hi-to-en-v2/config.json
Model weights saved in mbart-large-cc25-finetuned-hi-to-en-v2/pytorch_model.bin
tokenizer config file saved in mbart-large-cc25-finetuned-hi-to-en-v2/tokenizer_config.json
Special tokens file saved in mbart-large-cc25-finetuned-hi-to-en-v2/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.33k/2.28G [00:00<?, ?B/s]

Upload file runs/May10_07-42-37_40a3ec82fc7e/events.out.tfevents.1652168597.40a3ec82fc7e.81.0:  16%|#6        …

Upload file runs/May10_07-42-37_40a3ec82fc7e/events.out.tfevents.1652223886.40a3ec82fc7e.81.2: 100%|##########…

remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/rahulacj/mbart-large-cc25-finetuned-hi-to-en-v2
   3644f08..d0f608b  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}, 'metrics': [{'name': 'Bleu', 'type': 'bleu', 'value': 33.4814}]}
remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/rahulacj/mbart-large-cc25-finetuned-hi-to-en-v2
   d0f608b..6a8e23b  main -> main



'https://huggingface.co/rahulacj/mbart-large-cc25-finetuned-hi-to-en-v2/commit/d0f608befaec141ba695efe097b9d0f5f4fd0f79'

### Examples of translation using the new model

In [None]:
# Load the fine-tuned checkpoint from the Hub for inference.
# NOTE(review): this is the "...-hi-to-en" repository, while the training run above pushed to
# "...-hi-to-en-v2" — confirm which checkpoint is intended.
my_model = AutoModelForSeq2SeqLM.from_pretrained("rahulacj/mbart-large-cc25-finetuned-hi-to-en")

loading configuration file https://huggingface.co/rahulacj/mbart-large-cc25-finetuned-hi-to-en/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/9b5055d6f13ee25900be0c6338b4094f0d29abce75e5b06ede564ea79553f995.b61b2ad9a69e42b4b5f1b266588f5877aaec080774d64402837228d9c1f84ff8
Model config MBartConfig {
  "_name_or_path": "rahulacj/mbart-large-cc25-finetuned-hi-to-en",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": true,
  "architectures": [
    "MBartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_eos_

In [None]:
# Hub repository of the fine-tuned model; the tokenizer is loaded from the
# same repository as my_model so vocabulary and language codes match.
model_ckpt = "rahulacj/mbart-large-cc25-finetuned-hi-to-en"
my_tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

loading file https://huggingface.co/rahulacj/mbart-large-cc25-finetuned-hi-to-en/resolve/main/sentencepiece.bpe.model from cache at /root/.cache/huggingface/transformers/0edad80139bcfcc4e69b5e30da7c955b931f56e6e7ff6c34381a513a2a1bffb2.71e50b08dbe7e5375398e165096cacc3d2086119d6a449364490da6908de655e
loading file https://huggingface.co/rahulacj/mbart-large-cc25-finetuned-hi-to-en/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/92730dfcec2e91af8a7c1e91fed8259e6b356b6e3785199f6902a026fa25b2c2.75faf4f2f00f82207db05521df67e4848ad7dac3ce23fbae0d6fafd8abe21e15
loading file https://huggingface.co/rahulacj/mbart-large-cc25-finetuned-hi-to-en/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/rahulacj/mbart-large-cc25-finetuned-hi-to-en/resolve/main/special_tokens_map.json from cache at /root/.cache/huggingface/transformers/a1e704c77f0c13761ad705580d2635928394c2fc094edea4decde1e46b77c972.19324b0109f111d6321f43a8801ad3ecffd1d9baccc

In [None]:
# Only the source language is set on the tokenizer here; the target language
# is selected at generation time through forced_bos_token_id.
if "mbart" in model_ckpt:
    my_tokenizer.src_lang = "hi_IN"

In [None]:
article_hi = "तेरे पास एक week से phone आया है , तूने 20-25 themes तो डाल दी है"

In [None]:
encoded_hi = my_tokenizer(article_hi, return_tensors="pt")

In [None]:
generated_tokens = my_model.generate(**encoded_hi, forced_bos_token_id=my_tokenizer.lang_code_to_id["en_XX"])

In [None]:
my_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

['You have received a phone from a week, you have put 20-25 themes.']

In [None]:
# another example

In [None]:
article_hi_1 = "आज उसने बताया होगा कोर्स का layout"

In [None]:
encoded_hi_1 = my_tokenizer(article_hi_1, return_tensors="pt")

In [None]:
generated_tokens_1 = my_model.generate(**encoded_hi_1, forced_bos_token_id=my_tokenizer.lang_code_to_id["en_XX"])

In [None]:
my_tokenizer.batch_decode(generated_tokens_1, skip_special_tokens=True)

['Today he will tell the layout of the classes']