In [4]:
!pip install transformers




In [5]:
!nvidia-smi

Thu Jun 13 16:37:13 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [6]:
from transformers import pipeline

model_checkpoint="Helsinki-NLP/opus-mt-en-hi"
translator = pipeline("translation" , model = model_checkpoint)
translator("How are you?")

[{'translation_text': 'आप कैसे हैं?'}]

In [7]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
classifier(" I loved star war so much")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9995928406715393}]

In [8]:
!pip install datasets transformers[sentencepiece] sacrebleu -q

In [1]:
!pip install --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install transformers  accelerate

Found existing installation: transformers 4.41.2
Uninstalling transformers-4.41.2:
  Successfully uninstalled transformers-4.41.2
Found existing installation: accelerate 0.31.0
Uninstalling accelerate-0.31.0:
  Successfully uninstalled accelerate-0.31.0
Collecting transformers
  Using cached transformers-4.41.2-py3-none-any.whl (9.1 MB)
Collecting accelerate
  Using cached accelerate-0.31.0-py3-none-any.whl (309 kB)
Installing collected packages: transformers, accelerate
Successfully installed accelerate-0.31.0 transformers-4.41.2


## Huggingface model and data


In [2]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay


In [3]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"

In [4]:
raw_datasets = load_dataset("cfilt/iitb-english-hindi")


#### from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

#### tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")

#### model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")

In [5]:
raw_datasets


DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

In [6]:
raw_datasets['train'][1]


{'translation': {'en': 'Accerciser Accessibility Explorer',
  'hi': 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'}}

In [7]:
!pip install sacremoses



In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


In [9]:
tokenizer("Dasari, this is a sentense")

{'input_ids': [16826, 16, 15027, 2, 90, 23, 19, 374, 11271, 393, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
with tokenizer.as_target_tokenizer():
  print(tokenizer(["अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें"]))

{'input_ids': [[63, 2025, 18, 16155, 346, 20311, 24, 2279, 679, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}




In [11]:
max_input_length =128
max_target_length =128

source_lang = "en"
target_lang = "hi"

def preprocess_function(examples):
  inputs  = [ex[source_lang] for ex in examples["translation"]]
  targets = [ex[target_lang] for ex in examples["translation"]]
  model_inputs = tokenizer(inputs , max_length = max_input_length , truncation = True)

  #setup the tokenizer for targets
  with tokenizer.as_target_tokenizer():
    labels=tokenizer(targets, max_length=max_target_length, truncation=True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [12]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/1659083 [00:00<?, ? examples/s]

Map:   0%|          | 0/520 [00:00<?, ? examples/s]

Map:   0%|          | 0/2507 [00:00<?, ? examples/s]

In [13]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

tf_model.h5:   0%|          | 0.00/306M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [14]:
batch_size = 16
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 1

In [17]:
data_collator = DataCollatorForSeq2Seq(tokenizer , model=model , return_tensors = "tf")


In [19]:
generation_data_collator = DataCollatorForSeq2Seq(tokenizer , model=model , return_tensors = "tf", pad_to_multiple_of=128)

In [31]:
train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["test"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=generation_data_collator
)

In [39]:
validation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=generation_data_collator
)

In [41]:
generation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=8,
    shuffle=False,
    collate_fn=generation_data_collator
)

In [44]:
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
model.compile(optimizer=optimizer)

In [45]:
model.fit(train_dataset, validation_data = validation_dataset, epochs=1)



<tf_keras.src.callbacks.History at 0x78a421763d30>

# inferencing

In [46]:
model.save_pretrained("tf_model/")

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}


In [47]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_model/")

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at tf_model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [53]:
input_text = "I am learning coding.  How are you?"

tokenized = tokenizer([input_text], return_tensors='np')
out = model.generate(**tokenized, max_length=128)
print(out)

tf.Tensor([[61949    73   280    51    22     0 61949 61949]], shape=(1, 8), dtype=int32)


In [59]:
with tokenizer.as_target_tokenizer():
  print(tokenizer.decode(out[0] , skip_special_tokens=True))

तुम कैसे हो?
