In [1]:
import sys
# Record the interpreter version so the outputs below can be tied to an environment.
print("Python version:", sys.version)

Python version: 3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:45:18) [GCC 12.3.0]


In [3]:
import tensorflow as tf
import keras
# Record the TensorFlow and Keras versions used for this run.
print("TensorFlow version:", tf.__version__)
print("Keras version:", keras.__version__)

TensorFlow version: 2.16.1
Keras version: 3.3.3


In [4]:
# Ask TensorFlow which physical GPU devices it can see and report them by name.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("No GPU detected.")
else:
    print(f"GPUs detected: {[gpu.name for gpu in gpus]}")

GPUs detected: ['/physical_device:GPU:0', '/physical_device:GPU:1']


In [5]:
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install sacrebleu

  pid, fd = os.forkpty()


Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.0.0 sacrebleu-2.4.3


In [6]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

## Helsinki-NLP/opus-mt-en-hi model

Source: https://huggingface.co/Helsinki-NLP/opus-mt-en-hi

In [7]:
# Location of the pretrained checkpoint (a Kaggle input directory); both the
# tokenizer and the model are loaded from this path.
model_checkpoint = "/kaggle/input/modelkag1/modelkag1"

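# Note: the checkpoint above is loaded from a local directory; a Hugging Face Hub model ID (e.g. `Helsinki-NLP/opus-mt-en-hi`) can be used instead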

# The Dataset

Source: https://huggingface.co/datasets/momo22/eng2nep

In [8]:
# Download (or reuse the cached copy of) the English-Nepali parallel corpus.
raw_datasets = load_dataset("momo22/eng2nep")

README.md:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

train_data.csv:   0%|          | 0.00/615M [00:00<?, ?B/s]

validation_data.csv:   0%|          | 0.00/76.7M [00:00<?, ?B/s]

test_data.csv:   0%|          | 0.00/76.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1591270 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/198909 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/198909 [00:00<?, ? examples/s]

In [9]:
# Display the available splits with their column names and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['English', 'Nepali'],
        num_rows: 1591270
    })
    validation: Dataset({
        features: ['English', 'Nepali'],
        num_rows: 198909
    })
    test: Dataset({
        features: ['English', 'Nepali'],
        num_rows: 198909
    })
})

In [10]:
from datasets import DatasetDict

# Upper bounds on the number of rows kept per split. The defaults equal the
# full split sizes, so nothing is dropped; lower them for a quicker experiment.
max_train_rows = 1591270
max_validation_rows = 198909

# Clamp to the actual split length so select() cannot raise if a split turns
# out to be smaller than the requested bound.
train_subset = raw_datasets['train'].select(
    range(min(max_train_rows, len(raw_datasets['train'])))
)
validation_subset = raw_datasets['validation'].select(
    range(min(max_validation_rows, len(raw_datasets['validation'])))
)
# The test split is always kept in full.
test_subset = raw_datasets['test']

raw_datasets = DatasetDict({
    'train': train_subset,
    'validation': validation_subset,
    'test': test_subset
})

In [11]:
# Confirm the split sizes after the subset selection above.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['English', 'Nepali'],
        num_rows: 1591270
    })
    validation: Dataset({
        features: ['English', 'Nepali'],
        num_rows: 198909
    })
    test: Dataset({
        features: ['English', 'Nepali'],
        num_rows: 198909
    })
})

In [12]:
# Inspect one raw training pair (row 10) to see what the text looks like.
raw_datasets['train'][10]

{'English': 'Sometimes when he is not working, he is counted on the old table to know how much he died after hanging on the shrimps sitting on the masterpiece.\n',
 'Nepali': 'कहिलेकाहीँ ऊ काम नभएको बेला जूठो टेबिलमा झुप्प (गुरुप्प परेर बसेका झिँगाहरूमाथि पोछा हानेपछि कति मरेछन् भन्ने जान्न गन्ती गरिहेर्ने गर्छ।\n'}

# Preprocessing the data

In [13]:
# Load the tokenizer from the same checkpoint location that the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [14]:
# Tokenize a single English sentence to inspect the ids and attention mask produced.
tokenizer("Hello, this is a sentence!")

{'input_ids': [12110, 2, 90, 23, 19, 8800, 61, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
# Tokenize a list of sentences; each sentence gets its own id sequence.
tokenizer(["Hello, this is a sentence!", "This is another sentence."])

{'input_ids': [[12110, 2, 90, 23, 19, 8800, 61, 0], [239, 23, 414, 8800, 3, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [16]:
# Tokenize a Nepali sentence as *target* text. `text_target` replaces the
# deprecated `as_target_tokenizer()` context manager.
print(tokenizer(text_target=["म धेरै ज्ञानी छु।"]))

{'input_ids': [[4095, 11529, 174, 40838, 22159, 27667, 40, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1]]}




In [17]:
# Maximum number of tokens kept per source / target sentence; preprocessing
# truncates longer sequences to these lengths.
max_input_length = 128
max_target_length = 128

# Names of the dataset columns holding the source and target sentences.
source_lang = "English"
target_lang = "Nepali"


def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: Batch mapping column names to lists of strings. It must
            contain the ``source_lang`` and ``target_lang`` columns.

    Returns:
        The tokenizer output for the source sentences (``input_ids`` and
        ``attention_mask``) with the target token ids stored under ``labels``.
        Sources are truncated to ``max_input_length`` tokens and targets to
        ``max_target_length`` tokens.
    """
    # Read the columns through the configured names instead of hard-coding them.
    inputs = list(examples[source_lang])
    targets = list(examples[target_lang])
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target` tokenizes as target-side text; it replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [18]:
# Sanity-check the preprocessing on the first two training examples.
preprocess_function(raw_datasets["train"][:2])

{'input_ids': [[893, 2, 29032, 10, 987, 8, 4, 6758, 26312, 44, 20, 69, 0], [256, 2431, 27310, 52, 178, 42, 2107, 7, 2975, 61030, 121, 4, 23771, 3666, 80, 4915, 68, 4, 39870, 23771, 3666, 4599, 2, 74, 6335, 3, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[17, 12605, 13, 4412, 34220, 260, 3752, 1953, 179, 2, 36459, 3353, 20196, 44, 1434, 61737, 0], [164, 7584, 3644, 3625, 197, 6213, 4807, 20429, 6213, 5928, 44, 59061, 6395, 44, 25592, 3899, 3549, 12718, 2936, 51, 3085, 25, 22, 147, 2, 141, 4015, 1059, 7584, 8111, 314, 26441, 174, 40, 0]]}

In [19]:
# Tokenize every split; batched=True hands preprocess_function lists of examples per call.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/1591270 [00:00<?, ? examples/s]

Map:   0%|          | 0/198909 [00:00<?, ? examples/s]

Map:   0%|          | 0/198909 [00:00<?, ? examples/s]

In [20]:
# Load the TensorFlow seq2seq model from the same checkpoint as the tokenizer.
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at /kaggle/input/modelkag1/modelkag1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [21]:
# Training hyperparameters.
batch_size = 32  # examples per training / validation batch
learning_rate = 2e-5  # passed to the AdamWeightDecay optimizer
weight_decay = 0.01  # passed to the optimizer as weight_decay_rate
num_train_epochs = 1  # intended number of passes over the training data

In [22]:
# Collator used to assemble training and validation batches as TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [23]:
# Collator for the generation dataset; pads sequence lengths to a multiple of 128.
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128)

In [24]:
# Training pipeline: shuffled batches of the tokenized train split.
train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["train"],
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=True,
)

In [25]:
# Validation pipeline: same batching as training, but in a fixed (unshuffled) order.
validation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=False,
)

In [26]:
# Generation pipeline: one validation example per batch, in order, collated
# with the generation-specific collator.
generation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    collate_fn=generation_data_collator,
    batch_size=1,
    shuffle=False,
)

In [27]:
# Build the optimizer from the hyperparameters defined earlier in the notebook.
optimizer = AdamWeightDecay(
    weight_decay_rate=weight_decay,
    learning_rate=learning_rate,
)
# Only the optimizer is supplied; no explicit loss is passed to compile().
model.compile(optimizer=optimizer)

In [28]:
# Train on the first GPU. The epoch count comes from num_train_epochs so the
# hyperparameter cell stays the single place to change it.
with tf.device('/GPU:0'):
    model.fit(
        train_dataset,
        validation_data=validation_dataset,
        epochs=num_train_epochs,
    )

Cause: for/else statement not yet supported


In [29]:
# Remove these generation-related entries from the model config before saving.
# pop(..., None) keeps the cell re-runnable: a key that is already absent is
# ignored instead of raising KeyError.
for _config_key in ("max_length", "num_beams", "bad_words_ids"):
    model.config.__dict__.pop(_config_key, None)

# Write the fine-tuned model to the Kaggle working directory.
model.save_pretrained("/kaggle/working/tf_model/")