In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for file_path in (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
):
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/supervised-finetuned-weights/jax/default/1/lora_weights_epoch3.lora.h5
/kaggle/input/gemma-language-tuning/submission_instructions.txt
/kaggle/input/multilingual-text-corpus/multilingual_corpus_with_tags_reordered.txt
/kaggle/input/kaggleinputsupervised-finetuned-weightsjax/jax/default/1/lora_weights_epoch1.lora.h5
/kaggle/input/gemma2/keras/gemma2_instruct_2b_en/1/config.json
/kaggle/input/gemma2/keras/gemma2_instruct_2b_en/1/tokenizer.json
/kaggle/input/gemma2/keras/gemma2_instruct_2b_en/1/metadata.json
/kaggle/input/gemma2/keras/gemma2_instruct_2b_en/1/model.weights.h5
/kaggle/input/gemma2/keras/gemma2_instruct_2b_en/1/assets/tokenizer/vocabulary.spm


In [2]:
# Install Keras 3 last. See https://keras.io/getting_started/ for more details.
!pip install -q -U keras-nlp datasets
!pip install -q -U keras

import os

# Both variables must be in the environment before Keras is imported:
#   KERAS_BACKEND                  - selects the backend (JAX here).
#   XLA_PYTHON_CLIENT_MEM_FRACTION - set to avoid memory fragmentation on the JAX backend.
os.environ.update(
    {
        "KERAS_BACKEND": "jax",
        "XLA_PYTHON_CLIENT_MEM_FRACTION": "1.00",
    }
)

import keras_nlp
import keras

# Run at half precision.
#keras.config.set_floatx("bfloat16")

# Training Configurations
# NOTE(review): token_limit, lora_rank and model_id are assigned again in later
# cells, so the value in effect depends on which cells have been run.
token_limit = 1024  # read by text_gen() as the max_length passed to generate()
lora_name = "arya"  # not referenced anywhere else in the visible notebook
lora_rank = 4  # LoRA rank (re-assigned to the same value before enable_lora is called)
lr_value = 1e-4  # learning rate handed to the AdamW optimizer
train_epoch = 20  # NOTE(review): the fit cell defines its own `epochs = 10` and does not read this — confirm which is intended
model_id = "gemma2_instruct_2b_en"  # preset name passed to from_preset()

In [3]:
import keras
import keras_nlp

import time

# Build the Gemma causal language model from the preset named by `model_id`
# and print its summary. `gemma_lm` is the notebook-wide model object that
# text_gen() and the training cells operate on.
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset(model_id)
gemma_lm.summary()

# Reference point of the interval currently being timed; written by tick().
tick_start = 0

def tick():
    """Start (or restart) the stopwatch that tock() reports on."""
    global tick_start
    # perf_counter() is a monotonic clock, so the measured interval cannot be
    # skewed (or go negative) when the system wall clock is adjusted.
    tick_start = time.perf_counter()

def tock():
    """Print the number of seconds elapsed since the most recent tick()."""
    print(f"TOTAL TIME ELAPSED: {time.perf_counter() - tick_start:.2f}s")

def text_gen(prompt, max_length=None):
    """Generate a reply for one user prompt and print it with the elapsed time.

    The prompt is wrapped in the ``<start_of_turn>`` / ``<end_of_turn>`` chat
    markers, passed to ``gemma_lm.generate`` and the returned text is printed.
    Nothing is returned.

    Args:
        prompt: Text placed inside the user turn.
        max_length: Value forwarded to ``generate(max_length=...)``. When
            omitted, the notebook-level ``token_limit`` is read at call time,
            which is the original behavior.
    """
    if max_length is None:
        max_length = token_limit
    tick()
    # Named `chat_prompt` rather than `input` so the builtin is not shadowed.
    chat_prompt = f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
    output = gemma_lm.generate(chat_prompt, max_length=max_length)
    print("\nGemma output:")
    print(output)
    tock()

# Running inference with the three languages before fine-tuning

In [None]:
# Telugu prompt — roughly: "As he left he said he would come back, but he never
# returned. What do you think of him?" (gloss unverified — confirm with a speaker)
text_gen("వెళ్ళిపోతూ మళ్లీ వస్తానని అన్నాడు. కానీ, అతను తిరిగి రాలేదు. అతని గురించి ఏం అనిపిస్తోంది?")

In [None]:
# Hindi prompt — roughly: "He had said he would come back, but he did not
# return. What do you think about him?" (gloss unverified — confirm with a speaker)
text_gen("उसने कहा था कि वो लौटकर आएगा, लेकिन वो वापस नहीं आया। उसके बारे में आपको क्या लगता है?")

In [None]:
# Nepali prompt — roughly: "He said he would return, but he did not. What do
# you think about him?" (gloss unverified — confirm with a speaker)
text_gen("उहाँले फर्किन्छु भन्नु भयो, तर उहाँ फर्किनु भएन। तपाईंलाई उहाँबारे के लाग्छ?")

In [8]:
import keras
import keras_nlp
from datasets import load_dataset

# Load Gemma tokenizer
# Only the tokenizer is needed here: it is used below to count tokens per example.
model_id = "gemma2_instruct_2b_en"
tokenizer = keras_nlp.models.GemmaTokenizer.from_preset(model_id)

# Configuration
# NOTE: this rebinds the notebook-wide `token_limit` (1024 in the training
# configuration cell). text_gen() and the later
# `gemma_lm.preprocessor.sequence_length = token_limit` assignment read
# whichever value was assigned last.
token_limit = 256  # Maximum token length
# NOTE(review): num_data_limit is not referenced by the preparation loop in
# this cell, so every line that passes the length filter is kept — confirm
# whether a cap was intended.
num_data_limit = 1000  # Limit on the number of examples to process

# Language tags mapping
# Keys are the short codes found inside the leading "<...>" marker of a corpus
# line; values are the language names prepended to each training example.
language_tags = {
     "san":"sanskrit",
    "tel":"telugu",
    "hin":"hindi",
    "npi":"nepali"
}

# Load dataset
# The corpus file is read with the `datasets` "text" loader into a single
# "train" split; each record exposes its content under the "text" key.
dataset_path = "/kaggle/input/multilingual-text-corpus/multilingual_corpus_with_tags_reordered.txt"
raw_dataset = load_dataset("text", data_files={"train": dataset_path})

# Build the fine-tuning corpus: a list of language-tagged text strings.
train_data = []

for record in raw_dataset["train"]:
    line = record["text"]

    # Each corpus line is expected to start with a "<code>" marker
    # (e.g. "<tel> ..."): take everything before the first ">" and drop the
    # leading "<".
    lang_code = line.partition(">")[0][1:]
    # Codes missing from language_tags fall back to the literal "<unk>" label.
    lang_name = language_tags.get(lang_code, "<unk>")

    # Prefix the language name; the original "<code>" marker stays in the text.
    candidate = f"{lang_name} {line}"

    # Keep the example only if its token count is strictly below the limit.
    if len(tokenizer.tokenize(candidate)) < token_limit:
        train_data.append(candidate)
    


# Output dataset stats and examples
print(f"Number of training examples: {len(train_data)}")
# Show up to the first two examples. zip() stops early when fewer than two
# examples survived the length filter, instead of raising IndexError.
for ordinal, sample in zip(("First", "Second"), train_data):
    print(f"{ordinal} example:\n{sample}")


Number of training examples: 97573
First example:
sanskrit <san> स्वदेहे चेल्लिखितवान्वचनमनवगम्यं तेनाप्यचिन्तिते फल उपलब्धे किं कर्तव्यम् ?
Second example:
sanskrit <san> तत्त्वमसि ।


In [10]:
# Switch the backbone to LoRA (Low-Rank Adaptation) training and make the
# preprocessor use the current token_limit as its sequence length.
lora_rank = 4  # LoRA rank
gemma_lm.backbone.enable_lora(rank=lora_rank)
gemma_lm.preprocessor.sequence_length = token_limit

# AdamW optimizer; variables whose names match "bias" or "scale" are left out
# of weight decay.
optimizer = keras.optimizers.AdamW(weight_decay=0.01, learning_rate=lr_value)
optimizer.exclude_from_weight_decay(var_names=["bias", "scale"])

# Loss is computed on raw logits; accuracy is tracked as a weighted metric.
token_loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
token_accuracy = keras.metrics.SparseCategoricalAccuracy()

gemma_lm.compile(
    optimizer=optimizer,
    loss=token_loss,
    weighted_metrics=[token_accuracy],
)


In [11]:
class SaveLoRAWeightsCallback(keras.callbacks.Callback):
    """Callback that writes the backbone's LoRA weights to disk after each epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Save the LoRA weights of the notebook-level ``gemma_lm`` model.

        Args:
            epoch: Epoch index passed in by Keras; the file name uses
                ``epoch + 1``.
            logs: Metrics passed in by Keras (not used here).
        """
        checkpoint_file = f"/kaggle/working/lora_weights_epoch{epoch + 1}.lora.h5"
        gemma_lm.backbone.save_lora_weights(checkpoint_file)
        print(f"Saved LoRA weights to: {checkpoint_file}")


In [None]:
# Fine-tune the model
# Hyperparameters for this training run.
batch_size = 8
epochs = 10

# train_data is the list of text strings built earlier in the notebook.
# The callback saves the LoRA weights after each epoch.
gemma_lm.fit(
    train_data,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[SaveLoRAWeightsCallback()],
)


In [1]:

import keras_nlp
import keras

# Mount Google Drive


# Load Gemma 2 model
# This rebinds `gemma_lm` to a freshly loaded preset model; text_gen() uses
# whichever object `gemma_lm` currently refers to.
model_id = "gemma2_instruct_2b_en"
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset(model_id)

# Enable LoRA and load weights
# LoRA is enabled (rank 4) on the backbone first, then the saved LoRA weights
# are loaded from the file below.
lora_rank = 4
gemma_lm.backbone.enable_lora(rank=lora_rank)
# NOTE(review): the directory listing printed by the first cell shows
# lora_weights_epoch3.lora.h5 under /kaggle/input/supervised-finetuned-weights/
# and only lora_weights_epoch1.lora.h5 under
# /kaggle/input/kaggleinputsupervised-finetuned-weightsjax/ — confirm this path
# matches the datasets attached to the notebook.
lora_weights_path = "/kaggle/input/kaggleinputsupervised-finetuned-weightsjax/jax/default/1/lora_weights_epoch3.lora.h5"
gemma_lm.backbone.load_lora_weights(lora_weights_path)

print("LoRA weights loaded successfully.")
gemma_lm.summary()


LoRA weights loaded successfully.


In [None]:
import json

# Load translation dataset
# NOTE(review): this is a /content/drive/... path, while the other data in the
# notebook is read from /kaggle/input — confirm where this file lives.
json_path = "/content/drive/MyDrive/Translations_Multilingual.json"
with open(json_path, "r", encoding="utf-8") as f:
    translation_data = json.load(f)

# Configuration
token_limit = 1024
train_data = []

# The preprocessor packs every example to its fixed sequence_length, so the
# length of its "token_ids" output does not reflect the real size of the text
# and cannot be used to filter long examples. Count tokens with the bare
# tokenizer instead, and reserve room for the start/end tokens the
# preprocessor adds around each example by default.
length_tokenizer = gemma_lm.preprocessor.tokenizer
special_token_budget = 2

# Prepare data for fine-tuning
# NOTE(review): iterating translation_data[0] expects the first top-level JSON
# element to itself be a sequence of {"prompt": ..., "response": ...} dicts —
# confirm against the file's structure.
for example in translation_data[0]:
    prompt = example["prompt"]
    response = example["response"]

    # Prepare input-output text format for supervised learning
    input_text = f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n{response}<end_of_turn>"

    # True (unpadded) token count of the formatted example.
    token_count = len(length_tokenizer.tokenize(input_text)) + special_token_budget

    # Filter long sequences based on token length
    if token_count <= token_limit:
        train_data.append(input_text)

# Display dataset stats
print(f"Number of training examples: {len(train_data)}")
if len(train_data) > 0:
    print(f"First example:\n{train_data[0]}")


In [None]:
import json

# Save tokenized training data
# Persist train_data as pretty-printed UTF-8 JSON.
# Despite the variable name, train_data holds the formatted text strings
# appended during preparation, not token ids.
tokenized_data_path = "/content/drive/MyDrive/Tokenized_Translations.json"
serialized = json.dumps(train_data, ensure_ascii=False, indent=4)
with open(tokenized_data_path, "w", encoding="utf-8") as out_file:
    out_file.write(serialized)

print(f"Tokenized data saved to: {tokenized_data_path}")

In [2]:
import json
# Read the previously saved training examples back into train_data.
tokenized_data_path = "/content/drive/MyDrive/Tokenized_Translations.json"
with open(tokenized_data_path, "r", encoding="utf-8") as in_file:
    train_data = json.loads(in_file.read())

print(f"Tokenized data loaded successfully. Number of examples: {len(train_data)}")


In [12]:
# text_gen() reads the global token_limit when it is called, so it is set
# explicitly here before generating.
token_limit=1024
# NOTE(review): the recorded output for this cell shows the prompt
# "Translate to telugu: ..." rather than the Hindi prompt below, so it appears
# to come from an earlier run — re-run the cell to refresh it.
text_gen("Translate to Hindi: The sun rises in the east.")


Gemma output:
<start_of_turn>user
Translate to telugu: The sun rises in the east.<end_of_turn>
<start_of_turn>model
सूरज पूर्व में उगता है।<end_of_turn>
TOTAL TIME ELAPSED: 33.79s


In [11]:
# Open-ended generation check: ask the model for a story in Hindi.
text_gen("Tell a story in hindi")


Gemma output:
<start_of_turn>user
Tell a story in hindi<end_of_turn>
<start_of_turn>model
एक दिन, एक बूढ़ा मनुष्य, जो अपने जीवन में बहुत ही दुखी था, ने एक बूढ़ी बिल्ली को देखा, जो अपने बच्चे को खाने के लिये चिल्ला रही थी। उस मनुष्य ने बिल्ली को समझाया, कि तू अपने बच्चे को खाने के लिये चिल्लाती है, पर वह तुझे खाने के लिये नहीं चिल्लाती। बिल्ली ने उस मनुष्य को उत्तर दिया, कि मैं तुझे खाने के लिये नहीं चिल्लाती, पर मैं तुझे अपने बच्चे को खाने के लिये चिल्लाती हूं।<end_of_turn>
TOTAL TIME ELAPSED: 4.93s
