### Set environment variables

Set environment variables for `KAGGLE_USERNAME` and `KAGGLE_KEY`.

In [1]:
import os
from google.colab import userdata

# `userdata.get` only exists on Colab. On any other system, export
# KAGGLE_USERNAME and KAGGLE_KEY in whatever way your environment expects.

# Map each Kaggle environment variable to the Colab secret that holds its value.
for env_var, secret_name in (("KAGGLE_USERNAME", 'username'), ("KAGGLE_KEY", 'key')):
    os.environ[env_var] = userdata.get(secret_name)

**Install dependencies**

In [2]:
# Install Keras 3 last. See https://keras.io/getting_started/ for more details.
!pip install -q -U keras-nlp
!pip install -U keras

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/465.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/465.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m460.8/465.2 kB[0m [31m6.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.2/465.2 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
Collecting keras
  Downloading keras-3.0.5-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras
  Attempting uninstall: ker

**Select a backend**

In [3]:
# Both settings are applied before `keras` is imported in the next cell.
os.environ.update(
    {
        # Backend choice: "jax" here; "torch" or "tensorflow" are the alternatives.
        "KERAS_BACKEND": "jax",
        # Avoid memory fragmentation on JAX backend.
        "XLA_PYTHON_CLIENT_MEM_FRACTION": "1.00",
    }
)

**Import Packages**

In [4]:
import keras
import keras_nlp

**Load Dataset**

In [5]:
# Download the databricks-dolly-15k dataset (JSON Lines) from Hugging Face.
# `-O` fixes the local filename that the loading cell opens afterwards.
!wget -O databricks-dolly-15k.jsonl https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl

--2024-02-22 04:54:11--  https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl
Resolving huggingface.co (huggingface.co)... 13.35.7.5, 13.35.7.57, 13.35.7.81, ...
Connecting to huggingface.co (huggingface.co)|13.35.7.5|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/34/ac/34ac588cc580830664f592597bb6d19d61639eca33dc2d6bb0b6d833f7bfd552/2df9083338b4abd6bceb5635764dab5d833b393b55759dffb0959b6fcbf794ec?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27databricks-dolly-15k.jsonl%3B+filename%3D%22databricks-dolly-15k.jsonl%22%3B&Expires=1708834136&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwODgzNDEzNn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8zNC9hYy8zNGFjNTg4Y2M1ODA4MzA2NjRmNTkyNTk3YmI2ZDE5ZDYxNjM5ZWNhMzNkYzJkNmJiMGI2ZDgzM2Y3YmZkNTUyLzJkZjkwODMzMzhiNGFiZDZiY2ViNTYzNTc2NGRhYjVkODMzYjM5M2

***Use a subset of 1,000 training examples so the notebook executes faster.***

In [6]:
import json

# Prompt template used to format every training example. Later cells reuse it
# to build inference prompts, so it is defined once at cell level rather than
# inside the loop (where it would be undefined if the file had no lines).
template = "Instruction:\n{instruction}\n\nResponse:\n{response}"

data = []
# Decode explicitly as UTF-8 so non-ASCII characters in the dataset are read
# the same way regardless of the system locale.
with open("databricks-dolly-15k.jsonl", encoding="utf-8") as file:
    for line in file:
        # Tolerate blank lines (e.g. a trailing newline) instead of crashing
        # in json.loads.
        if not line.strip():
            continue
        features = json.loads(line)
        # Filter out examples with context, to keep it simple.
        if features["context"]:
            continue
        # Format the entire example as a single string.
        data.append(template.format(**features))

# Only use 1000 training examples, to keep it fast.
data = data[:1000]

In [7]:
# Preview the first ten formatted training strings.
data[:10]

['Instruction:\nWhich is a species of fish? Tope or Rope\n\nResponse:\nTope',
 'Instruction:\nWhy can camels survive for long without water?\n\nResponse:\nCamels use the fat in their humps to keep them filled with energy and hydration for long periods of time.',
 "Instruction:\nAlice's parents have three daughters: Amy, Jessy, and what’s the name of the third daughter?\n\nResponse:\nThe name of the third daughter is Alice",
 'Instruction:\nWho gave the UN the land in NY to build their HQ\n\nResponse:\nJohn D Rockerfeller',
 'Instruction:\nWhy mobile is bad for human\n\nResponse:\nWe are always engaged one phone which is not good.',
 'Instruction:\nWhat is a polygon?\n\nResponse:\nA polygon is a form in Geometry.  It is a single dimensional plane made of connecting lines and any number of vertices.  It is a closed chain of connected line segments or edges.  The vertices of the polygon are formed where two edges meet.  Examples of polygons are hexagons, pentagons, and octagons.  Any plan

**Load the model** (see the KerasNLP models API reference: https://keras.io/api/keras_nlp/models/)

In [8]:
# Instantiate the Gemma causal language model from the "gemma_2b_en" preset.
# In the recorded run this downloads the config, weights (about 4.67 GB),
# tokenizer config and vocabulary from Kaggle; presumably that is what the
# KAGGLE_* credentials set earlier are for.
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset("gemma_2b_en")
# Print the model summary.
gemma_lm.summary()

Downloading from https://www.kaggle.com/api/v1/models/keras/gemma/keras/gemma_2b_en/2/download/config.json...
100%|██████████| 555/555 [00:00<00:00, 1.20MB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/gemma/keras/gemma_2b_en/2/download/model.weights.h5...
100%|██████████| 4.67G/4.67G [04:03<00:00, 20.6MB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/gemma/keras/gemma_2b_en/2/download/tokenizer.json...
100%|██████████| 401/401 [00:00<00:00, 337kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/gemma/keras/gemma_2b_en/2/download/assets/tokenizer/vocabulary.spm...
100%|██████████| 4.04M/4.04M [00:01<00:00, 2.88MB/s]


**General Prompt**

In [10]:
# Build the prompt from the training template, leaving the response slot
# empty so the model writes it, then print the generated text.
instruction_text = "explain how respiration work in human body step by step?"
prompt = template.format(instruction=instruction_text, response="")
generated_text = gemma_lm.generate(prompt, max_length=256)
print(generated_text)

Instruction:
explain how respiration work in human body step by step?

Response:
Respiration is the process of taking in oxygen and releasing carbon dioxide. It is a process that occurs in all living organisms. It is a process that occurs in all living organisms. It is a process that occurs in all living organisms. It is a process that occurs in all living organisms. It is a process that occurs in all living organisms. It is a process that occurs in all living organisms. It is a process that occurs in all living organisms. It is a process that occurs in all living organisms. It is a process that occurs in all living organisms. It is a process that occurs in all living organisms. It is a process that occurs in all living organisms. It is a process that occurs in all living organisms. It is a process that occurs in all living organisms. It is a process that occurs in all living organisms. It is a process that occurs in all living organisms. It is a process that occurs in all living organ

In [11]:
# Second probe: same template, different instruction, response slot left
# empty for the model to complete.
prompt_fields = {
    "instruction": "what is the importance of chlorophyll in plant?",
    "response": "",
}
prompt = template.format(**prompt_fields)
print(gemma_lm.generate(prompt, max_length=256))

Instruction:
what is the importance of chlorophyll in plant?

Response:
Chlorophyll is a green pigment found in plants that is responsible for the absorption of light energy. It is essential for photosynthesis, the process by which plants convert sunlight into energy. Chlorophyll also helps plants to absorb other nutrients, such as carbon dioxide and water, and to transport them throughout the plant. Chlorophyll also helps to protect plants from damage caused by ultraviolet radiation and other environmental stressors.


**Fine-Tuning using LoRA**

*A lower rank means less computational overhead, but potentially less precise adaptation.*

In [12]:
# Enable LoRA for the model and set the LoRA rank to 4.
gemma_lm.backbone.enable_lora(rank=4)
# Print the summary again so the model can be inspected after enabling LoRA.
gemma_lm.summary()

In [13]:
# Cap the input sequence length at 512 to keep memory usage under control.
gemma_lm.preprocessor.sequence_length = 512

# Loss and metric are built up front so the compile call stays short.
# The loss is configured to take raw logits (from_logits=True).
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = keras.metrics.SparseCategoricalAccuracy()

# AdamW is a common optimizer for transformer models.
optimizer = keras.optimizers.AdamW(learning_rate=5e-5, weight_decay=0.01)
# Layernorm scales and bias terms are excluded from weight decay.
optimizer.exclude_from_weight_decay(var_names=["bias", "scale"])

gemma_lm.compile(
    loss=loss_fn,
    optimizer=optimizer,
    weighted_metrics=[accuracy_metric],
)
# One pass over the formatted examples, one example per step.
gemma_lm.fit(data, epochs=1, batch_size=1)

[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1404s[0m 1s/step - loss: 0.4588 - sparse_categorical_accuracy: 0.5237


<keras.src.callbacks.history.History at 0x7f28a6e4c460>

In [14]:
# Ask the respiration question again so the output can be compared with the
# answer generated before fine-tuning.
prompt = template.format(
    response="",
    instruction="explain how respiration work in human body step by step?",
)
finetuned_answer = gemma_lm.generate(prompt, max_length=256)
print(finetuned_answer)

Instruction:
explain how respiration work in human body step by step?

Response:
Respiration occurs in human body when oxygen is consumed for the production of energy by burning the food. During respiration, the oxygen enters the body through the nose and lungs, and it is transported to the cells of the body through the bloodstream. In the lungs, the oxygen is exchanged with carbon dioxide, and then, it is transported to the cells. The carbon dioxide produced during the burning of food is transported out of the cells and out of the body. In the body, the blood contains oxygen, and when it reaches the tissues of the body, it combines with the food particles to form glucose, which is then transported to the mitochondria for the energy production.
