<a href="https://colab.research.google.com/github/ronitroy05/AI-Translator/blob/main/LLM_3M.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#  Optimized Training for Kine-3M (3M Parameter LLM) on TPU v2-8


!pip install torch transformers datasets accelerate peft torch_xla

import torch
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch.optim as optim
from transformers import GPT2Config, AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import DataLoader

# 1: TPU set up
device = xm.xla_device()
print("Using device:", device)

# 2: Define Model Configuration (3M Parameters)
config = GPT2Config(
    vocab_size=4000,
    n_positions=512,
    n_embd=128,
    n_layer=2,
    n_head=2,
    intermediate_size=512,
)

#  3: Initialize the model
model = AutoModelForCausalLM.from_config(config)
model.to(device)
print(f"Model Parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M")

#  4: Load & Tokenize Dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_datasets.set_format(type="torch", columns=["input_ids"])

batch_size = 4  # Reduced for stability
train_dataloader = DataLoader(tokenized_datasets, batch_size=batch_size, shuffle=True)

# Apply LoRA (Optimized Training)
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=4,
    lora_alpha=16,
    lora_dropout=0.05
)

model = get_peft_model(model, peft_config)
model.to(device)

# Training Loop (Optimized for TPU)
optimizer = optim.AdamW(model.parameters(), lr=1e-4)  # Lowered LR for better training

print("\n🚀 Training Kine-3M on TPU!\n")
model.train()

num_epochs = 5  # Increase training since it's not learning well
best_loss = float("inf")

for epoch in range(num_epochs):
    para_loader = pl.MpDeviceLoader(train_dataloader, device)

    total_loss = 0
    for batch in para_loader:
        optimizer.zero_grad()

        inputs = batch["input_ids"].to(device)
        labels = batch["input_ids"].to(device)

        outputs = model(inputs, labels=inputs)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        xm.optimizer_step(optimizer)
        xm.mark_step()

    avg_loss = total_loss / len(train_dataloader)
    print(f"✅ Epoch {epoch + 1}/{num_epochs} - Loss: {avg_loss:.4f}")

    if avg_loss < best_loss:
        best_loss = avg_loss
    else:
        print("⚠️ Loss increased, reducing learning rate!")
        for param_group in optimizer.param_groups:
            param_group["lr"] *= 0.8  # Reduce LR to stabilize training

print("\n✅ Training Complete!\n")

# 7: Save Kine-3M Model Safely
model_path = "Kine-3M.pth"
torch.save(model.state_dict(), model_path)
print("✅ Model saved successfully!")

# 8: Download the Model
from google.colab import files
files.download(model_path)

#  9: Test the Model
def generate_text(prompt):
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output = model.generate(inputs["input_ids"], max_length=100, temperature=0.7)
    return tokenizer.decode(output[0], skip_special_tokens=True)

print(generate_text("Once upon a time"))



Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.4.6-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Downloading aiosignal-1.3.2-py2.py3-none-any.whl.metada



Using device: xla:0
Model Parameters: 1.49M


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]




🚀 Training Kine-3M on TPU!

✅ Epoch 1/5 - Loss: nan
⚠️ Loss increased, reducing learning rate!
✅ Epoch 2/5 - Loss: nan
⚠️ Loss increased, reducing learning rate!
✅ Epoch 3/5 - Loss: nan
⚠️ Loss increased, reducing learning rate!
✅ Epoch 4/5 - Loss: nan
⚠️ Loss increased, reducing learning rate!
✅ Epoch 5/5 - Loss: nan
⚠️ Loss increased, reducing learning rate!

✅ Training Complete!

✅ Model saved successfully!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
