<a href="https://colab.research.google.com/github/sindhguvi/tamilcolloqhack/blob/main/tamcollqslm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install transformers datasets torch accelerate

import os

import torch
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments
from transformers.trainer_utils import get_last_checkpoint

# Fine-tuning data comes from the Hugging Face Hub; swap the dataset id below
# to train on a different dataset.
dataset = load_dataset("sindhujasan/tamilcollos")

# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Base checkpoint: the 90M BlenderBot variant, picked over the 400M one to
# keep training fast.
model_name = "facebook/blenderbot-small-90M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)

# Tokenization function
def preprocess_function(examples, max_length=128):
    """Tokenize a batch of dataset rows into model inputs and labels.

    Each source text is built as "<Tamil Word> Meaning: <Meaning>"; the
    target text is the matching "Usage in Tamil" entry.

    Args:
        examples: Batch mapping with the columns "Tamil Word", "Meaning" and
            "Usage in Tamil" (the form ``dataset.map(..., batched=True)``
            passes in).
        max_length: Token count every source and target sequence is padded
            or truncated to.

    Returns:
        The tokenizer output for the source texts, with an added "labels"
        entry holding the target token ids in which padding positions are
        replaced by -100.
    """
    inputs = [
        f"{tamil} Meaning: {meaning}"
        for tamil, meaning in zip(examples["Tamil Word"], examples["Meaning"])
    ]
    targets = examples["Usage in Tamil"]

    model_inputs = tokenizer(
        inputs, max_length=max_length, truncation=True, padding="max_length"
    )
    label_ids = tokenizer(
        targets, max_length=max_length, truncation=True, padding="max_length"
    )["input_ids"]

    # -100 is the index the cross-entropy loss ignores. Leaving the raw pad
    # ids in place would train the model to predict padding, and with
    # max_length padding those positions dominate short targets.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in label_ids
    ]
    return model_inputs

# Run the tokenization function over every split, one batch of rows at a time.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Training always uses the 'train' split. Evaluation uses 'validation' when the
# dataset provides one and otherwise falls back to the training data.
has_validation = "validation" in tokenized_datasets
if not has_validation:
    print("Warning: 'validation' split not found. Using 'train' split for evaluation.")
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["validation"] if has_validation else train_dataset

# Cap both sets (at most 100 training rows and 20 evaluation rows) so a test
# run finishes quickly.
train_subset_size = min(100, len(train_dataset))
test_subset_size = min(20, len(test_dataset))
small_train_dataset = train_dataset.select(range(train_subset_size))
small_test_dataset = test_dataset.select(range(test_subset_size))

# Training settings, tuned for a short run.
training_args = TrainingArguments(
    output_dir="./results",
    save_total_limit=2,  # keep only the 2 most recent checkpoints
    save_steps=500,  # write a checkpoint every 500 steps
    evaluation_strategy="steps",
    save_strategy="steps",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,  # single pass over the data; raise for real training
    logging_steps=100,
    logging_dir="./logs",
    # 16-bit mixed precision needs a CUDA device. A hard-coded True makes
    # TrainingArguments raise on the CPU fallback this script supports.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,  # avoid errors when not logged in to Hugging Face
)

# Trainer wired to the reduced train/eval subsets for quick testing.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_test_dataset,
)

# Trainer writes checkpoints as "checkpoint-<step>" inside output_dir, so a
# fixed "checkpoint-last" path never exists and resuming could never trigger.
# Look up the newest real checkpoint instead. The directory check is needed
# because get_last_checkpoint lists the folder and fails if it is missing.
last_checkpoint = None
if os.path.isdir(training_args.output_dir):
    last_checkpoint = get_last_checkpoint(training_args.output_dir)

if last_checkpoint is not None:
    print("Resuming from last checkpoint...")
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    print("Starting training from scratch...")
    trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be loaded on its own later.
fine_tuned_dir = "./fine_tuned_tamil_colloq_bot"
model.save_pretrained(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)

# Reload the saved artefacts for the chatbot. from_pretrained places the model
# on the CPU, so move it to the same device the chat inputs are sent to;
# otherwise generate() fails with a device mismatch on GPU runtimes.
fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained(fine_tuned_dir).to(device)
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_dir)

# Interactive chatbot loop: read a line, generate a reply, repeat until the
# user types 'exit' (or closes stdin / interrupts).
print("Tamil Colloquial Chatbot is ready! Type 'exit' to end the chat.\n")

while True:
    try:
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or Ctrl-C ends the chat cleanly instead of a traceback.
        print("\nChatbot: Goodbye!")
        break

    if user_input.lower() == "exit":
        print("Chatbot: Goodbye!")
        break
    if not user_input:
        # Nothing to answer; prompt again rather than generating from an
        # empty sequence.
        continue

    # Truncate to the same 128-token budget used for training inputs, and
    # place the tensors on whatever device the model actually lives on.
    inputs = fine_tuned_tokenizer(
        user_input, return_tensors="pt", truncation=True, max_length=128
    ).to(fine_tuned_model.device)

    # Generate and decode the reply.
    reply_ids = fine_tuned_model.generate(**inputs)
    response = fine_tuned_tokenizer.decode(reply_ids[0], skip_special_tokens=True)

    print(f"Chatbot: {response}")


Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.