<a href="https://colab.research.google.com/github/sindhguvi/tamilcolloqhack/blob/main/final_tamilcolloq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install transformers datasets torch

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# Pre-trained checkpoint that fine-tuning starts from
model_name = "facebook/blenderbot-400M-distill"

# Instantiate the seq2seq model and its matching tokenizer from that checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fetch the Tamil colloquial dataset by its Hugging Face identifier
dataset = load_dataset("sindhujasan/tamilcolloq")

# Tokenizing dataset
def preprocess_function(examples):
    """Tokenize a batch of dataset rows into model inputs and labels.

    Args:
        examples: A batch mapping column names to lists of values, as handed
            over by ``dataset.map(..., batched=True)``. The "Tamil Word",
            "Meaning" and "Usage in Tamil" columns are read.

    Returns:
        The tokenizer output for the prompts, with an added ``"labels"`` entry
        holding the target token ids. Padding positions in the labels are
        replaced by -100.
    """
    inputs = [
        f"{tamil} Meaning: {meaning}"
        for tamil, meaning in zip(examples["Tamil Word"], examples["Meaning"])
    ]
    targets = examples["Usage in Tamil"]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    # Targets are padded to a fixed length; without masking, the model would
    # also be trained to predict the padding. -100 is the label value the
    # Hugging Face seq2seq loss skips.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Run the preprocessing function over the dataset in batches
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Choose the training split and the split used for evaluation
train_dataset = tokenized_datasets["train"]
if "validation" in tokenized_datasets:
    # A dedicated validation split exists, so evaluate on it
    test_dataset = tokenized_datasets["validation"]
else:
    print("Warning: 'validation' split not found. Using 'train' split for evaluation.")
    # No held-out data available: evaluation reuses the training split
    test_dataset = train_dataset

# Training arguments
import inspect  # imported here because only this step needs it

# Recent transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and reject the old spelling. Probe the installed TrainingArguments so the
# cell runs on either side of that rename.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    # Log once per epoch so the training loss is reported next to the
    # validation loss instead of showing "No log" on short runs.
    logging_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    # Evaluate at the end of every epoch (keyword chosen above).
    **{_eval_strategy_key: "epoch"},
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Write the fine-tuned model and the tokenizer into the same directory
output_path = "./fine_tuned_tamil_colloq_bot"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_path)

# Read both back from that directory for use in the chat loop
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(output_path)
fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained(output_path)

# Chatbot interaction loop: read a line, generate a reply, repeat until "exit"
print("Tamil Colloquial Chatbot is ready! Type 'exit' to end the chat.\n")

while True:
    # Strip surrounding whitespace so inputs like "exit " still end the chat
    user_input = input("You: ").strip()
    if user_input.lower() == "exit":
        print("Chatbot: Goodbye!")
        break
    if not user_input:
        # Nothing was typed; ask again rather than generating from an empty prompt
        continue

    # Tokenize user input, capped at the same 128-token limit used when
    # tokenizing the training data so over-long messages are cut, not rejected
    inputs = fine_tuned_tokenizer(
        user_input,
        return_tensors="pt",
        truncation=True,
        max_length=128,
    )

    # Generate response and turn the first returned sequence back into text
    reply_ids = fine_tuned_model.generate(**inputs)
    response = fine_tuned_tokenizer.decode(reply_ids[0], skip_special_tokens=True)

    print(f"Chatbot: {response}")






Epoch,Training Loss,Validation Loss
1,No log,3.752414
2,No log,2.828619
3,No log,2.545949




Tamil Colloquial Chatbot is ready! Type 'exit' to end the chat.

