# In [None]:
# Konkani Conversational AI Fine-Tuning Script
# ---------------------------------------------
# This script demonstrates how to fine-tune a pre-trained transformer model
# for conversational tasks in Konkani using an English-Konkani dataset.
# It uses the Hugging Face libraries (transformers, datasets) and PyTorch.

# Step 1: Install necessary libraries
# -----------------------------------
# Before running, make sure you have these libraries installed:
# pip install torch transformers datasets pandas sentencepiece accelerate

import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
from datasets import Dataset as HfDataset
import os

# --- CONFIGURATION ---
# Module-level settings read by every function below.
# Edit these values to match your setup before running the script.

# File and Data Configuration
# FILE_PATH is the CSV read by load_and_prepare_data(); both columns named
# below must be present in it or loading is aborted.
FILE_PATH = 'final_dataset.csv'  # <-- IMPORTANT: Change this to the exact path of your CSV file.
INPUT_COLUMN = 'ENGLISH'         # <-- Source-text column (English); PREFIX is prepended to it before tokenizing.
TARGET_COLUMN = 'KONKANI'        # <-- Target-text column (Konkani); tokenized as the training labels.

# Model Configuration
# MODEL_NAME is the checkpoint passed to from_pretrained() for both the
# tokenizer and the model when training starts.
# NOTE(review): confirm that this checkpoint's tokenizer vocabulary covers the
# script your Konkani text is written in - TODO verify against the dataset.
MODEL_NAME = 't5-small'          # We use 't5-small' for a balance of performance and training speed.
                                 # For higher quality, consider 't5-base' or 't5-large' (requires more GPU memory).
# PREFIX is prepended to every input both during training and in the chat
# loop, so it must not change between training a model and chatting with it.
PREFIX = "chat in konkani: "     # Task prefix that tells the T5 model which task to perform.

# Training Configuration
OUTPUT_DIR = './konkani_t5_model' # Checkpoints and the final model/tokenizer are written here; the chat loop loads from here.
EPOCHS = 5                       # Number of passes over the entire dataset (num_train_epochs).
BATCH_SIZE = 8                   # Per-device batch size for training and evaluation. Adjust based on your GPU memory.
MAX_INPUT_LENGTH = 128           # Maximum number of tokens kept per input; longer inputs are truncated.
MAX_TARGET_LENGTH = 128          # Maximum number of tokens kept per target; also the generation length limit in the chat loop.

# --- END OF CONFIGURATION ---


def load_and_prepare_data():
    """
    Loads the dataset from the CSV file and prepares it for the model.

    The file at FILE_PATH is decoded as UTF-8 first; if it is not valid
    UTF-8 the loader falls back to latin1. Rows whose INPUT_COLUMN or
    TARGET_COLUMN value is missing or blank are dropped, every column is
    cast to string, and the result is converted to a Hugging Face Dataset.

    Returns:
        A Hugging Face Dataset built from the cleaned rows (it may contain
        zero rows), or None if the file was not found, could not be parsed,
        or lacks the required columns. The reason is printed in that case.
    """
    print(f"Attempting to load data from: {FILE_PATH}")
    try:
        # latin1 maps every possible byte to a character, so decoding a UTF-8
        # file with it never fails - it silently produces garbled text
        # instead. Try UTF-8 first and only fall back when the bytes are
        # genuinely not UTF-8.
        try:
            df = pd.read_csv(FILE_PATH, encoding='utf-8')
        except UnicodeDecodeError:
            print("File is not valid UTF-8; falling back to latin1 decoding.")
            df = pd.read_csv(FILE_PATH, encoding='latin1')

        # Verify that the required columns exist
        if INPUT_COLUMN not in df.columns or TARGET_COLUMN not in df.columns:
            raise ValueError(
                f"CSV must contain '{INPUT_COLUMN}' and '{TARGET_COLUMN}' columns. "
                f"Found columns: {df.columns.tolist()}"
            )
        print("Successfully loaded the dataset.")
        print(f"Dataset has {len(df)} rows.")
        rows_loaded = len(df)

        # Drop rows with missing values in the text columns, then make sure
        # everything is a string so the tokenizer never sees numbers/NaN.
        df = df.dropna(subset=[INPUT_COLUMN, TARGET_COLUMN]).astype(str)

        # Whitespace-only cells are not NaN, so dropna() keeps them; they
        # carry no training signal and are removed here.
        has_text = (
            df[INPUT_COLUMN].str.strip().ne("")
            & df[TARGET_COLUMN].str.strip().ne("")
        )
        df = df[has_text]

        rows_dropped = rows_loaded - len(df)
        if rows_dropped:
            print(f"Dropped {rows_dropped} rows with missing or blank text.")

        # After filtering, the DataFrame index has gaps. preserve_index=False
        # stops it from being exported as an extra '__index_level_0__' column.
        return HfDataset.from_pandas(df, preserve_index=False)

    except FileNotFoundError:
        print("---")
        print(f"ERROR: The file '{FILE_PATH}' was not found.")
        print("Please make sure the file path in the script is correct.")
        print("---")
        return None
    except Exception as e:
        # Best-effort boundary: report any other load/parse/validation
        # problem and let the caller decide what to do with None.
        print(f"An error occurred while loading the data: {e}")
        return None


def preprocess_data(dataset, tokenizer):
    """
    Tokenizes the input and target text. The T5 model requires a specific
    prefix to understand the task, which we add here.

    Sequences are truncated to MAX_INPUT_LENGTH / MAX_TARGET_LENGTH but are
    deliberately NOT padded at this stage. Padding the labels to a fixed
    length with the tokenizer's pad id would leave real token ids in the
    padded positions, so the training loss would also be computed on
    padding. Padding is instead left to the batch collator
    (train_model uses DataCollatorForSeq2Seq), which pads each batch to its
    longest sequence and fills label padding with -100, the value the loss
    ignores.

    Args:
        dataset: Hugging Face Dataset containing INPUT_COLUMN and
            TARGET_COLUMN as text columns.
        tokenizer: Tokenizer matching the model that will be trained.

    Returns:
        The dataset with 'input_ids', 'attention_mask' and 'labels' columns
        added (variable length per row).
    """
    print("Preprocessing and tokenizing data...")

    def tokenize_function(examples):
        # Prepend the task-specific prefix to every input in the batch
        inputs = [PREFIX + text for text in examples[INPUT_COLUMN]]

        # Tokenize the inputs (truncate only; the collator pads per batch)
        model_inputs = tokenizer(
            inputs,
            max_length=MAX_INPUT_LENGTH,
            truncation=True,
        )

        # Tokenize the targets (labels) the same way
        labels = tokenizer(
            text_target=examples[TARGET_COLUMN],
            max_length=MAX_TARGET_LENGTH,
            truncation=True,
        )

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    # Apply the tokenization function to the entire dataset
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    print("Data preprocessing complete.")
    return tokenized_dataset


def train_model(tokenized_dataset, tokenizer):
    """
    Fine-tunes the pre-trained T5 checkpoint on the tokenized dataset and
    writes the resulting model and tokenizer to OUTPUT_DIR.

    Args:
        tokenized_dataset: Dataset produced by preprocess_data(); used as
            the training set (no evaluation set is configured).
        tokenizer: Tokenizer used to build the dataset; it is handed to the
            collator and trainer and saved next to the model.
    """
    print("Setting up the model and trainer...")

    # Start from the pre-trained checkpoint named in the configuration
    base_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

    # Hyperparameters controlling the training run
    hyperparams = dict(
        output_dir=OUTPUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,  # only relevant once an eval set is added
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        save_total_limit=2,  # cap the number of checkpoints kept on disk at 2
        predict_with_generate=True,
        report_to="none",  # disables external loggers such as wandb
    )
    run_args = Seq2SeqTrainingArguments(**hyperparams)

    # The collator assembles batches (padding them as needed); the trainer
    # drives the optimisation loop. An eval_dataset could be added here to
    # get metrics during training.
    seq2seq_trainer = Seq2SeqTrainer(
        model=base_model,
        args=run_args,
        train_dataset=tokenized_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=base_model),
    )

    # Summarise the run before kicking it off
    for banner_line in (
        "--- Starting Model Training ---",
        f"Model: {MODEL_NAME}",
        f"Epochs: {EPOCHS}, Batch Size: {BATCH_SIZE}",
        f"Output will be saved to: {OUTPUT_DIR}",
    ):
        print(banner_line)

    seq2seq_trainer.train()

    print("--- Training Complete ---")

    # Persist the final weights together with the tokenizer files so the
    # directory can be loaded on its own later
    print(f"Saving the fine-tuned model to {OUTPUT_DIR}...")
    seq2seq_trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print("Model saved successfully.")


def run_conversation_loop():
    """
    Loads the fine-tuned model from disk and starts a conversational
    loop to interact with it.

    The model and tokenizer are read from OUTPUT_DIR. Each line typed by the
    user is prefixed with PREFIX, truncated to MAX_INPUT_LENGTH tokens (the
    same limit applied to training inputs) and passed to the model; the
    decoded generation is printed. The loop ends when the user types
    'exit' or 'quit', presses Ctrl-C, or standard input is closed.

    Returns:
        None. Returns early (after printing a message) if OUTPUT_DIR does
        not exist or the model cannot be loaded.
    """
    print("\n--- Starting Konkani Chatbot ---")

    # Check if the model directory exists
    if not os.path.exists(OUTPUT_DIR):
        print(f"Error: Model directory '{OUTPUT_DIR}' not found.")
        print("Please train the model first by running the main script.")
        return

    print(f"Loading fine-tuned model from {OUTPUT_DIR}...")
    try:
        tokenizer = T5Tokenizer.from_pretrained(OUTPUT_DIR)
        model = T5ForConditionalGeneration.from_pretrained(OUTPUT_DIR)
        # Move model to GPU if available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        print(f"Model loaded successfully on device: {device}")
    except Exception as e:
        print(f"Failed to load the model: {e}")
        return

    print("\nChatbot is ready. Type your message in English.")
    print("Type 'exit' or 'quit' to end the conversation.")
    print("--------------------------------------------------")

    while True:
        try:
            english_prompt = input("You (English): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave the chat cleanly instead of
            # dying with a traceback.
            print("\nBot: Adeus! (Goodbye!)")
            break

        if english_prompt.lower() in ('exit', 'quit'):
            print("Bot: Adeus! (Goodbye!)")
            break

        if not english_prompt:
            # Nothing to respond to; ask again rather than feeding the
            # model a bare prefix.
            continue

        # Prepare the input exactly like the training inputs: same prefix,
        # same truncation limit.
        encoded = tokenizer(
            PREFIX + english_prompt,
            return_tensors='pt',
            max_length=MAX_INPUT_LENGTH,
            truncation=True,
        ).to(device)

        # Generate the response with beam search; n-gram blocking keeps the
        # output from repeating itself.
        output_ids = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_length=MAX_TARGET_LENGTH,
            num_beams=5,
            early_stopping=True,
            no_repeat_ngram_size=2,
        )

        # Decode the generated tokens into a string
        konkani_response = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        print(f"Bot (Konkani): {konkani_response}")


if __name__ == '__main__':
    # Interactive entry point: runs only when the script is executed
    # directly. The user picks between training a new model and chatting
    # with a model that was saved earlier.

    print("Welcome to the Konkani AI Model Trainer.")
    print("1: Train a new model")
    print("2: Chat with an existing model")
    # Strip so that stray spaces around the answer do not make it invalid
    choice = input("Please enter your choice (1 or 2): ").strip()

    if choice == '1':
        # --- Main Training Pipeline ---
        # 1. Load data (returns None after printing the reason on failure)
        raw_dataset = load_and_prepare_data()

        if raw_dataset is not None and len(raw_dataset) > 0:
            # 2. Initialize tokenizer
            tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

            # 3. Preprocess and tokenize data
            tokenized_dataset = preprocess_data(raw_dataset, tokenizer)

            # 4. Train the model
            train_model(tokenized_dataset, tokenizer)

            # 5. Offer to chat with the newly trained model
            chat_now = input("Training complete. Would you like to chat with the new model? (yes/no): ")
            if chat_now.strip().lower() in ('y', 'yes'):
                run_conversation_loop()
        elif raw_dataset is not None:
            # The file loaded but every row was filtered out; say so instead
            # of exiting without any explanation.
            print("The dataset contains no usable rows, so there is nothing to train on.")

    elif choice == '2':
        # --- Run Inference ---
        run_conversation_loop()
    else:
        print("Invalid choice. Please run the script again and enter 1 or 2.")

