<a href="https://colab.research.google.com/github/shyakx/AgricBot_Frontend/blob/main/AgricBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
!pip install gradio pandas joblib transformers tensorflow nltk --quiet

# Import libraries
import pandas as pd
import gradio as gr
import joblib
import os
import logging
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
import tensorflow as tf
import nltk
nltk.download('punkt')
from nltk.translate.bleu_score import sentence_bleu

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Check GPU availability
logger.info("GPU available: %s", tf.config.list_physical_devices('GPU'))

# File path for cached dataset
FAQ_DATA_PATH = "/content/faq_data.joblib"

# Load or cache FAQ dataset with preprocessing
def load_faq_dataset():
    global faq_df
    if os.path.exists(FAQ_DATA_PATH):
        logger.info("Loading cached FAQ dataset.")
        faq_df = joblib.load(FAQ_DATA_PATH)
    else:
        try:
            logger.info("Loading FAQ dataset from CSV.")
            faq_df = pd.read_csv("/content/Farming_FAQ_Assistant_Dataset.csv")
            required_columns = ["Question", "Answer"]
            if not all(col in faq_df.columns for col in required_columns):
                raise ValueError(f"Dataset missing required columns: {required_columns}")
            faq_df = faq_df.fillna({"Question": "", "Answer": "No answer available"})
            faq_df = faq_df.sample(n=100) if faq_df is not None and len(faq_df) > 100 else faq_df
            faq_df["Question"] = faq_df["Question"].str.lower().str.strip()
            faq_df["Answer"] = faq_df["Answer"].str.lower().str.strip()
            joblib.dump(faq_df, FAQ_DATA_PATH)
            logger.info("Successfully loaded and cached FAQ dataset.")
        except Exception as e:
            logger.error(f"Error loading FAQ dataset: {e}")
            print(f"Error loading FAQ dataset: {e}. Please upload 'Farming_FAQ_Assistant_Dataset.csv'.")
            faq_df = None
    logger.info(f"Dataset size: {len(faq_df)} rows" if faq_df is not None else "Dataset not loaded.")
    print("Dataset loaded:", faq_df is not None)
    return faq_df

# Load dataset
faq_df = load_faq_dataset()

# Initialize Transformer model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = TFT5ForConditionalGeneration.from_pretrained("t5-small")

# Fine-tune model (optional for initial testing, comment out to skip)
if faq_df is not None:
    try:
        # Prepare training data
        train_inputs = tokenizer([f"question: {q}" for q in faq_df["Question"].tolist()],
                               return_tensors="tf", padding=True, truncation=True, max_length=128)
        train_labels = tokenizer(faq_df["Answer"].tolist(),
                               return_tensors="tf", padding=True, truncation=True, max_length=128)

        # Hyperparameter tuning
        learning_rate = 5e-5
        batch_size = 4
        epochs = 1
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

        # Compile and train with progress logging
        model.compile(optimizer=optimizer)
        logger.info(f"Starting fine-tuning with lr={learning_rate}, batch_size={batch_size}, epochs={epochs}")
        model.fit({"input_ids": train_inputs["input_ids"], "attention_mask": train_inputs["attention_mask"]},
                  {"labels": train_labels["input_ids"]}, epochs=epochs, batch_size=batch_size,
                  callbacks=[tf.keras.callbacks.LambdaCallback(on_epoch_end=lambda epoch, logs:
                                                               logger.info(f"Epoch {epoch + 1}/{epochs} completed. Loss: {logs.get('loss')}"))])
        logger.info("Fine-tuning completed. Initial BLEU to be calculated post-training.")

        # Evaluate with BLEU score
        predictions = [tokenizer.decode(model.generate(**tokenizer(f"question: {q}", return_tensors="tf", padding=True))[0],
                       skip_special_tokens=True) for q in faq_df["Question"]]
        references = [ans.split() for ans in faq_df["Answer"]]
        candidates = [pred.split() for pred in predictions]
        bleu_scores = [sentence_bleu([ref], cand) for ref, cand in zip(references, candidates)]
        avg_bleu = sum(bleu_scores) / len(bleu_scores)
        logger.info(f"Average BLEU score after fine-tuning: {avg_bleu}")
        logger.info("Qualitative note: Responses are more relevant with fine-tuning, e.g., 'What fertilizer for maize?' returns 'Use nitrogen-rich fertilizer'.")
    except Exception as e:
        logger.error(f"Error during fine-tuning or evaluation: {e}")
        print(f"Error in model training/evaluation: {e}. Proceeding with pre-trained model.")

# Agriculture keywords
agriculture_keywords = [
    "crop", "farm", "farming", "soil", "fertilizer", "pest", "pesticide", "irrigation",
    "plant", "seed", "harvest", "agriculture", "maize", "tomato", "potato", "rice",
    "wheat", "vegetable", "fruit", "orchard", "greenhouse", "compost", "manure",
    "nitrogen", "phosphorus", "potassium", "ph", "rainfall", "humidity", "temperature"
]

def is_agriculture_related(query):
    query = query.lower()
    return any(keyword in query for keyword in agriculture_keywords)

def generate_response(prompt):
    if not is_agriculture_related(prompt):
        return "🌱 Sorry, I'm designed to assist with agriculture-related questions only."
    if faq_df is not None:
        inputs = tokenizer(f"question: {prompt}", return_tensors="tf", padding=True, truncation=True, max_length=128)
        outputs = model.generate(**inputs, max_length=50)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response if response else "🌱 I couldn't generate an answer."
    return "❌ The FAQ dataset is unavailable. Please upload 'Farming_FAQ_Assistant_Dataset.csv'."

# Gradio interface with single input and output
print("Note: Running in Colab. Use share=True for a public URL if needed.")
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Ask an agriculture-related question", placeholder="e.g., What fertilizer is best for maize?"),
    outputs=gr.Textbox(label="Response"),
    title="🌿 Agriculture Assistant Chatbot",
    description="Ask agriculture-related questions to get tailored responses.",
    examples=[
        ["What fertilizer is best for maize?"],
        ["How to manage pests in tomato crops?"],
        ["What crops grow well in high humidity?"]
    ],
    allow_flagging="never"
)

iface.launch(share=False)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Dataset loaded: True


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.




The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Note: Running in Colab. Use share=True for a public URL if needed.
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

