In [None]:
# ---------------------------------------------
# üì¶ Install Required Packages
# ---------------------------------------------
!pip install -q chromadb sentence-transformers trafilatura requests groq nltk pandas

In [None]:
# ---------------------------------------------
# üì• Import Libraries
# ---------------------------------------------
import hashlib
import os
from getpass import getpass

import chromadb
import nltk
import pandas as pd
import requests
import trafilatura
from groq import Groq
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Fetch the NLTK "punkt" data package.
# NOTE(review): no other cell visible in this notebook references nltk --
# confirm this download is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# ---------------------------------------------
# Vector store and embedding model
# ---------------------------------------------
# One persistent Chroma client backs two collections: scraped web pages
# ("web_context") and the reference question/answer pairs ("qa_context").
chroma_client = chromadb.PersistentClient(path="./chroma_store_combined")
web_collection, qa_collection = (
    chroma_client.get_or_create_collection(collection_name)
    for collection_name in ("web_context", "qa_context")
)

# Sentence-embedding model used by the cells below for documents and queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# ------------------------------------------------------------------------------
# Step 1: Load the QA dataset and store it as embeddings
# ------------------------------------------------------------------------------
# Each row is indexed by the embedding of its *question*; the stored document
# is the *answer*, and the question text is kept as metadata.
batch_size = 500  # rows written to Chroma per call
qa_df = pd.read_csv("/content/questionsv4.csv").dropna(subset=["questions", "answers"])

for i in tqdm(range(0, len(qa_df), batch_size)):
    batch_df = qa_df.iloc[i:i+batch_size]

    # Positional ids: stable across runs as long as the CSV is unchanged.
    ids = [f"qa_{j}" for j in range(i, i + len(batch_df))]
    # Cast to str in case pandas parsed a purely numeric cell as a number;
    # both the embedder and the document store are fed text here.
    documents = batch_df["answers"].astype(str).tolist()
    queries = batch_df["questions"].astype(str).tolist()
    embeddings = embedder.encode(queries, batch_size=64).tolist()
    metadatas = [{"original_query": q} for q in queries]

    # upsert (not add): the store is persistent and the ids are deterministic,
    # so re-running this cell must overwrite rows instead of clashing with the
    # ids written by a previous run.
    qa_collection.upsert(
        ids=ids,
        documents=documents,
        embeddings=embeddings,
        metadatas=metadatas
    )


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 358/358 [28:32<00:00,  4.78s/it]


In [None]:
# ---------------------------------------------
# Step 2: Take user query input
# ---------------------------------------------
# Trim surrounding whitespace and stray quote characters (easy to paste by
# accident) so they do not leak into the search request or the LLM prompt.
query = input("Enter your farming query: ").strip().strip("\"'").strip()
if not query:
    raise ValueError("Query is empty; re-run this cell and enter a question.")


Enter your farming query: What pesticide should I use to control aphids in brinjal, and what is the dosage?"


In [None]:
# ---------------------------------------------
# Step 3: Get top URLs from n8n webhook
# ---------------------------------------------
n8n_webhook_url = "https://rrajdev.app.n8n.cloud/webhook/farm-query"
# A timeout keeps the cell from hanging indefinitely if the webhook stalls.
response = requests.post(n8n_webhook_url, json={"query": query}, timeout=60)

print("Status Code:", response.status_code)
print("Raw Response:", response.text)

try:
    # `or []` also covers a payload of {"urls": null}, which would otherwise
    # make len(urls) fail below.
    urls = response.json().get("urls") or []
    print(f"\nTop {len(urls)} URLs Retrieved:\n", urls)
except (ValueError, AttributeError) as e:
    # ValueError: body is not valid JSON; AttributeError: JSON is not an object.
    print("Error parsing JSON:", e)
    urls = []


Status Code: 200
Raw Response: {"urls":["https://www.youtube.com/watch?v=320D-41xt-M&pp=0gcJCfwAo7VqN5tD","https://m.youtube.com/watch?v=BmLNAPT0gxw&t=345s","https://www.youtube.com/watch?v=bqCBIP9TmcY","https://kaybeebio.com/product/pesto-raze/","https://labelsds.com/images/user_uploads/Malathion%20.50%20Label.pdf"]}

Top 5 URLs Retrieved:
 ['https://www.youtube.com/watch?v=320D-41xt-M&pp=0gcJCfwAo7VqN5tD', 'https://m.youtube.com/watch?v=BmLNAPT0gxw&t=345s', 'https://www.youtube.com/watch?v=bqCBIP9TmcY', 'https://kaybeebio.com/product/pesto-raze/', 'https://labelsds.com/images/user_uploads/Malathion%20.50%20Label.pdf']


In [None]:
# ---------------------------------------------------------------
# Step 4: Extract and store scraped content in ChromaDB
# ---------------------------------------------------------------
def fetch_and_extract(url, timeout=60):
    """Download ``url`` and return its extracted main text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The string produced by ``trafilatura.extract``, or ``None`` when the
        request fails, the status code is not 200, the response declares a
        non-HTML/non-text content type, or nothing could be extracted.
    """
    try:
        response = requests.get(url, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"})
        if response.status_code != 200:
            print(f"Skipping {url}: HTTP {response.status_code}")
            return None

        # Binary payloads such as PDFs cannot be parsed as HTML; skip them up
        # front instead of handing their bytes-as-text to the extractor.
        content_type = response.headers.get("Content-Type", "").lower()
        if content_type and not any(kind in content_type for kind in ("html", "text/")):
            print(f"Skipping {url}: unsupported content type {content_type!r}")
            return None

        return trafilatura.extract(response.text)
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller move on.
        print(f"Error fetching {url}: {e}")
    return None

# Scrape each distinct URL once (dict.fromkeys de-duplicates, keeping order).
cleaned = []
for url in dict.fromkeys(urls):
    content = fetch_and_extract(url)
    if content:
        cleaned.append({"url": url, "content": content})

if cleaned:
    contents = [entry["content"] for entry in cleaned]
    web_collection.upsert(
        # Ids are derived from the URL itself. The previous scheme (position
        # plus the first 10 characters of the query) produced the same id for
        # different queries sharing a prefix and for every re-run of the cell.
        ids=[
            "web_" + hashlib.sha1(entry["url"].encode("utf-8")).hexdigest()[:16]
            for entry in cleaned
        ],
        documents=contents,
        # One batched encode call instead of one model call per page.
        embeddings=embedder.encode(contents).tolist(),
        metadatas=[{"source": entry["url"]} for entry in cleaned],
    )


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None


In [None]:
# Preview what was scraped without dumping whole pages into the cell output.
[
    {"url": entry["url"], "chars": len(entry["content"]), "preview": entry["content"][:200]}
    for entry in cleaned
]

[{'url': 'https://www.youtube.com/watch?v=320D-41xt-M&pp=0gcJCfwAo7VqN5tD',
  'content': 'About\nPress\nCopyright\nContact us\nCreators\nAdvertise\nDevelopers\nTerms\nPrivacy\nPolicy & Safety\nHow YouTube works\nTest new features\nNFL Sunday Ticket\n¬© 2025 Google LLC'},
 {'url': 'https://m.youtube.com/watch?v=BmLNAPT0gxw&t=345s',
  'content': 'About\nPress\nCopyright\nContact us\nCreators\nAdvertise\nDevelopers\nTerms\nPrivacy\nPolicy & Safety\nHow YouTube works\nTest new features\nNFL Sunday Ticket\n¬© 2025 Google LLC'},
 {'url': 'https://www.youtube.com/watch?v=bqCBIP9TmcY',
  'content': 'About\nPress\nCopyright\nContact us\nCreators\nAdvertise\nDevelopers\nTerms\nPrivacy\nPolicy & Safety\nHow YouTube works\nTest new features\nNFL Sunday Ticket\n¬© 2025 Google LLC'},
 {'url': 'https://kaybeebio.com/product/pesto-raze/',
  'content': 'Bio Pesticides Products Pesto Raze ‚Äì Best Insecticide for Aphids, White Flies, Jassids and Hoppers | Plant Insect and Pest Control Product\n‚Çπ322.00

In [None]:
# -----------------------------------------------------------------
# Step 5: Retrieve top documents from BOTH collections
# -----------------------------------------------------------------
query_embedding = embedder.encode(query).tolist()

# Top 2 web contexts, restricted to the pages scraped for THIS query. The web
# collection is persistent, so an unfiltered search can return pages stored
# while answering earlier, unrelated queries.
current_sources = list(dict.fromkeys(entry["url"] for entry in cleaned))
if current_sources:
    web_results = web_collection.query(
        query_embeddings=[query_embedding],
        n_results=min(2, len(current_sources)),
        where={"source": {"$in": current_sources}},
    )
    web_contexts = web_results["documents"][0] if web_results["documents"] else []
else:
    web_contexts = []

# Top 2 QA reference answers
qa_results = qa_collection.query(query_embeddings=[query_embedding], n_results=2)
qa_contexts = qa_results["documents"][0] if qa_results["documents"] else []

# Combine all context (reference answers first, then web text)
combined_context = "\n".join(qa_contexts + web_contexts)

if not combined_context.strip():
    raise RuntimeError("‚ùå No context found from either source.")

print("\n--- Combined Retrieved Context ---\n", combined_context[:1000])



--- Combined Retrieved Context ---
 suggested to apply amister @ 2 ml per litre of water.
spray malathion 50 ec @ 2 ml per liter of water
Broccoli aphids - Ask Extension
My broccoli became infested with aphids. I have been pulling the plants and disposing of them in the trash (not composting). Do I need to treat my g...
Knowledgebase
Broccoli aphids #853297
Asked October 17, 2023, 12:46 AM EDT
My broccoli became infested with aphids. I have been pulling the plants and disposing of them in the trash (not composting). Do I need to treat my garden beds now to keep them from next year‚Äôs garden? Is there an organic solution?
Benton County Oregon
Expert Response
Thanks for contacting "Ask Extension" about your aphid problem. You do not need to treat your garden beds.
Removing the broccoli and tossing the broccoli was the right move! Next you need to remove nearby weeds where aphids tend to overwinter (especially weeds related to the mustard family).
To control aphids in your garden:
- Kee

In [None]:
# ---------------------------------------------
# Step 6: Generate answer using Groq LLaMA
# ---------------------------------------------
# Read the key from the GROQ_API_KEY environment variable, or prompt for it,
# so no credential is ever stored in the notebook itself.
client = Groq(api_key=os.environ.get("GROQ_API_KEY") or getpass("Groq API key: "))

response = client.chat.completions.create(
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    messages=[
        {
            "role": "system",
            "content": "You are an agricultural expert. Use only the context to answer the question. Be specific and practical. Do not make things up."
        },
        {
            "role": "user",
            "content": f"Context:\n{combined_context}\n\nQuestion: {query}"
        }
    ],
    temperature=0.5,
    max_tokens=200
)

generated_answer = response.choices[0].message.content
print("\n--- Generated Answer ---\n", generated_answer)


--- Generated Answer ---
 For controlling aphids in brinjal, you have a couple of options:

1. **Amistar**: Apply Amistar at 2 ml per liter of water.
2. **Malathion 50 EC**: Spray Malathion 50 EC at 2 ml per liter of water.
3. **Pesto Raze**: This is an organic insecticide, and the recommended dosage for brinjal is 1.5-2.5 ml/liter of water.

It's always a good idea to check the product label and follow the instructions carefully. Also, consider integrated pest management strategies like introducing beneficial insects, removing weeds, and using physical barriers to control aphid populations.


In [None]:
import pandas as pd
import requests
from groq import Groq
from sentence_transformers import SentenceTransformer
import chromadb
import trafilatura
from tqdm import tqdm

# ---------------------------------------------
# Setup
# ---------------------------------------------
# Read the key from the GROQ_API_KEY environment variable, or prompt for it,
# so no credential is ever stored in the notebook itself.
client = Groq(api_key=os.environ.get("GROQ_API_KEY") or getpass("Groq API key: "))

# ---------------------------------------------
# Load the queries to run: the first 10 complete rows of the QA dataset.
# Nothing is written to ChromaDB here; "answers" is only kept as the
# reference to compare generated answers against.
# ---------------------------------------------
df = (
    pd.read_csv("/content/questionsv4.csv")
    .dropna(subset=["questions", "answers"])
    .head(10)
)
results = []  # one {"query", "answer", "reference"} dict per answered query

# ---------------------------------------------
# Helper: download a page and extract its text
# ---------------------------------------------
def fetch_and_extract(url, timeout=60):
    """Download ``url`` and return its extracted main text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The string produced by ``trafilatura.extract``, or ``None`` when the
        request fails, the status code is not 200, the response declares a
        non-HTML/non-text content type, or nothing could be extracted.
    """
    try:
        response = requests.get(url, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"})
        if response.status_code != 200:
            print(f"Skipping {url}: HTTP {response.status_code}")
            return None

        # Binary payloads such as PDFs cannot be parsed as HTML; skip them up
        # front instead of handing their bytes-as-text to the extractor.
        content_type = response.headers.get("Content-Type", "").lower()
        if content_type and not any(kind in content_type for kind in ("html", "text/")):
            print(f"Skipping {url}: unsupported content type {content_type!r}")
            return None

        return trafilatura.extract(response.text)
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller move on.
        print(f"Error fetching {url}: {e}")
    return None

# ---------------------------------------------
# Loop over the queries and generate answers
# ---------------------------------------------
# Per query: fetch URLs -> scrape -> store in Chroma -> retrieve context from
# both collections -> ask the LLM -> record the answer next to the reference.
# A failure in any step is reported and the loop moves on to the next query.
n8n_webhook_url = "https://rrajdev.app.n8n.cloud/webhook/farm-query"

for idx, row in df.iterrows():
    query = row["questions"]
    reference = row["answers"]
    print(f"\nüîÅ Processing Query {idx+1}: {query}")

    try:
        # Step 1: Get URLs from n8n. The timeout stops a stalled webhook from
        # blocking the whole batch; `or []` covers a {"urls": null} payload.
        response = requests.post(n8n_webhook_url, json={"query": query}, timeout=60)
        urls = response.json().get("urls") or []
        if not urls:
            print("‚ùå No URLs found.")
            continue

        # Step 2: Scrape content
        cleaned = []
        for url in urls:
            content = fetch_and_extract(url)
            if content:
                cleaned.append({"url": url, "content": content})

        if not cleaned:
            print("‚ùå No content scraped.")
            continue

        # Step 3: Store cleaned web context in Chroma. upsert keeps the cell
        # re-runnable: the ids are deterministic and the store is persistent.
        contents = [entry["content"] for entry in cleaned]
        web_collection.upsert(
            ids=[f"web_{idx}_{i}" for i in range(len(cleaned))],
            documents=contents,
            embeddings=embedder.encode(contents).tolist(),
            metadatas=[{"source": entry["url"]} for entry in cleaned],
        )

        # Step 4: Retrieve context from BOTH sources
        query_embedding = embedder.encode(query).tolist()

        # Only consider pages scraped for this query; the collection also
        # holds pages stored for other queries and earlier runs.
        current_sources = list(dict.fromkeys(entry["url"] for entry in cleaned))
        web_results = web_collection.query(
            query_embeddings=[query_embedding],
            n_results=min(2, len(current_sources)),
            where={"source": {"$in": current_sources}},
        )
        web_contexts = web_results["documents"][0] if web_results["documents"] else []

        qa_results = qa_collection.query(query_embeddings=[query_embedding], n_results=2)
        qa_contexts = qa_results["documents"][0] if qa_results["documents"] else []

        combined_context = "\n".join(qa_contexts + web_contexts)
        if not combined_context.strip():
            print("‚ùå No relevant context found.")
            continue

        # Cap the prompt size: reference answers come first, web text is cut.
        safe_context = combined_context[:3000]

        # Step 5: Ask Groq LLM
        response = client.chat.completions.create(
            model="meta-llama/llama-4-scout-17b-16e-instruct",
            messages=[
                {
                    "role": "system",
                    "content": "You are an agricultural extension officer. Based strictly on the context provided, give a specific, actionable answer. Include names of pesticides, dosage, agency names, schemes, or institutions if available in the context. Do not guess or make up answers not found in the context."
                },
                {
                    "role": "user",
                    "content": f"Context:\n{safe_context}\n\nQuestion: {query}"
                }
            ],
            temperature=0.5,
            max_tokens=150
        )

        answer = response.choices[0].message.content.strip()

        # Step 6: Store results
        results.append({
            "query": query,
            "answer": answer,
            "reference": reference
        })

    except Exception as e:
        # Best-effort batch: log the failure and carry on with the next query.
        print(f"‚ö†Ô∏è Error processing query {idx+1}: {e}")

# ---------------------------------------------
# Save final results to CSV
# ---------------------------------------------
# Explicit columns guarantee a header row even when no query succeeded, so the
# file can still be read back with pd.read_csv instead of being empty.
output_df = pd.DataFrame(results, columns=["query", "answer", "reference"])
output_df.to_csv("groq_farming_with_ref_combined.csv", index=False)
print("\n‚úÖ Completed! Saved to 'groq_farming_with_ref_combined.csv'")



üîÅ Processing Query 1: asking about the control measure for aphid infestation in mustard crops




‚ö†Ô∏è Error processing query 1: no healthy upstream

üîÅ Processing Query 2: asking about the control measure of flower drop problem in his coconut plant




‚ö†Ô∏è Error processing query 2: no healthy upstream

üîÅ Processing Query 3: asking about how to avail kisan credit card loan for sali crop.




Error fetching https://agri.odisha.gov.in/sites/default/files/2024-07/Implementation%20of%20Pradhan%20Mantri%20Fasal%20Bima%20Yojana%20%28PMFBY%29%20during%20Kharif%202024_0.pdf: HTTPSConnectionPool(host='agri.odisha.gov.in', port=443): Read timed out. (read timeout=60)


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None



üîÅ Processing Query 4: asking about source of early ahu rice variety





üîÅ Processing Query 5: asking that he has not got proper friut from his coconut plant

üîÅ Processing Query 6: asking about induced breeding of fishes





üîÅ Processing Query 7: asking about training for preparation of biomanure


ERROR:trafilatura.utils:parsed tree length: 1, wrong data type or not valid HTML
ERROR:trafilatura.core:empty HTML tree: None



üîÅ Processing Query 8: asking about treatment of low production of milk in cow

üîÅ Processing Query 9: asking about the premature fruit dropping of coconut.





üîÅ Processing Query 10: asking  about  preservatives  of  tomato squash.





‚úÖ Completed! Saved to 'groq_farming_with_ref_combined.csv'


In [None]:
!pip install pandas requests --quiet

In [None]:
import os
from getpass import getpass

import pandas as pd
import requests

# Groq API key: taken from the GROQ_API_KEY environment variable, otherwise
# prompted for, so the credential is never written into the notebook.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")

In [None]:
# üß† Evaluation function using Groq's LLaMA 3
def auto_evaluate_with_groq(query, answer, reference, model="llama3-70b-8192", timeout=60):
    """Ask a Groq-hosted chat model to grade ``answer`` against ``reference``.

    Args:
        query: The farming question that was asked.
        answer: The chatbot answer to be graded.
        reference: The trusted reference answer to compare against.
        model: Groq model identifier used as the judge.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The raw text of the model's reply. The prompt requests a JSON object
        with "match", "helpfulness", "trustworthiness" and "comments", but the
        reply is returned unparsed and is not validated here.

    Raises:
        RuntimeError: The API answered with a non-success HTTP status.
        requests.RequestException: The request itself failed (network, timeout).
    """
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }

    prompt = f"""
You are a neutral and strict evaluator.

Your task is to compare the chatbot's answer to a trusted reference answer for a farming question.

### Scoring Rubric (1 to 5):
- 5 = The chatbot‚Äôs answer is fully accurate, equivalent to the reference answer in meaning and usefulness.
- 4 = The answer is mostly accurate and close to the reference but misses 1 minor point or detail.
- 3 = The answer captures the general idea but is incomplete, vague, or less specific than the reference.
- 2 = The answer is mostly incorrect or significantly less informative than the reference.
- 1 = The answer is wrong, irrelevant, or contradicts the reference.

### Instructions:
- Evaluate the chatbot's answer **only in comparison to the reference answer**.
- Focus on correctness, completeness, and whether the user would get equally useful help from both.
- Be objective and avoid inflated scores.
- Then provide a brief 1-line comment justifying your scores.

### Output Format:
Return your evaluation in this exact JSON format:
{{
  "match": <1‚Äì5>,
  "helpfulness": <1‚Äì5>,
  "trustworthiness": <1‚Äì5>,
  "comments": "<Your one-line comment here>"
}}

---

Question:
{query}

Chatbot Answer:
{answer}

Reference Answer:
{reference}
"""

    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.2,
        "max_tokens": 300
    }

    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    # Report API failures with the server's own message rather than letting
    # them surface as an opaque KeyError on 'choices' below.
    if not response.ok:
        raise RuntimeError(
            f"Groq API request failed ({response.status_code}): {response.text[:500]}"
        )
    return response.json()['choices'][0]['message']['content']


In [None]:
# üì• Load dataset from CSV
# Generated answers (with their references) written by the batch run.
df = pd.read_csv("/content/groq_farming_with_ref_combined.csv")

# Evaluate each row and collect results
results = []

for idx, row in df.iterrows():
    print(f"Evaluating Query {idx+1}: {row['query']}")

    try:
        review = auto_evaluate_with_groq(row["query"], row["answer"], row["reference"])
    except Exception as e:
        # Keep the row so one failed API call does not lose the whole run.
        review = f"Error during evaluation: {e}"

    results.append({
        "query": row["query"],
        "answer": row["answer"],
        "reference": row["reference"],
        "evaluation": review
    })

# Save evaluation results. A single variable holds the file name so the
# confirmation message always names the file that was actually written.
output_path = "groq_evaluated_output_ref2.csv"
eval_df = pd.DataFrame(results)
eval_df.to_csv(output_path, index=False)

print(f"‚úÖ Evaluation complete. Results saved to '{output_path}'")

Evaluating Query 1: asking about how to avail kisan credit card loan for sali crop.
Evaluating Query 2: asking about source of early ahu rice variety
Evaluating Query 3: asking that he has not got proper friut from his coconut plant
Evaluating Query 4: asking about induced breeding of fishes
Evaluating Query 5: asking about training for preparation of biomanure
Evaluating Query 6: asking about treatment of low production of milk in cow
Evaluating Query 7: asking about the premature fruit dropping of coconut.
Evaluating Query 8: asking  about  preservatives  of  tomato squash.
‚úÖ Evaluation complete. Results saved to 'groq_evaluated_output_ref.csv'
