In [55]:
from langchain_community.llms import Ollama
from ddgs import DDGS
import chromadb
# Load your local Ollama model
llm = Ollama(model="llama3.2")  

hroma_client = chromadb.PersistentClient(path="./web_memory")
collection = chroma_client.get_or_create_collection("web_knowledge")

# --- Web Search Function ---
def web_search(query, n=2):
    with DDGS() as ddgs:
        results = [r for r in ddgs.text(query, max_results=n)]
    return results

# --- Agent Decision Function ---
def agent_answer(query):
    # Step 1: Check DB for past info
    matches = collection.query(query_texts=[query], n_results=1)
    print(matches)
    if matches["documents"] and any(matches["documents"][0]):
        context = "\n".join(matches["documents"][0])
        print("üìÇ Answering from DB memory...")
    else:
        # Step 2: If no DB match, do web search
        print("üåê Doing web search...")
        results = web_search(query, n=3)
        context = "\n".join([r["body"] for r in results])
        # Store in DB for future use
        for i, r in enumerate(results):
            collection.add(
                documents=[r["body"]],
                ids=[f"{query[:30]}_{i}"]  # unique ID
            )

    # Step 3: Ask Ollama with context
    prompt = f"""
    You are a research assistant.
    The user asked: {query}

    Context (from memory or web search):
    {context}

    Please provide a helpful and concise answer.
    """
    return llm.invoke(prompt)

# --- Example Run ---
question = "which is most innovative dairy technology in 2025?"
answer = agent_answer(question)
print("\nü§ñ Final Answer:\n", answer)

{'ids': [['What are the latest dairy farm_1']], 'embeddings': None, 'documents': [['In 2025 , dairy producers are rapidly embracing technologies that optimize feeding regimes, bolster animal health, and maximize milk production, including:']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[None]], 'distances': [[0.45516690611839294]]}
üìÇ Answering from DB memory...

ü§ñ Final Answer:
 Based on current trends and advancements in the dairy industry, some of the most innovative dairy technology in 2025 include:

1. **Artificial Intelligence (AI) and Machine Learning (ML) for Predictive Analytics**: AI-powered systems that analyze data from various sources, such as sensor readings, cow behavior, and production records, to predict milk production, detect potential health issues, and optimize feeding regimes.
2. **Precision Feeding Systems with Automated Nutrition Management**: Advanced computerized feeding systems that use machine learning

In [76]:
from langchain_community.llms import Ollama
from ddgs import DDGS
import chromadb
import spacy

# Load your local Ollama model
llm = Ollama(model="llama3.2")  

chroma_client = chromadb.PersistentClient(path="./web_memory")
collection = chroma_client.get_or_create_collection("web_knowledge")

# --- Initialize spaCy ---
nlp = spacy.load("en_core_web_sm")

def extract_keywords(text):
    """Extract lemmas (keywords) using spaCy."""
    doc = nlp(text)
    return {token.lemma_.lower() for token in doc if token.is_alpha and not token.is_stop}

# --- Web Search Function ---
def web_search(query, n=5):
    with DDGS() as ddgs:
        results = [r for r in ddgs.text(query, max_results=n)]
    return results

# --- Agent Decision Function ---
def agent_answer(query):
    query_keywords = extract_keywords(query)

    # Step 1: Check DB for past info
    matches = collection.query(query_texts=[query], n_results=1)
    context = ""
    use_db = False

    if matches["documents"] and any(matches["documents"][0]):
        candidate_text = " ".join(matches["documents"][0])
        candidate_keywords = extract_keywords(candidate_text)

        overlap = query_keywords.intersection(candidate_keywords)
        print(f"üîë Keyword overlap: {len(overlap)}")

        if len(overlap) >= 3:  # ‚úÖ Require at least 5 keyword matches
            context = candidate_text
            use_db = True
            print("üìÇ Answering from DB memory...")
    
    if not use_db:
        # Step 2: If no strong DB match, do web search
        print("üåê Doing web search...")
        results = web_search(query, n=3)
        context = "\n".join([r["body"] for r in results])
        # Store in DB for future use
        for i, r in enumerate(results):
            collection.add(
                documents=[r["body"]],
                ids=[f"{query[:30]}_{i}"]  # unique ID
            )

    # Step 3: Ask Ollama with context
    prompt = f"""
    You are a research assistant.
    The user asked: {query}

    Context (from memory or web search):
    {context}

    Please provide a helpful and concise answer.
    """
    return llm.invoke(prompt)



In [78]:
question = "which is the most popular dairy technolgy used in everyday farming,?"
answer = agent_answer(question)
print("\nü§ñ Final Answer:\n", answer)

üîë Keyword overlap: 1
üåê Doing web search...

ü§ñ Final Answer:
 Based on the latest technological trends in dairy farming, I can conclude that one of the most popular dairy technologies used in everyday farming is Automated Milking Systems (AMS). AMS uses robotic milking machines to milk cows 24/7, reducing labor costs and increasing efficiency. Additionally, electronic sensors such as activity monitors help detect when cows are in heat, improving reproductive health.

This technology has become increasingly common on modern dairy farms due to its ability to optimize milk production while also promoting animal welfare and environmental sustainability.


In [81]:
import os
import pandas as pd
from langchain_community.llms import Ollama

# Initialize LLM
llm = Ollama(model="llama3.2")

# Directory containing CSV/Excel files
DATA_DIR = "./data_files"

# Step 1: List relevant files
def find_csv_files(keyword=None):
    files = [f for f in os.listdir(DATA_DIR) if f.endswith((".csv", ".xlsx"))]
    if keyword:
        files = [f for f in files if keyword.lower() in f.lower()]
    return files

# Step 2: Infer schema
def analyze_file(file_path):
    df = pd.read_csv(file_path) if file_path.endswith(".csv") else pd.read_excel(file_path)
    schema = {col: str(dtype) for col, dtype in df.dtypes.items()}
    return df, schema

# Step 3: Generate Python script from user query
import re

def generate_script(query, df_variable="df"):
    prompt = f"""
    User query: {query}
    Dataframe variable: {df_variable}
    Columns and types: {df_variable}.dtypes.to_dict()
    Generate a Python script using pandas to answer the query.Only respond with code as plain text without code block syntax around it
    """
    response = llm.invoke(prompt)
    
    # Extract code inside ```python ... ``` or ``` ... ```
    code_blocks = re.findall(r"```(?:python)?\n(.*?)```", response, re.DOTALL)
    if code_blocks:
        script = code_blocks[0].strip()
    else:
        # If no code block detected, return entire response as fallback
        script = response.strip()
    
    return script

# Step 4: Execute safely
def execute_script(script, df):
    local_env = {"df": df}
    exec(script, {}, local_env)
    return local_env.get("result", None)

# Example usage
files = find_csv_files(keyword="milk")
df, schema = analyze_file(os.path.join(DATA_DIR, files[0]))
script = generate_script("Compute average milk yield per cow, unique cow id, total unique cow id", df_variable="df")
result = execute_script(script, df)
print(result)


Average Milk Yield per Cow:
cow_id
0      9.767582
1      9.445007
2      9.735094
3      9.645850
4      9.981165
5     10.176560
6      9.708028
7     10.026985
8     10.038964
9      9.703181
10     9.759811
11     9.358437
12     9.966342
13     9.764467
14     9.419822
15     9.967757
16    10.170426
17    10.014831
18    10.298662
19     9.102033
Name: milk_yield, dtype: float64

Unique Cow IDs: 20
None


In [83]:
import os
import pandas as pd
import re
from langchain_community.llms import Ollama

# Initialize LLM once
llm = Ollama(model="llama3.2")

def run_query(data_path, query, keyword=None):
    """
    Process CSV/Excel data files with an LLM-generated pandas script.
    
    Args:
        data_path (str): Path to directory containing data files.
        query (str): Natural language query to answer.
        keyword (str, optional): Filter files by keyword in filename.
    
    Returns:
        Any: Result from executing the generated script (stored in variable 'result').
    """
    
    # Step 1: List files
    files = [f for f in os.listdir(data_path) if f.endswith((".csv", ".xlsx"))]
    if keyword:
        files = [f for f in files if keyword.lower() in f.lower()]
    if not files:
        raise FileNotFoundError("No matching CSV/Excel files found.")

    # Step 2: Load first matching file
    file_path = os.path.join(data_path, files[0])
    df = pd.read_csv(file_path) if file_path.endswith(".csv") else pd.read_excel(file_path)
    
    # Step 3: Ask LLM for script
    prompt = f"""
    User query: {query}
    Dataframe variable: df
    Columns and types: {df.dtypes.to_dict()}
    Generate a Python script using pandas to answer the query.
    Only respond with code as plain text without code block syntax.
    """
    response = llm.invoke(prompt)

    # Extract code from response
    code_blocks = re.findall(r"```(?:python)?\n(.*?)```", response, re.DOTALL)
    script = code_blocks[0].strip() if code_blocks else response.strip()

    # Step 4: Execute safely
    local_env = {"df": df}
    exec(script, {}, local_env)
    return local_env.get("result", None)

# Example usage:
DATA_DIR = "./data_files"
query = "Compute average milk yield per cow"
result = run_query(DATA_DIR, query, keyword="milk")
print("\nü§ñ Final Answer:\n", result)


cow_id
0      9.767582
1      9.445007
2      9.735094
3      9.645850
4      9.981165
5     10.176560
6      9.708028
7     10.026985
8     10.038964
9      9.703181
10     9.759811
11     9.358437
12     9.966342
13     9.764467
14     9.419822
15     9.967757
16    10.170426
17    10.014831
18    10.298662
19     9.102033
Name: milk_yield, dtype: float64

ü§ñ Final Answer:
 None


In [93]:
import os
import re
import pandas as pd
from langchain_community.llms import Ollama

# Initialize LLM
llm = Ollama(model="llama3.2")

def agent_answer(question):
    """
    Handle user query about datasets with automatic file selection and LLM script generation.
    User only needs to provide the question.
    
    Args:
        question (str): User's natural language question.
    
    Returns:
        Any: Result from executing the generated pandas script (variable 'result').
    """
    
    # Data directory (fixed internally)
    DATA_DIR = "./data_files"
    
    # Step 1: Infer keyword based on question
    keyword = None
    if "milk" in question.lower():
        keyword = "milk"
    
    # Step 2: Find matching files
    files = [f for f in os.listdir(DATA_DIR) if f.endswith((".csv", ".xlsx"))]
    if keyword:
        files = [f for f in files if keyword.lower() in f.lower()]
    if not files:
        raise FileNotFoundError(f"No matching files found for keyword '{keyword}'.")
    
    # Step 3: Load first file
    file_path = os.path.join(DATA_DIR, files[0])
    df = pd.read_csv(file_path) if file_path.endswith(".csv") else pd.read_excel(file_path)
    
    # Step 4: Generate script using LLM
    prompt = f"""
    User query: {question}
    Dataframe variable: df
    Columns and types: {df.dtypes.to_dict()}
    Generate a Python script using pandas to answer the query.
    Only respond with code as plain text without code block syntax or examples
    """
    response = llm.invoke(prompt)
    
    # Extract code from response
    code_blocks = re.findall(r"```(?:python)?\n(.*?)```", response, re.DOTALL)
    script = code_blocks[0].strip() if code_blocks else response.strip()
    
    # Step 5: Execute safely
    local_env = {"df": df}
    exec(script, {}, local_env)
    
    return local_env.get("result", None)

# ---------------- Example usage ----------------
question = "What is the average milk production per cow?"
result = ask_llm(question)
print("\nü§ñ Final Answer:\n", result)



ü§ñ Final Answer:
 None


In [95]:
question = "What is the average milk production for each cow?"
result = agent_answer(question)
print("\nü§ñ Final Answer:\n", result)

cow_id
0      9.767582
1      9.445007
2      9.735094
3      9.645850
4      9.981165
5     10.176560
6      9.708028
7     10.026985
8     10.038964
9      9.703181
10     9.759811
11     9.358437
12     9.966342
13     9.764467
14     9.419822
15     9.967757
16    10.170426
17    10.014831
18    10.298662
19     9.102033
Name: milk_yield, dtype: float64

ü§ñ Final Answer:
 None
