In [None]:
import pandas as pd
from langchain.document_loaders import PyPDFLoader, DirectoryLoader,CSVLoader,TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
%pwd

'/Users/pratikshankar/code/genai_projects/agri_farm_bot/research'

In [7]:
import os
os.chdir("../")

In [None]:
df_agr=pd.read_csv('Data/agriculture_mock_data.csv')
df_dairy=pd.read_csv('Data/dairy_mock_data.csv')
df=pd.concat([df_agr, df_dairy], axis=0)

In [11]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Access the API key
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")

In [None]:
df_agr=pd.read_csv('Data/mock_agriculture_farm_10yrs.csv')


In [None]:
df_agr.groupby('Year')['Yield_quintals'].mean().plot(title='Average Yield per Year', xlabel='Year', ylabel='Yield')

### chunking and vectorizing the knowledge data

In [None]:
def load_pdf_file(data):
    loader_pdf=DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents_pdf = loader_pdf.load()

    loader_csv=DirectoryLoader(data, glob="*.csv", loader_cls=CSVLoader)
    documents_csv = loader_csv.load()
    

    loader_txt=DirectoryLoader(data, glob="*.txt",loader_cls=TextLoader)
    documents_txt = loader_txt.load()

    documents = documents_csv+documents_pdf+documents_txt



    return documents

In [45]:
extracted_data=load_pdf_file(data="Data/")


In [52]:
lst=[]
for d in extracted_data:
    if d.metadata['source'].endswith('.txt'):
        if d.metadata['source'] not in lst:
            lst.append(d.metadata['source'])
            print("CSV File",d.metadata['source'])
      
        

CSV File Data/mock_data_farm_2 years.txt


In [46]:
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20,
        length_function=len,
        
    )
    texts = text_splitter.split_documents(extracted_data)
    return texts

In [47]:
text_chunks=text_split(extracted_data=extracted_data)
print(f"Total number of chunks: {len(text_chunks)}")

Total number of chunks: 1000


In [48]:
from langchain.embeddings import HuggingFaceEmbeddings
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')#'sentence-transformers/all-MiniLM-L6-v2')
    return embeddings

In [49]:
embeddings = download_hugging_face_embeddings()


In [None]:
embeddings

In [None]:
test_embed=embeddings.embed_query("What is the capital of France?")
print(test_embed)
print(len(test_embed))

In [50]:
from pinecone import ServerlessSpec
from pinecone import Pinecone
index_name="agri-rag-bot"


pc=Pinecone(api_key=PINECONE_API_KEY)
from pinecone import ServerlessSpec
if not pc.has_index(name=index_name):
    pc.create_index(
        name=index_name,
        dimension=384,
        metric='dotproduct',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
index=pc.Index(name=index_name)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'dotproduct',
 'namespaces': {'': {'vector_count': 17104}},
 'total_vector_count': 17104,
 'vector_type': 'dense'}

In [51]:
documents = load_pdf_file('Data/')
text_chunks = text_split(documents)
embeddings = download_hugging_face_embeddings()
print("Proceeding with embeddings")

embedded_vectors = embeddings.embed_documents([chunk.page_content for chunk in text_chunks])
print("Embeddings generated")
def clean_metadata(meta: dict) -> dict:
    return {
        "text": meta.get("text", "")[:500],
        "source": meta.get("source", "unknown"),
        "page": meta.get("page", -1) if meta.get("page") is not None else -1
    }
# Prepare vectors for Pinecone
vectors = []
for i, (text_chunk, vector) in enumerate(zip(text_chunks, embedded_vectors)):
    
        metadata= {
            "text": text_chunk.page_content[:500],  # Keep metadata light!
            "source": text_chunk.metadata.get("source"),
            "page": text_chunk.metadata.get("page")
        }
        vectors.append({
        "id": f"chunk-{i}",
        "values": vector,
        "metadata": clean_metadata(metadata)
    })
    
print("Vectors prepared for Pinecone")
# === Batch Upload Safely ===
def batch_upsert(index, vectors, batch_size=50):
    for i in range(0, len(vectors), batch_size):
        print(f"Uploading batch {i // batch_size + 1} of {len(vectors) // batch_size + 1}")
        batch = vectors[i:i+batch_size]
        index.upsert(vectors=batch)
# index_name="agri-rag-bot"
batch_upsert(index, vectors, batch_size=40)  # 40 is a safe number for 768 dim
print("✅ All chunks uploaded safely without exceeding Pinecone size limits.")

Proceeding with embeddings
Embeddings generated
Vectors prepared for Pinecone
Uploading batch 1 of 26
Uploading batch 2 of 26
Uploading batch 3 of 26
Uploading batch 4 of 26
Uploading batch 5 of 26
Uploading batch 6 of 26
Uploading batch 7 of 26
Uploading batch 8 of 26
Uploading batch 9 of 26
Uploading batch 10 of 26
Uploading batch 11 of 26
Uploading batch 12 of 26
Uploading batch 13 of 26
Uploading batch 14 of 26
Uploading batch 15 of 26
Uploading batch 16 of 26
Uploading batch 17 of 26
Uploading batch 18 of 26
Uploading batch 19 of 26
Uploading batch 20 of 26
Uploading batch 21 of 26
Uploading batch 22 of 26
Uploading batch 23 of 26
Uploading batch 24 of 26
Uploading batch 25 of 26
✅ All chunks uploaded safely without exceeding Pinecone size limits.


In [13]:
#initializing docsearch with the existing index
from langchain_pinecone import PineconeVectorStore
docsearch=PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [14]:
retriver=docsearch.as_retriever(search_type="similarity",search_kwargs={"k": 100})


In [None]:
query="What is my farm's yeild per year look like?"
retriver.invoke(input=query)

In [15]:
### initilizing the LLM
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash-001",
    api_key=GEMINI_API_KEY,
    temperature=0.8,
    max_tokens=1024,
    timeout=None,
    max_retries=2
)

In [61]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import Chroma
# from langchain.embeddings import GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI


# system_prompt=(
#     "You are an aggriculture expert.Answer teh farmer's query based on the data provided to you.You have farm specific history and the knowledge texts.Answer Concisely so that the farmer can understand.Also mention the source of the result{context} "
# )
system_prompt = ("""
You are AgriBot, a multilingual expert assistant trained on detailed agricultural and dairy farm records. 
You have access to embedded farm data including crop cultivation logs, climate records, fertilizer usage, cow health records, milk production logs, and disease history from PDFs and CSV files.

Your job is to:
1. Analyze this farm-specific knowledge base.
2. Answer user questions accurately using only the information from these documents.
3. If the query is a summarization request, provide concise summaries based on the relevant data.
4. If the answer is not present in the data, respond with "The required information is not available in the farm documents."
5. Support both English and local language queries (Kannada, Hindi, Marathi, etc.) using multilingual understanding.

Always be concise, clear, and factual{context}.
""")
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}")
])

question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriver, question_answer_chain)

# Query
query = "how to treat brucellosis?"
response = rag_chain.invoke({"input": query})

print(response["answer"])

Bacillus anthracis is sensitive to a number of antibiotics and treatment of animals during the early stages of the disease is likely to be successful, although severely ailing animals are unlikely to recover.


In [23]:
from deep_translator import GoogleTranslator

supported_langs = {
    "hi": "hindi",
    "mr": "marathi",
    "kn": "kannada",
    "en": "english"
}

def detect_and_translate_to_english(text):
    return GoogleTranslator(source='auto', target='english').translate(text)

def translate_from_english(text, target_lang_code="hi"):
    target_lang = supported_langs.get(target_lang_code, "english")
    return GoogleTranslator(source='english', target=target_lang).translate(text)


In [63]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import Chroma
# from langchain.embeddings import GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI
from google.cloud import translate_v2 as translate


system_prompt=(
    "You are an aggriculture expert.Answer teh farmer's query based on the data provided to you.You have farm specific history and the knowledge texts.Answer Concisely so that the farmer can understand.Also mention the source of the result{context} "
)
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}")
])

question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriver, question_answer_chain)

# Query
query = "दूध में फैट बढ़ाने के उपाय बताइए?"



response = rag_chain.invoke({"input": query})


print(response["answer"])

दूध में वसा की मात्रा बढ़ाने के कुछ उपाय इस प्रकार हैं:

* **संतुलित आहार:** गाय को उचित मात्रा में हरा चारा, सूखा चारा और दाना खिलाएं। आहार में फाइबर की मात्रा पर्याप्त होनी चाहिए।
* **वसा युक्त आहार:** आहार में तेल बीज, सोयाबीन, सूरजमुखी के बीज या अलसी जैसे वसा युक्त खाद्य पदार्थों को शामिल करें।
* **पानी:** गाय को पर्याप्त मात्रा में पानी पिलाएं।
* **स्वस्थ रखें:** गाय को बीमारियों से बचाएं, खासकर मास्टिटिस से, जो दूध की गुणवत्ता को प्रभावित कर सकती है।
* **नियमित दूध दुहना:** गाय को नियमित अंतराल पर पूरी तरह से दुहना चाहिए। अपूर्ण दुहने से दूध में वसा की मात्रा कम हो सकती है।
* **दुहने का समय:** दूध में वसा की मात्रा दुहने की प्रक्रिया के दौरान लगातार बढ़ती है। पहले निकाले गए दूध में केवल 1-2% वसा हो सकती है, जबकि अंत में निकाले गए दूध में 5-10% वसा हो सकती है।

इन उपायों को अपनाकर आप अपनी गाय के दूध में वसा की मात्रा बढ़ा सकते हैं।

**स्रोत:**

*   पशुपालन विशेषज्ञ
