In [None]:
! pip install PyPDF2
! pip install gTTS
! pip install langchain==0.1.14 langchain-core==0.1.38 pydantic==1.10.13

# RAG pipeline

In [9]:
# FINAL WORKING CODE


import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# 1. Enhanced Document Processing
def load_documents():
    documents = []
    pdf_folder = "/kaggle/input/rag-test"
    
    for file in os.listdir(pdf_folder):
        if file.endswith(".pdf"):
            try:
                loader = PyPDFLoader(os.path.join(pdf_folder, file))
                documents.extend(loader.load_and_split())
            except Exception as e:
                print(f"Error loading {file}: {str(e)}")
    
    # Better text splitting for legal documents
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        separators=["\n\n", "\n", "। ", "§", "(a)", "(b)"]
    )
    return text_splitter.split_documents(documents)

# 2. Fixed Pipeline Configuration
def create_qa_chain():
    model_name = "google/flan-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda:0")

    # Correct generation parameters
    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        device=0,
        max_length=512,
        do_sample=True,
        temperature=0.7,
        top_k=40,
        truncation=True
    )

    # Improved prompts
    question_prompt = PromptTemplate(
        template="Context:\n{context}\n\nQuestion: {question}\nAnswer in simple Hindi/English:",
        input_variables=["context", "question"]
    )

    combine_prompt = PromptTemplate(
        template="Combine these answers clearly:\n{summaries}\n\nFinal Question: {question}\nDetailed Answer:",
        input_variables=["summaries", "question"]
    )

    # Create vector store
    vectorstore = FAISS.from_documents(
        load_documents(),
        HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    )

    return RetrievalQA.from_chain_type(
        llm=HuggingFacePipeline(pipeline=pipe),
        chain_type="map_reduce",
        retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
        chain_type_kwargs={
            "question_prompt": question_prompt,
            "combine_prompt": combine_prompt,
            "combine_document_variable_name": "summaries"
        },
        return_source_documents=True
    )

# 3. Smarter Answer Formatting
def farmer_friendly_answer(result):
    answer = result['result'].strip()
    
    # Check for empty answers
    if not answer or len(answer) < 20:
        return "Please visit your nearest Krishi Seva Kendra for details or Call KISAN CALL CENTER Toll Free No.1800-180-1551"
    
    # Simplify technical terms
    replacements = {
        "Sponsor": "Company",
        "agreement": "contract",
        "permanent structure": "permanent building",
        "temporary modification": "temporary change"
    }
    
    for old, new in replacements.items():
        answer = answer.replace(old, new)
    
    return answer

# Query Interface
def ask_farmer(query):
    qa = create_qa_chain()
    result = qa({"query": query})
    
    print("\n🌾" + "="*50 + "🌾")
    print(f"Question: {query}")
    print("🌱" + "-"*50 + "🌱")
    answer=farmer_friendly_answer(result)
    print("Answer:", answer)
    print("\nRelevant Sources:")
    for doc in result['source_documents']:
        print(f"• {os.path.basename(doc.metadata['source'])} (Page {doc.metadata['page']+1})")
    print("🌾" + "="*50 + "🌾\n")
    return answer

# Example Usage
if __name__ == "__main__":
    ask_farmer("Can companies build permanent buildings on my land?")
    ask_farmer("What is the farming agreement act?")

Device set to use cuda:0



Question: Can companies build permanent buildings on my land?
🌱--------------------------------------------------🌱
Answer: Please visit your nearest Krishi Seva Kendra for details or Call KISAN CALL CENTER Toll Free No.1800-180-1551

Relevant Sources:
• Guidelines farm services Act 2020 -converted_4.pdf (Page 8)
• Farming Agreement Final Act _4.pdf (Page 5)
• Guidelines farm services Act 2020 -converted_4.pdf (Page 8)



Device set to use cuda:0



Question: What is the farming agreement act?
🌱--------------------------------------------------🌱
Answer: National framework on farming contracts that protects and empowers farmers to engage with agri-business firms, processors, wholesalers, exporters or large retailers for farm services

Relevant Sources:
• Farming Agreement Final Act _4.pdf (Page 1)
• Guidelines farm services Act 2020 -converted_4.pdf (Page 1)
• Farming Agreement Final Act _4.pdf (Page 5)



In [35]:
texts=ask_farmer("what are latest schemes for farmers?")

Device set to use cuda:0



Question: what are latest schemes for farmers?
🌱--------------------------------------------------🌱
Answer: FARMERS (EMPOWERMENT AND PROTECTION) AGREEMENT ON PRICE ASSURANCE AND FARM SERVICES ACT, 2020

Relevant Sources:
• Farming Agreement Final Act _4.pdf (Page 1)
• Farming Agreement Final Act _4.pdf (Page 2)
• Guidelines farm services Act 2020 -converted_4.pdf (Page 1)



In [36]:
print(texts)


FARMERS (EMPOWERMENT AND PROTECTION) AGREEMENT ON PRICE ASSURANCE AND FARM SERVICES ACT, 2020


# English to Hindi and Punjabi Translation

In [37]:
# Import libraries
from transformers import MarianMTModel, MarianTokenizer
import torch

# Load English to Hindi model and tokenizer
model_name = 'Helsinki-NLP/opus-mt-en-hi'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Function to translate text
def translate(texts):
    batch = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        generated = model.generate(**batch)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

In [38]:
# Translate and print results
hindi = translate(texts)
for eng, trans in zip(texts, hindi):
    print(f"{eng} → {trans}")

F → FARM लोग (प्रयोग और लाभ) लाभ और FAR सेवा ATCT, 2020


In [39]:
hindi

['FARM लोग (प्रयोग और लाभ) लाभ और FAR सेवा ATCT, 2020']

In [44]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

model_name = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

# text = "How are you?"
tokenizer.src_lang = "en"
encoded = tokenizer(texts, return_tensors="pt")
generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id("pa"))
punjabi=tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
print(punjabi)

ਉਤਪਾਦ ਜ pricelist ਬਾਰੇ ਪੁੱਛ-ਗਿੱਛ ਲਈ, ਸਾਡੇ ਲਈ ਆਪਣੇ ਈ-ਮੇਲ ਨੂੰ ਛੱਡ, ਕਿਰਪਾ ਕਰਕੇ ਹੈ ਅਤੇ ਸਾਨੂੰ ਸੰਪਰਕ ਵਿੱਚ 24 ਘੰਟੇ ਦੇ ਅੰਦਰ-ਅੰਦਰ ਹੋ ਜਾਵੇਗਾ.


In [41]:
punjabi=tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
print(punjabi)

ਉਤਪਾਦ ਜ pricelist ਬਾਰੇ ਪੁੱਛ-ਗਿੱਛ ਲਈ, ਸਾਡੇ ਲਈ ਆਪਣੇ ਈ-ਮੇਲ ਨੂੰ ਛੱਡ, ਕਿਰਪਾ ਕਰਕੇ ਹੈ ਅਤੇ ਸਾਨੂੰ ਸੰਪਰਕ ਵਿੱਚ 24 ਘੰਟੇ ਦੇ ਅੰਦਰ-ਅੰਦਰ ਹੋ ਜਾਵੇਗਾ.


# Text to audio 

In [4]:
from gtts import gTTS
from IPython.display import Audio

# Hindi text
text = "हाँ, आप प्रधानमंत्री किसान सम्मान निधि (PM-KISAN) योजना के लिए पात्र हैं।  इस योजना के तहत आपको सालाना ₹6,000 तीन किश्तों में सीधे आपके बैंक खाते में मिलते हैं।  जरूरी दस्तावेज़ों में शामिल हैं:1. भूमि का दस्तावेज़ (जैसे खसरा-खतौनी) 2. आधार कार्ड 4. पहचान प्रमाण | आप नजदीकी CSC सेंटर पर जाकर आवेदन कर सकते हैं।"
language = "hi"

# Convert to speech
tts = gTTS(text=text, lang=language)
tts.save("output1.mp3")

# Play audio in the notebook
Audio("output1.mp3")


In [2]:
!pip install gtts

Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Installing collected packages: gtts
Successfully installed gtts-2.5.4


In [43]:
# Example: Punjabi sentence

language = "pa"  # Specify the language code (e.g., 'hi' for Hindi)

# Convert text to speech
tts = gTTS(text=punjabi, lang=language)
tts.save("output1.mp3")

# Play audio in the notebook
Audio("output1.mp3")