In [3]:
!pip install pypdf



# Import and processs PDF data

In [4]:
from pypdf import PdfReader

reader = PdfReader("The indigenous badugar of the Nilgiris.pdf")

In [5]:
text = ""
for page in reader.pages:
    text += page.extract_text() + "\n"

for line in text.split("\n"):
    print(line)

  
25 
International Journal of Humanities and Social Science Research 
www.socialsciencejournal.in 
ISSN: 2455-2070 
Received: 19-12-2021, Accepted: 02-01-2022, Published: 19-01-2022 
Volume 8, Issue 1, 2022, Page No. 25-28 
The indigenous badugar of the Nilgiris 
H R Sumathi 
Assistant Professor, Department of History, Vellalar College for Women, Erode, Tamil Nadu, India 
 
 
Abstract 
Nilgiris district, also called as „The Nilgirs‟, hills‟, is one of the smallest District of Tamil Nadu. Etymologically the wo rd 
„Nilgiris‟ means, Blue Mountains‟. The district is a hill area of 2549.0 sq.kms. located between 11‟ 10‟ and 11‟30‟ N latitude 
and between 76‟25‟ and 77‟ 00‟E longitude at the junction of the Eastern and the Western Ghats, the two prominent mountain 
ranges that run almost parallel to the coastline of peninsular India. With an average elevation of 6500 ft. the Nilgiris Dist rict is 
bound on the west by Kerala on the north by Karnataka, and on the Southeast by Coimbatore di

In [6]:
# split data in chunks
chunks = []
N = len(text)
chunk_size = 500
for i in range(0, N, chunk_size):
    chunk = text[i: i + chunk_size]
    chunks.append(chunk)


In [7]:
chunks

['  \n25 \nInternational Journal of Humanities and Social Science Research \nwww.socialsciencejournal.in \nISSN: 2455-2070 \nReceived: 19-12-2021, Accepted: 02-01-2022, Published: 19-01-2022 \nVolume 8, Issue 1, 2022, Page No. 25-28 \nThe indigenous badugar of the Nilgiris \nH R Sumathi \nAssistant Professor, Department of History, Vellalar College for Women, Erode, Tamil Nadu, India \n \n \nAbstract \nNilgiris district, also called as „The Nilgirs‟, hills‟, is one of the smallest District of Tamil Nadu. Etymo',
 'logically the wo rd \n„Nilgiris‟ means, Blue Mountains‟. The district is a hill area of 2549.0 sq.kms. located between 11‟ 10‟ and 11‟30‟ N latitude \nand between 76‟25‟ and 77‟ 00‟E longitude at the junction of the Eastern and the Western Ghats, the two prominent mountain \nranges that run almost parallel to the coastline of peninsular India. With an average elevation of 6500 ft. the Nilgiris Dist rict is \nbound on the west by Kerala on the north by Karnataka, and on the So

# Embed the text into embeddings

In [8]:
import os
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    base_url=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_KEY")
)

In [9]:
from langchain_core.vectorstores import InMemoryVectorStore

vectorstore = InMemoryVectorStore(
    embedding=embeddings
)

In [10]:
from langchain_core.documents import Document

documents = []
for chunk in chunks:
    documents.append(Document(chunk))

In [11]:
vectorstore.add_documents(documents)

['6cbe5d2b-366e-439e-9621-4ffa60eca6b2',
 '7d136046-39dd-439a-9af8-fd21af09fdcd',
 'b9fbe739-4fef-4c29-aaa7-03f70e61ccf9',
 'ba38d9b3-37c9-4627-a43f-311605270d64',
 '4b1087fd-3878-4074-b16a-b90a6a1cb170',
 '5df903a3-9982-42ca-bd27-ca7d5bec8c77',
 '2ce048e6-fe6d-4a23-add4-00ebc6bc652e',
 '18ae5282-613f-4618-8411-1f72c9f1f47f',
 '8e1f8f75-c20f-4afe-a437-3c09c9429099',
 '6ed68fe4-f291-40a0-90ef-8af2e75c88d4',
 '6aefd9c0-2624-47d1-a1cd-bb05ec284fcc',
 '5ed0371e-9319-4b21-b892-1d3dc1e54a4a',
 '56df1e78-c3b7-42d3-af5a-e7990697ed6c',
 '89401bd9-130f-44d1-90cc-3862f352f796',
 '285e8e5d-0e6f-4066-b351-9f83b753f44f',
 '2cdfc0ac-5768-4520-a9b7-6c76b2328419',
 '6a9a81d4-49b8-45fd-92eb-36f7b838aa93',
 '62c6245a-8bc5-471c-b9bf-12d76f9d31b0',
 '2bc866fa-d688-4b58-89fb-efe449628437',
 '0fcca554-d3b2-4bbd-aedb-6b2e5b248dfd',
 '16eccff5-2a28-4828-b612-c97cd7195dac',
 'fd564b49-c31c-472e-8ea3-595de78161ab',
 'f40f7693-f8f3-41d8-97f2-7bf2f9cc76e9',
 'd9e05b0a-8d1d-44d8-8ec2-0049ac40569f',
 'c3b196ab-b83b-

In [12]:
retriever = vectorstore.as_retriever()

retriever.invoke("Who are badagas")

[Document(id='f40f7693-f8f3-41d8-97f2-7bf2f9cc76e9', metadata={}, page_content='supposed to have migrated to the Nilgiris from \nMysore. \n[2] (Breeks J.W.,1873:4) \n3. The Badagas or Vadagas are supposed to have come \nfrom the north.[6](Hunter W.W.,1886:67,310 \n4. The word Badaga means “people of the  North and \nhence it is supposed that they have  come originally \nfrom the North , probably the northern part of Mysore \nand Canara.[12] (Sheering M.A.,1974:171) \n5. We are thus left with the presumption that the Badaga \nare so called because of their Northern or igin. These \nvie'),
 Document(id='bad6ab2d-7cfe-404c-bac1-1ebe4693a37e', metadata={}, page_content='jati , Girijans and \nunadvanced are also used to specify the tribe. But all the \nimpressions accept some common features of Badugas are \nas follows: \n1. Definite Geographical te rritory: they had been living \nin the Nilgiri and no other habitat out side its territory \n2. common name: Badugar is the common name of thei

# Talk to LLM with RAG information

In [13]:
from langchain.chat_models import init_chat_model

model = init_chat_model(
    model="gpt-4.1-mini",
    base_url=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_KEY")
)

In [14]:
system_prompt = """
You are an helpful assiatant. Use the "INFORMATION" provided along with the "QUERY" to answer the query. Ground your answers to the provided information and previous history.
If you could not find defenitive information tell there is not enough info on the provided documents.
"""


In [15]:
from langchain.agents import create_agent


agent = create_agent(
    model=model,
    tools=[],
    system_prompt=system_prompt
)

In [16]:
def read_data_from_documents(documents):
    data = ""
    for doc in documents:
        data += doc.page_content + "\n"
    
    return data

In [25]:
from langchain.messages import HumanMessage
def ask_llm(prompt):
    documents = retriever.invoke(prompt)
    rag_data = ""
    rag_data += read_data_from_documents(documents)

    # print("RAG Data: ", rag_data)
    augmented_prompt = f"""
    Information: 
    {rag_data}

    Query:
    {prompt}
    """
    # print("Augmented Prompt: ", augmented_prompt)
    response = agent.invoke(
        {"messages":[HumanMessage(augmented_prompt)]}, 
        {"configurable": {"thread_id": "2"}}
        )

    return response

In [27]:
while True:
    prompt = input("Enter your query: ")
    print("You: ", prompt, end="\n\n")
    response = ask_llm(prompt)
    print("AI: ", response["messages"][-1].content, end="\n\n")
    print("====================================================================================")

You:  Hi

AI:  Hello! How can I assist you today? If you have any questions or need information based on the provided document about the indigenous Badugar of the Nilgiris, feel free to ask.

You:  Who are Badagas?

AI:  The Badagas, also known as Badugars, are an indigenous community primarily living in the Nilgiris district. Traditionally, they were pastor-swiddeners and spoke the Badugu language, which is rich in idioms, proverbs, folk songs, legends, and more. Their population was recorded as 34,152 in 1908, making them the most populous people in the Nilgiri district.

There has been a historical belief that the Badagas migrated from the north, possibly from the northern part of Mysore and Canara, as the word "Badaga" was thought to mean "people of the North" in Kannada. However, this theory of them being northerners or migrants from Mysore is now considered old and unestablished. Presently, they are recognized as the community of the Great Badagas, indigenous to the Nilgiris.

Ke

KeyboardInterrupt: Interrupted by user