In [10]:
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import io
import re
from google import genai


# Read variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# load_dotenv() already places the key in os.environ, so copying it back onto itself
# adds nothing -- and assigning None to os.environ raises an opaque
# "str expected, not NoneType" TypeError when the key is absent.
# Check explicitly and fail early with an actionable message instead.
if not os.getenv("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Add it to a .env file or export it "
        "in the environment before running this notebook."
    )

In [11]:
# Collect the files matching *.pdf in ./dataset and parse each with PyPDFLoader.
loader = DirectoryLoader(path="./dataset", glob="*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

# Collapse every run of whitespace (newlines, tabs, repeated spaces) into a single space.
whitespace_run = re.compile(r"\s+")
for doc in documents:
    doc.page_content = whitespace_run.sub(" ", doc.page_content)

print(f"Loaded {len(documents)} documents")

Loaded 371 documents


In [12]:
# Sanity check: show the first 500 characters of the first loaded document.
first_doc_preview = documents[0].page_content[:500]
print(f"Example document: {first_doc_preview}...")

Example document: POLITICS,PHILOSOPHY& ECONOMICSHANDBOOK ASHOKAUNIVERSITY This guide was made bythe PPERepresentatives inconsultationwithFacultymembers. We acknowledge thework of the previous representatives in setting up the handbook, this version of the Student Handbookis anadaptationofthepreviousversion DISCLAIMER: The information in this guide is meant to serve as a ready reference for students. Someinformationmaybecomebackdatedwithchangesincurriculum,policiesetc.Thus,youarerequestedtoconfirmthe latest update...


In [13]:
# Chunking parameters handed to the splitter.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# NOTE(review): page text has already had all whitespace collapsed to single
# spaces in the loading cell, so the "\n" separator presumably never matches
# and splitting effectively happens on " " only -- confirm this is intended.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n", " "],
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)
chunks = text_splitter.split_documents(documents)

print(f"Split into {len(chunks)} chunks")
# Arbitrary chunk picked for eyeballing; index 30 requires at least 31 chunks.
sample_chunk = chunks[30]
print(f"Example chunk: {sample_chunk.page_content}")

Split into 1467 chunks
Example chunk: Anyelective Note1:Thetwoelectivecoursesmustbeselectedfromdifferentcategories.SymbolicLogic countsasanelective Note2:ThetwoelectivesmustbetaughtbythePhilosophyfacultyi.e.,coursescross-listed withthePhilosophydepartmenttaughtbyafacultymemberofanotherdepartmentwillnot counttowardstheserequirements. Note3:CoursesofferedbythePhilosophydepartmentinthepastcanbefoundhere, while thosecurrentlybeingofferedandthosesoontobeofferedcanbefoundhere. Pages12of16


In [14]:
# Tunables for the vector index.
EMBEDDING_MODEL = "models/embedding-001"
RETRIEVER_TOP_K = 3

# Embed every chunk and index the vectors in a FAISS store.
embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL)
vectorstore = FAISS.from_documents(chunks, embeddings)

# Retriever view over the store, configured with k=RETRIEVER_TOP_K.
retriever = vectorstore.as_retriever(search_kwargs={"k": RETRIEVER_TOP_K})

In [15]:
# Exercise the retriever on its own before wiring it into a chain.
# `invoke` is the Runnable entry point that supersedes the deprecated
# `get_relevant_documents` method.
ret = retriever.invoke("Who is the Chemistry representative?")

for doc in ret:
    # .get() keeps the preview working when a document has no 'source' entry,
    # consistent with how the later question-loop cells print sources.
    print(f"Source: {doc.metadata.get('source', 'Unknown')}, Text: {doc.page_content[:300]}...")

Source: dataset/Chemistry Student Handbook M23.pdf, Text: Frequently Asked Questions Youcandirectyourqueriestothefollowingpersonsofcontact: ChemistryStudentRepresentative, :chem.rep@ashoka.edu.inPrashuchi Pandey UGProgrammeCoordinator,Dr. :aryya.ghosh@ashoka.edu.inAryya Ghosh Head,DepartmentofChemistry, Dr. :hod.che@ashoka.edu.inSourav Pal Canlaboratorycou...
Source: dataset/Chemistry Student Handbook M23.pdf, Text: Table of Contents ChemistryatAshoka 1TheUndergraduateMajor 3Electives 4Minor&Concentration 5CourseTimeline 5FrequentlyAskedQuestions 7 1...
Source: dataset/Chemistry Student Handbook M23.pdf, Text: Chemistry Department Student HandbookUpdatedMonsoon2023...


In [16]:
# Prompt text: instructs the model to answer from the supplied context only and
# gives it a fixed fallback sentence for when the context lacks the answer.
template = """
You are an AI assistant providing helpful, accurate, and concise responses based on the given context.

Context: {context}

Question: {question}

Answer the question using only the provided context. If the context doesn't contain the answer, say "I don't have enough information to answer this question."
"""

prompt = PromptTemplate(input_variables=["context", "question"], template=template)

# Chat model used to generate answers.
CHAT_MODEL = "gemini-2.0-flash"
llm = ChatGoogleGenerativeAI(model=CHAT_MODEL, temperature=0)

# NOTE(review): `embedding` is not referenced by any visible cell (the vector
# store is built from `embeddings`); kept in case other cells rely on it.
embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# "stuff" chain type with the custom prompt, backed by the FAISS retriever.
rag_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
)


In [17]:

def ask_rag(question):
    """Run a question through the RAG chain and return the answer.

    Args:
        question: The question to ask; sent to ``rag_chain`` under the
            "query" key.

    Returns:
        The value stored under "result" in the chain's response.
    """
    return rag_chain.invoke({"query": question})["result"]


# NOTE(review): ask_rag passes only the current question to the chain, so the
# follow-up "What is their email address?" presumably has no referent for the
# model -- confirm whether conversational context was intended here.
questions = [
    "How many credits do i need for a major in Computer Science?",
    "Who is the chemistry representative?",
    "What kinds of courses are offered by the IR department?",
    "Who can I contact in the Computer Science Department?",
    "What is their email address?"
]

for question in questions:
    print(f"Question: {question}")
    answer = ask_rag(question)
    print(f"Answer: {answer}")

    # Query the retriever again purely to display supporting passages.
    # `invoke` replaces the deprecated `get_relevant_documents` method.
    source_docs = retriever.invoke(question)
    print("\nSource documents:")
    for i, doc in enumerate(source_docs, 1):
        print(f"{i}. Source: {doc.metadata.get('source', 'Unknown')}")
        print(f"   Excerpt: {doc.page_content[:500]}...\n")
    print("-" * 50)

Question: How many credits do i need for a major in Computer Science?
Answer: To major in Computer Science, you need the following:

*   Foundation and Critical Thinking courses (36 credits)
*   Co-Curricular courses (4 credits)
*   Computer Science Major courses:
    *   10 compulsory core courses (40 credits)
    *   A minimum of 20 credits of electives

Therefore, you need a total of 36 + 4 + 40 + 20 = 100 credits for a major in Computer Science.

Source documents:
1. Source: dataset/CS Handbook 2023.pdf
   Excerpt: CSMinor RequirementsIn order to get a Minor in Computer Science, students are required totake24credits madeupas follows: 1. IntroductiontoComputer Programming(4credits)2. 3CScorecourses (3x 4= 12credits)3. Any 2CScourses (2x 2= 8credits) Interdisciplinary MajorsIn order to Major in an interdisciplinary program, students must accumulate 116credit pointsinthreeyears. TheComputer Sciencedepartment oﬀers threeinterdisciplinary Majors ● Computer ScienceandMathematics● Compute

In [18]:
# Second batch of questions; each one is answered independently by the chain.
questions = [
    "How many credits do i need for a major in Computer Science?",
    "Who is the chemistry representative and how do I contact them?",
    "What kinds of courses are offered by the IR department?"
]

for question in questions:
    print(f"Question: {question}")
    answer = ask_rag(question)
    print(f"Answer: {answer}")

    # Query the retriever again purely to display supporting passages.
    # `invoke` replaces the deprecated `get_relevant_documents` method.
    source_docs = retriever.invoke(question)
    print("\nSource documents:")
    for i, doc in enumerate(source_docs, 1):
        print(f"{i}. Source: {doc.metadata.get('source', 'Unknown')}")
        print(f"   Excerpt: {doc.page_content[:500]}...\n")
    print("-" * 50)

Question: How many credits do i need for a major in Computer Science?
Answer: To major in Computer Science, you need the following:

*   Foundation and Critical Thinking courses (36 credits)
*   Co-Curricular courses (4 credits)
*   Computer Science Major courses:
    *   10 compulsory core courses (40 credits)
    *   A minimum of 20 credits of electives

Therefore, you need a total of 36 + 4 + 40 + 20 = 100 credits for a major in Computer Science.

Source documents:
1. Source: dataset/CS Handbook 2023.pdf
   Excerpt: CSMinor RequirementsIn order to get a Minor in Computer Science, students are required totake24credits madeupas follows: 1. IntroductiontoComputer Programming(4credits)2. 3CScorecourses (3x 4= 12credits)3. Any 2CScourses (2x 2= 8credits) Interdisciplinary MajorsIn order to Major in an interdisciplinary program, students must accumulate 116credit pointsinthreeyears. TheComputer Sciencedepartment oﬀers threeinterdisciplinary Majors ● Computer ScienceandMathematics● Compute

In [None]:
# NOTE(review): this cell has no execution count and the saved output below it
# does not address the question asked -- re-run the cell to refresh the output.
cs_question = "who are the cs representatives?"
ask_rag(cs_question)

'Yes, if a course is cross-listed and fulfills requirements for both your major and minor, it can be double or triple counted, thereby fulfilling major/minor requirements of up to 3 departments. You receive only 4 credits for the course, but it will contribute towards both the major and minor requirements.'