In [1]:
import os
import textwrap
import glob
from pathlib import Path
from IPython.display import Markdown
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain.memory import ConversationBufferWindowMemory
from llama_parse import LlamaParse
# from langchain_openai import OpenAI
from langchain.chains import ConversationChain

os.environ["GROQ_API_KEY"] = "mygroqapi"
os.environ["LLAMA_CLOUD_API_KEY"] = "llamakey"


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# import os.path
# parser = LlamaParse(
#     result_type="markdown",
#     # parsing_instruction=instruction,
#     max_timeout=89000,
# )
# pdf_directory = './data'

# markdown_directory = './data'

# qdrant_db_path = './db'

# all_docs = []

# pdf_files = glob.glob(os.path.join(pdf_directory, '*.pdf'))

In [3]:
markdown_directory = './data'

chroma_db_path = "./chroma_db"


all_docs = []

markdown_files = glob.glob(os.path.join(markdown_directory, '*.md'))

In [None]:
# import asyncio
# import nest_asyncio
# nest_asyncio.apply()
# from httpx import ReadTimeout

# # for 5 tries
# max_retries = 5
# retry_delay = 3  # in seconds

# async def process_pdf(pdf_file):
#     retries = 0
#     while retries < max_retries:
#         try:
#             print(f"Processing file: {pdf_file} (Attempt {retries + 1}/{max_retries})")

#             markdown_content = await parser.aload_data(pdf_file)
#             print(f"Successfully parsed PDF: {pdf_file}")

#             markdown_file = os.path.join(markdown_directory, os.path.basename(pdf_file).replace('.pdf', '.md'))
#             with open(markdown_file, "w", encoding='utf-8') as md_file:  # Changed "a" to "w" to overwrite if exists
#                 md_file.write(markdown_content[0].text)
#             print(f"Markdown file saved: {markdown_file}")

#             loader = UnstructuredMarkdownLoader(markdown_file)
#             loaded_documents = loader.load()
#             print(f"Loaded documents from markdown file: {markdown_file}")

#             text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
#             docs = text_splitter.split_documents(loaded_documents)
#             print(f"Successfully split documents for file: {pdf_file}")

#             return docs

#         except ReadTimeout:
#             print(f"ReadTimeout occurred while processing {pdf_file}. Retrying {retries + 1}/{max_retries}...")
#             retries += 1
#             await asyncio.sleep(retry_delay)
#         except Exception as e:
#             print(f"An error occurred while processing {pdf_file}: {e}")
#             retries += 1
#             await asyncio.sleep(retry_delay)

#     print(f"Failed to process {pdf_file} after {max_retries} retries.")
#     return []

# if asyncio.get_event_loop().is_running():
#     tasks = [process_pdf(pdf_file) for pdf_file in pdf_files]
#     results = await asyncio.gather(*tasks)
# else:
#     tasks = [process_pdf(pdf_file) for pdf_file in pdf_files]
#     results = asyncio.run(asyncio.gather(*tasks))

# for result in results:
#     all_docs.extend(result)

In [5]:
import asyncio
import nest_asyncio
nest_asyncio.apply()
from httpx import ReadTimeout


import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)


# for 5 tries
max_retries = 5
retry_delay = 3  # in seconds


async def process_md(markdown_file):
    retries = 0
    while retries < max_retries:
        try:
            print(f"Processing file: {markdown_file} (Attempt {retries + 1}/{max_retries})")

            loader = UnstructuredMarkdownLoader(markdown_file)
            loaded_documents = loader.load()
            print(f"Loaded documents from markdown file: {markdown_file}")

            text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=30, length_function=tiktoken_len, separators=[' ', ''])
            docs = text_splitter.split_documents(loaded_documents)
            print(f"Successfully split documents for file: {markdown_file}")

            return docs

        except ReadTimeout:
            print(f"ReadTimeout occurred while processing {markdown_file}. Retrying {retries + 1}/{max_retries}...")
            retries += 1
            await asyncio.sleep(retry_delay)
        except Exception as e:
            print(f"An error occurred while processing {markdown_file}: {e}")
            retries += 1
            await asyncio.sleep(retry_delay)

    print(f"Failed to process {markdown_file} after {max_retries} retries.")
    return []

if asyncio.get_event_loop().is_running():
    tasks = [process_md(markdown_file) for markdown_file in markdown_files]
    results = await asyncio.gather(*tasks)
else:
    tasks = [process_md(markdown_file) for markdown_file in markdown_files]
    results = asyncio.run(asyncio.gather(*tasks))

for result in results:
    all_docs.extend(result)

Processing file: ./data\B.Tech Civil Engineering_2023.md (Attempt 1/5)
Loaded documents from markdown file: ./data\B.Tech Civil Engineering_2023.md
Successfully split documents for file: ./data\B.Tech Civil Engineering_2023.md
Processing file: ./data\B.Tech CSE_2023.md (Attempt 1/5)
Loaded documents from markdown file: ./data\B.Tech CSE_2023.md
Successfully split documents for file: ./data\B.Tech CSE_2023.md
Processing file: ./data\B.Tech Mechanical course structure_2023.md (Attempt 1/5)
Loaded documents from markdown file: ./data\B.Tech Mechanical course structure_2023.md
Successfully split documents for file: ./data\B.Tech Mechanical course structure_2023.md
Processing file: ./data\B.TechEngg.Physics_0.md (Attempt 1/5)
Loaded documents from markdown file: ./data\B.TechEngg.Physics_0.md
Successfully split documents for file: ./data\B.TechEngg.Physics_0.md
Processing file: ./data\B.TechinDataScience.md (Attempt 1/5)
Loaded documents from markdown file: ./data\B.TechinDataScience.md
Suc

In [2]:
embeddings = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")

Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]


In [3]:
qdrant = Qdrant.from_documents(
all_docs,
embeddings,
# location=":memory:",
path="./db",
collection_name="document_embeddings",
)
# else:
qdrant = Qdrant.from_existing_collection(
    embeddings,
    # location=":memory:",
    path="./db",
    collection_name="document_embeddings",
    )

In [4]:
retriever = qdrant.as_retriever(search_kwargs={"k": 12})
compressor = FlashrankRerank(model="ms-marco-MiniLM-L-12-v2")
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

In [5]:
llm = ChatGroq(temperature = 0.1, model_name="llama3-70b-8192")

In [6]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

output_parser = StrOutputParser()

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

instruction_to_system = """
formulate it as a detailed standalone question
DO not include the phrase IIT Mandi or "Indian Institute of Technology, Mandi"in your final output question, simply trim that part out of your question
Do NOT answer the question,
for example, If the question is "what is the design club of IIT mandi" your output should be "What is the design club, tell me about its activities." 
just reformulate it if needed and otherwise return it as is.
Focus on the subject of the question more than the object.
"""

question_maker_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", instruction_to_system),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}")
    ]
)

question_chain = question_maker_prompt | llm | StrOutputParser()

In [7]:
from langchain_core.messages import HumanMessage
question_chain.invoke({"question":"are their any more similar clubs?", "chat_history": [HumanMessage(content="tell me about design club of IIT Mandi")]})

'Are there any other clubs similar to a design club, and what kind of activities do they usually engage in?'

In [8]:
qa_system_prompt = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that this is something I am not equipped to advice you on, don't try to make up an answer.

Context: {context}
If no data is available for a given question, say I do not have information about your question currently, for more details visit IIT Mandi official website.

Always try to give, to the point and relevant answers STRICTLY LESS THAN 500 WORDS EVERYTIME.

Try to give all the numbers, facts and figures mentioned in the supplied context but in a human readable, easy to read, PARAGRAPHICAL WELL FORMATTED LAYOUT.

REMEMBER DO NOT GIVE A PLACEHOLDER IF THE CONTEXT HAS NO LINK.

THERE SHOULD BE A CHRONOLOGY and a sense of continuity WITHIN THE RESPONSE. MOST RELEVANT POINTS SHOULD BE AT THE TOP.
Add personlization, give an answer as if you are directly talking to the person.

Try to answer in bullet points wherever necessary.
"""


qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)


In [9]:
def contextualized_question(input: dict):
    return question_chain

In [10]:
from langchain_core.runnables import RunnablePassthrough
retriever_chain = RunnablePassthrough.assign(
        context=contextualized_question | compression_retriever 
    )

In [11]:
retriever_chain.invoke({
    "question": "DSA is of how many credits?" , "chat_history": [HumanMessage(content="")]
    })

Running pairwise ranking..


{'question': 'DSA is of how many credits?',
 'chat_history': [HumanMessage(content='')],
 'context': [Document(page_content="Shortest Paths using Floyd Warshall, Maximum Flow (Ford Fulkerson)\n\nAdvanced Data Structures: Quake heaps, van Emde Boas Trees, Union Find Data structures\n\nComputational complexity: Problem classes: P; NP; NP-complete, NP-hard. Reduction. Cook's theorem; Examples of NP-complete problems\n\nApproximation Algorithms - Greedy and Local Search algorithms; DP Algorithms\n\nChapter 2. Course Structure, Credit System & Evaluation\n\n2.2. Credit System\n\nEach course has a certain number of credit(s) assigned to it depending upon its lecture, tutorial and laboratory/practical contact hours in a week. This weightage is also indicative of the academic expectation that includes in-class contact and self-study outside class hours.\n\nLectures and Tutorials:\n\n1 credit 1 contact-hour per week (50 minutes) 2-3 hours expected self-study time outside class for every contact

In [12]:
rag_chain = (
    retriever_chain
    | qa_prompt
    | llm
    | output_parser
)

In [13]:
chat_history = []

In [14]:
# ENTER YOUR QUESTION HERE
question = """who is the director of IIT Mandi?
"""

ai_msg = rag_chain.invoke({"question": question, "chat_history" : chat_history})
print()
print(ai_msg)
print()
chat_history.clear
chat_history.extend([HumanMessage(content = question), ai_msg])

Running pairwise ranking..

Hello! According to the information I have, the Director of IIT Mandi is Prof. Laxmidhar Behera.



In [23]:
# ENTER YOUR QUESTION HERE
question = """I am joining IIT mandi soon, things to keep in mind.
"""
ai_msg = rag_chain.invoke({"question": question, "chat_history" : chat_history})
print()
print(ai_msg)
print()
chat_history.clear
chat_history.extend([HumanMessage(content = question), ai_msg])

Running pairwise ranking..

Congratulations on joining IIT Mandi! Here are a few things to keep in mind as you start your journey:

* The institute offers a wide range of resources to explore, including academic and non-academic activities, cultural, technical, literary, and sports facilities.
* You'll have access to unique activities like hiking, trekking, and mountain biking.
* Make sure to use every resource carefully and with a sense of responsibility.
* The institute aims to produce all-rounder technocrats with exposure to both academic and non-academic activities.
* You'll find a comprehensive handbook that serves as a guide to the campus, including information on available facilities, hostels, canteens, and the student gymkhana.
* The handbook also includes frequently asked questions, details on many issues you may encounter, and useful contact details of key people.

Additionally, don't forget to explore the various student societies and clubs, such as the Research Council, IEE

In [20]:
# ENTER YOUR QUESTION HERE
question = """how many total credits need to be done in B.tech IIT mandi per semester
"""

ai_msg = rag_chain.invoke({"question": question,"chat_history":  [HumanMessage(content=chat_history[0])]})

print()
print(ai_msg.content)
print()

ValidationError: 2 validation errors for HumanMessage
content
  str type expected (type=type_error.str)
content
  value is not a valid list (type=type_error.list)