step 1: load all documents and create splits/chunks of it(FOR PDF)

In [4]:
from urllib.request import urlretrieve
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import numpy as np

# Load pdf files in the local directory
loader = PyPDFDirectoryLoader("./DataDocs/")

docs_before_split = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 700,
    chunk_overlap  = 50,
)
docs_after_split = text_splitter.split_documents(docs_before_split)

docs_after_split[0]

avg_doc_length = lambda docs: sum([len(doc.page_content) for doc in docs])//len(docs)
avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

print(f'Before split, there were {len(docs_before_split)} documents loaded, with average characters equal to {avg_char_before_split}.')
print(f'After split, there were {len(docs_after_split)} documents (chunks), with average characters equal to {avg_char_after_split} (average chunk length).')


Before split, there were 111 documents loaded, with average characters equal to 2906.
After split, there were 534 documents (chunks), with average characters equal to 603 (average chunk length).


step 1: load all documents and create splits/chunks of it(FOR CSV)

In [5]:
import os
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import numpy as np

# Define a function to load CSV files from a directory
def load_csv_directory(directory):
    data = []
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            file_path = os.path.join(directory, filename)
            df = pd.read_csv(file_path)
            # Convert each row of the dataframe to a document
            for index, row in df.iterrows():
                content = row.to_string()
                # Wrap the content in a Document object
                data.append(Document(page_content=content))
    return data

# Load CSV files in the local directory
directory = "./DataDocs/CSV/"
docs_before_split = load_csv_directory(directory)

# Split documents using RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=50,
)
docs_after_split = text_splitter.split_documents(docs_before_split)

# Function to calculate average document length
avg_doc_length = lambda docs: sum([len(doc.page_content) for doc in docs]) // len(docs)

# Calculate average characters before and after splitting
avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

# Print the results
print(f'Before split, there were {len(docs_before_split)} documents loaded, with average characters equal to {avg_char_before_split}.')
print(f'After split, there were {len(docs_after_split)} documents (chunks), with average characters equal to {avg_char_after_split} (average chunk length).')

# Display the first split document
print(docs_after_split[0].page_content)


Before split, there were 4350 documents loaded, with average characters equal to 106.
After split, there were 4350 documents (chunks), with average characters equal to 106 (average chunk length).
L D College of Engineering, Ahmedabad    Placement Status of Batch 2018-19


Step 2: Let's create embeddings of that chunks and store it in vecotor store

In [6]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="thenlper/gte-large",  # alternatively use "sentence-transformers/all-MiniLM-l6-v2" for a light and faster experience.
    model_kwargs={'device':'cpu'}, 
    encode_kwargs={'normalize_embeddings': True}
)

sample_embedding = np.array(huggingface_embeddings.embed_query(docs_after_split[0].page_content))
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

vectorstore = FAISS.from_documents(docs_after_split, huggingface_embeddings)


  from tqdm.autonotebook import tqdm, trange
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Check and find data within vector store

In [60]:
query = """200280116082 IT 2024 RIVEREDGE"""  
         # Sample question, change to other questions you are interested in.
relevant_documents = vectorstore.similarity_search(query)
print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')
print(relevant_documents[0].page_content)

There are 4 documents retrieved which are relevant to the query. Display the first one:

Branch Of Engineering             Year 2023 Year 2022 Year 2021 Year 2020 Year 2019 Year 2018 Year 2017 Year 2016   Branchwise Total Selection    Information Technology Engineering        128 ...


Now Let Make llm with help of HuggingFaceHub llms

In [56]:
import os
os.environ['HUGGINGFACEHUB_API_TOKEN'] = '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'

In [61]:
# Use similarity searching algorithm and return 3 most relevant documents.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

from langchain_community.llms import HuggingFaceHub

llm = HuggingFaceHub(
    repo_id="tiiuae/falcon-7b-instruct",
    model_kwargs={"temperature":0.1, "max_length":500}
)

query = """According to Placement Data 2024 RIDHAM CHAUHAN Placed in which company?"""
# llm.invoke(query)

prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

{context}

Question: {question}

Helpful Answer:
"""

PROMPT = PromptTemplate(
 template=prompt_template, input_variables=["context", "question"]
)

retrievalQA = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)

# Call the QA chain with our query.
result = retrievalQA.invoke({"query": query})
print(result['result'])

Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

L.D. COLLEGE OF ENGINEERING-BATCH 2024    181   200280116082     CHAUHAN RIDHAM VIJAYKUM...

L.D. COLLEGE OF ENGINEERING-BATCH 2024    380   200280119054          CHAUHAN MEET MANOJ...

L.D. COLLEGE OF ENGINEERING-BATCH 2024    174   200280116010     CHAUHAN VAIBHAV PRABHUB...

Question: According to Placement Data 2024 RIDHAM CHAUHAN Placed in which company?

Helpful Answer:

Chauhan, Vaibhav, and Manoj are placed in different companies. Vaibhav is placed in TCS, Manoj is placed in Infosys, and Chauhan is placed in L&T.
