# Loading from Google Drive

In [21]:
from llama_index.readers.google import GoogleDriveReader
# Reader that load_data() below uses to fetch documents from Google Drive.
loader = GoogleDriveReader()
# First-run flag for the loading cell: 0 until the initial load has run, then set to 1 there.
k=0

In [22]:
from dotenv import load_dotenv, dotenv_values
# Settings read from the local .env file. Later cells index this with
# 'FOLDER_ID' and 'GOOGLE_API_KEY', so both entries are expected to be present.
config = dotenv_values('.env')


In [23]:
def load_data(folder_id: str):
    """Fetch the documents of one Google Drive folder.

    Args:
        folder_id: Identifier of the Drive folder; forwarded unchanged to
            the module-level ``loader``.

    Returns:
        Whatever ``loader.load_data`` returns for that folder.
    """
    return loader.load_data(folder_id=folder_id)

In [91]:
# Load (or reload) the Drive folder and work out which documents are new.
#
# `k` is the first-run flag:
#   * k == 0 -> first execution: load everything into `docs`. There is no
#     previous load to compare against, so `new_docs` is empty.
#   * k == 1 -> later executions: reload the folder into `all_docs`, collect
#     in `new_docs` the documents whose id_ was not in the previous `docs`,
#     then make the fresh load the new baseline.
if k == 0:
    print("K=0")
    docs = load_data(folder_id=config['FOLDER_ID'])
    # `all_docs` does not exist yet on a fresh kernel, so the comparison must
    # not run in the same execution as the very first load.
    new_docs = []
    k = 1
else:
    print("K=1")
    all_docs = load_data(folder_id=config['FOLDER_ID'])
    old_file_id = {doc.id_ for doc in docs}
    new_file_id = {doc.id_ for doc in all_docs}
    # Ids that appear in exactly one of the two loads (added or removed files).
    unique_file_id = new_file_id.symmetric_difference(old_file_id)
    # Only documents from the fresh load can be returned, so removed ids are
    # ignored here; one pass over `all_docs` replaces the nested loop.
    new_docs = [doc for doc in all_docs if doc.id_ in unique_file_id]
    docs = all_docs

K=1




# Prompt Engineering

In [69]:
from llama_index.core.prompts.prompts import SimpleInputPrompt
# Instruction text for the LLM; it is assigned to `llm.system_prompt` in the
# Google Gemini section. The leading newline and the trailing colon (with no
# final newline) are part of the prompt text.
system_prompt = """
You are a Q&A assistant. Your goal is to answer questions as
accurately as possible based on the instruction and context provided.
We have provided context information below.
Given this information, please answer the question:"""

## Wraps the user's query in <|USER|> ... <|ASSISTANT|> chat markers.
## NOTE(review): this was described as the default Llama2 format, but the LLM
## configured in this notebook is Gemini - confirm this wrapper is still wanted.
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

# Google Gemini

In [70]:
import os

# Copy the API key from the .env settings into the process environment.
# presumably the Gemini client reads GOOGLE_API_KEY from os.environ - TODO confirm
os.environ["GOOGLE_API_KEY"] = config['GOOGLE_API_KEY']

In [71]:
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# LLM used for answering, constructed with its default arguments.
llm = Gemini()
# Attach the prompt pieces defined in the "Prompt Engineering" section.
# NOTE(review): these are set as plain attributes after construction - confirm
# that the Gemini wrapper actually honours them.
llm.system_prompt=system_prompt
llm.query_wrapper_prompt=query_wrapper_prompt
# Embeddings come from a HuggingFace model; GeminiEmbedding is imported above
# but not used in this cell.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [72]:
from llama_index.core import Settings

# Register the LLM and the embedding model on the global Settings object.
Settings.llm = llm
Settings.embed_model = embed_model
# Global chunking values.
# NOTE(review): the SentenceSplitter built for the ingestion pipeline uses
# chunk_overlap=20, not 32 - confirm which value is intended.
Settings.chunk_size=1024
Settings.chunk_overlap=32

In [73]:
# Directory the index is persisted to and reloaded from; the indexing cell
# further down assigns the same value again.
PERSIST_DIR = "./storage"

In [13]:
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
import nest_asyncio
import asyncio

# presumably needed so async LlamaIndex calls can run inside the notebook's
# already-running event loop - TODO confirm
nest_asyncio.apply()

In [116]:
from llama_index.core.extractors import (
    TitleExtractor, QuestionsAnsweredExtractor
)
from llama_index.core.node_parser import TokenTextSplitter, SentenceSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter for the ingestion pipeline: "\n" separator, chunk_size 1024,
# chunk_overlap 20.
text_splitter = SentenceSplitter(separator="\n",
    chunk_size=1024,
    chunk_overlap=20,
)

# Title extraction step; the query results further down carry a
# 'document_title' metadata entry, presumably produced here.
# NOTE(review): nodes=5 presumably caps how many nodes feed the title - confirm.
title_extractor = TitleExtractor(nodes=5)

from llama_index.core.ingestion import IngestionPipeline

# Transformations are listed split-first, then title extraction.
pipeline = IngestionPipeline(transformations=[text_splitter, title_extractor])


In [117]:
# Build the index on the first run, or reload it and add newly found documents.
# Reads `docs` / `new_docs` produced by the loading cell and `pipeline` from
# the ingestion cell.

#Initializing the PERSISTENT DIRECTORY path
PERSIST_DIR = "./storage"
if not os.path.exists(PERSIST_DIR):
    # No persisted index yet: run every loaded document through the pipeline,
    # index the resulting nodes and write the index to PERSIST_DIR.
    nodes = pipeline.run(documents=docs, in_place=True, show_progress=True)
    index = VectorStoreIndex(nodes,show_progress=True)
    index.storage_context.persist(persist_dir=PERSIST_DIR)
    print("Indexing done successfully")
else:
    print("Indexing running successfully")
    # Loading the index from PERSIST_DIR
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
    if new_docs:
        # Only the documents that appeared since the previous load are
        # processed and inserted.
        new_nodes = pipeline.run(documents=new_docs, in_place=True, show_progress=True)
        index.insert_nodes(new_nodes)
        # Write the updated index back; otherwise the inserted nodes exist
        # only in memory and are gone the next time the index is reloaded.
        index.storage_context.persist(persist_dir=PERSIST_DIR)
    else:
        # Nothing new to ingest: skip the pipeline run and the insert.
        new_nodes = []

Indexing running successfully


Parsing nodes: 100%|██████████| 2/2 [00:00<00:00, 991.68it/s]
100%|██████████| 2/2 [00:03<00:00,  1.61s/it]


In [44]:
# Build the index from the pipeline output, persist it, then reload it.
# NOTE(review): `nodes` is only assigned in the first-build branch of the
# indexing cell (when PERSIST_DIR did not exist yet); on any other run this
# cell works only if `nodes` is left over in the kernel.
# Earlier alternatives, kept for reference:
#index = VectorStoreIndex.from_documents(docs, embed_model=embed_model, transformations=[title_extractor, qa_extractor])
#index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)
index = VectorStoreIndex(nodes)
#index = VectorStoreIndex.from_documents(docs, embed_model=embed_model, transformations=[title_extractor])
# persist it for later use
index.storage_context.persist(persist_dir=PERSIST_DIR)

# load the existing index back from PERSIST_DIR
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context)

In [15]:
# Reload the persisted index from PERSIST_DIR (same two calls as the reload
# branch of the indexing cell).
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context)

In [107]:
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x25fd5fda4d0>

In [17]:
# Refresh the index against the currently loaded Drive documents.
# NOTE(review): the recorded result is True for every entry - presumably one
# flag per document meaning it was (re)inserted; confirm. Nothing in this cell
# persists the index afterwards.
refreshed_docs = index.refresh_ref_docs(docs)

In [18]:
refreshed_docs

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

In [118]:
# Query engine over the index, created with default arguments.
query_engine = index.as_query_engine()

In [119]:
# Sample question; the response exposes the answer text plus `source_nodes`
# and `metadata`, which the following cells inspect.
res = query_engine.query("What is Intelligent Document Finder?")

In [120]:
res

Response(response='Intelligent Document Finder is a project that aims to create a seamless, user-friendly platform for uploading and automatically indexing various document formats, including PDFs, PPTs, Word documents, and other forms of unstructured data.', source_nodes=[NodeWithScore(node=TextNode(id_='9ab70df2-22d8-494c-8f76-5b7ee462738e', embedding=None, metadata={'page_label': '2', 'file_name': 'C:\\Users\\promact\\AppData\\Local\\Temp\\tmpew1bm4gn\\Project Task 23rd Feb.pdf', 'file id': '1Qa_DAmJcEsvUiACJkibOk13_8av_XfQ3', 'author': 'Sourav Biswas', 'file name': 'Project Task 23rd Feb.pdf', 'mime type': 'application/pdf', 'created at': '2024-02-29T06:53:19.717Z', 'modified at': '2024-02-26T07:46:27.000Z', 'document_title': 'Intelligent Document Finder with Llama Index: Seamless Document Indexing and Retrieval'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name

In [121]:
res.metadata

{'9ab70df2-22d8-494c-8f76-5b7ee462738e': {'page_label': '2',
  'file_name': 'C:\\Users\\promact\\AppData\\Local\\Temp\\tmpew1bm4gn\\Project Task 23rd Feb.pdf',
  'file id': '1Qa_DAmJcEsvUiACJkibOk13_8av_XfQ3',
  'author': 'Sourav Biswas',
  'file name': 'Project Task 23rd Feb.pdf',
  'mime type': 'application/pdf',
  'created at': '2024-02-29T06:53:19.717Z',
  'modified at': '2024-02-26T07:46:27.000Z',
  'document_title': 'Intelligent Document Finder with Llama Index: Seamless Document Indexing and Retrieval'},
 '20a5030f-9b4e-4667-8d3a-4849d323a736': {'page_label': '2',
  'file_name': 'C:\\Users\\promact\\AppData\\Local\\Temp\\tmpew1bm4gn\\Project Task 23rd Feb.pdf',
  'file id': '1Qa_DAmJcEsvUiACJkibOk13_8av_XfQ3',
  'author': 'Sourav Biswas',
  'file name': 'Project Task 23rd Feb.pdf',
  'mime type': 'application/pdf',
  'created at': '2024-02-29T06:53:19.717Z',
  'modified at': '2024-02-26T07:46:27.000Z',
  'document_title': 'Intelligent Document Finder with Llama Index: Seamless D

In [122]:
# Metadata dict of the first source node of the response.
# NOTE(review): indexing [0] fails if the response has no source nodes.
metadata=res.source_nodes[0].node.metadata

In [123]:
metadata

{'page_label': '2',
 'file_name': 'C:\\Users\\promact\\AppData\\Local\\Temp\\tmpew1bm4gn\\Project Task 23rd Feb.pdf',
 'file id': '1Qa_DAmJcEsvUiACJkibOk13_8av_XfQ3',
 'author': 'Sourav Biswas',
 'file name': 'Project Task 23rd Feb.pdf',
 'mime type': 'application/pdf',
 'created at': '2024-02-29T06:53:19.717Z',
 'modified at': '2024-02-26T07:46:27.000Z',
 'document_title': 'Intelligent Document Finder with Llama Index: Seamless Document Indexing and Retrieval'}

In [124]:
# Print a short citation for the first source node: three labelled fields
# followed by the author on a line of its own (no label).
print("Sources :")
for label, field in (
    ("File Name -", 'file name'),
    ("Title -", 'document_title'),
    ("Page number -", 'page_label'),
):
    print(label, metadata[field])
print(metadata['author'])

Sources :
File Name - Project Task 23rd Feb.pdf
Title - Intelligent Document Finder with Llama Index: Seamless Document Indexing and Retrieval
Page number - 2
Sourav Biswas


In [88]:
type(res.source_nodes[1].node.metadata)

dict

In [54]:
# Look up the first source node's metadata once, then pick out the fields
# used for the citation.
_top_meta = res.source_nodes[0].node.metadata
page = _top_meta['page_label']
filename = _top_meta['file name']
author = _top_meta['author']

In [55]:
print(page,filename)

16 nlp.pdf


In [89]:
# Assemble the citation text: a leading blank line, then one
# "label - value" line per field, each terminated by a newline.
_citation_lines = (
    "File Name - " + metadata['file name'],
    "Title - " + metadata['document_title'],
    "Page number - " + metadata['page_label'],
)
meta_data = "\n" + "\n".join(_citation_lines) + "\n"

In [90]:
print(meta_data)


File Name - nlp.pdf
Title - Natural Language Processing: A Comprehensive Overview
Page number - 16

