# Loading from Google Drive

In [1]:
from llama_index.readers.google import GoogleDriveReader
loader = GoogleDriveReader()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv, dotenv_values
config = dotenv_values('.env')


In [6]:
docs = loader.load_data(mime_types=["application/pdf"])

An error occurred while loading with fileid: 'NoneType' object is not iterable


In [7]:
def load_data(folder_id: str):
    docs = loader.load_data(folder_id=folder_id)
    return docs

In [9]:
docs = load_data(folder_id=config['FOLDER_ID'])

KeyError: 'FOLDER_ID'

In [91]:
if k==0:
    print("K=0")
    docs = load_data(folder_id=config['FOLDER_ID'])
    k=1
else:
    print("K=1")
    all_docs = load_data(folder_id=config['FOLDER_ID'])
if k==1:
    old_file_id = set()
    for i in docs:
        old_file_id.add(i.id_)
    new_file_id = set()
    for i in all_docs:
        new_file_id.add(i.id_)
    unique_file_id = new_file_id.symmetric_difference(old_file_id)
    new_docs = []
    for i in unique_file_id:
        for j in all_docs:
            if i == j.id_:
                new_docs.append(j)
    docs = all_docs

K=1




# Prompt Engineering

In [69]:
from llama_index.core.prompts.prompts import SimpleInputPrompt
system_prompt = """
You are a Q&A assistant. Your goal is to answer questions as
accurately as posssible based on the instruction and context provided.
We have provided context information below.
Given this information, please answer the question:"""

## Default format supported by Llama2
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

# Google Gemini

In [70]:
import os

os.environ["GOOGLE_API_KEY"] = config['GOOGLE_API_KEY']

In [71]:
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
llm = Gemini()
llm.system_prompt=system_prompt
llm.query_wrapper_prompt=query_wrapper_prompt
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [132]:
new_data = ["a","b"]
if new_data:
    print("Correct")

Correct


In [72]:
from llama_index.core import Settings

Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size=1024
Settings.chunk_overlap=32

In [73]:
PERSIST_DIR = "./storage"

In [13]:
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
import nest_asyncio
import asyncio

nest_asyncio.apply()

In [116]:
from llama_index.core.extractors import (
    TitleExtractor, QuestionsAnsweredExtractor
)
from llama_index.core.node_parser import TokenTextSplitter, SentenceSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = SentenceSplitter(separator="\n",
    chunk_size=1024,
    chunk_overlap=20,
)

title_extractor = TitleExtractor(nodes=5)

from llama_index.core.ingestion import IngestionPipeline

pipeline = IngestionPipeline(transformations=[text_splitter, title_extractor])


In [117]:
#Initializing the PERSISTENT DIRECTORY path
PERSIST_DIR = "./storage"
#Conditional statements to check if the Directory exists or not
if not os.path.exists(PERSIST_DIR):
    # Converting the nodes into indexes
    nodes = pipeline.run(documents=docs, in_place=True, show_progress=True)
    index = VectorStoreIndex(nodes,show_progress=True)
    # If Directory does not exist then create one and store the index
    index.storage_context.persist(persist_dir=PERSIST_DIR)
    print("Indexing done successfully")
else:
    # Reloading the index. If any new file gets uploaded in the Google Drive Folder then the file can be indexed
    # index = VectorStoreIndex(nodes, show_progress=True)
    #storing the reloaded index
    # index.storage_context.persist(persist_dir=PERSIST_DIR)
    print("Indexing running successfully")
    # Loading the index from PERSIST_DIR
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
    new_nodes = pipeline.run(documents=new_docs, in_place=True, show_progress=True)
    index.insert_nodes(new_nodes)

Indexing running successfully


Parsing nodes: 100%|██████████| 2/2 [00:00<00:00, 991.68it/s]
100%|██████████| 2/2 [00:03<00:00,  1.61s/it]


In [44]:
#load the document and create the index
#index = VectorStoreIndex.from_documents(docs, embed_model=embed_model, transformations=[title_extractor, qa_extractor])
#index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)
index = VectorStoreIndex(nodes)
#index = VectorStoreIndex.from_documents(docs, embed_model=embed_model, transformations=[title_extractor])
#store for it for later
index.storage_context.persist(persist_dir=PERSIST_DIR)

#load the existing index
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context)

In [15]:
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context)

In [107]:
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x25fd5fda4d0>

In [14]:
a= set()
b= set()
a.add(1)
a.add(2)
b.add(3)
b.add(4)
b.add(5)
unique_file=a.symmetric_difference(b)

In [15]:
a

{1, 2}

In [16]:
b

{3, 4, 5}

In [20]:
a=a.union(b)

In [18]:
b

{3, 4, 5}

In [21]:
a

{1, 2, 3, 4, 5}

In [2]:
unique_file

{3}

In [None]:
output.

In [118]:
query_engine = index.as_query_engine()

In [125]:
res = query_engine.query("What is YOLO?")

In [126]:
res

Response(response='YOLO is a unified, real-time object detection system.', source_nodes=[NodeWithScore(node=TextNode(id_='4f325ba6-8207-4b00-bf85-52e03e9063ff', embedding=None, metadata={'page_label': '10', 'file_name': 'C:\\Users\\promact\\AppData\\Local\\Temp\\tmpsoo7vp9v\\yolo.pdf', 'file id': '1WSP5WhoMzEhPSKuwcIGDAmyxcpbfzOWZ', 'author': 'Sourav Biswas', 'file name': 'yolo.pdf', 'mime type': 'application/pdf', 'created at': '2024-02-23T04:50:31.243Z', 'modified at': '2024-02-20T08:52:29.000Z', 'document_title': 'YOLO: A Unified, Real-Time Object Detection System'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='1WSP5WhoMzEhPSKuwcIGDAmyxcpbfzOWZ', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_l

In [121]:
res.metadata

{'9ab70df2-22d8-494c-8f76-5b7ee462738e': {'page_label': '2',
  'file_name': 'C:\\Users\\promact\\AppData\\Local\\Temp\\tmpew1bm4gn\\Project Task 23rd Feb.pdf',
  'file id': '1Qa_DAmJcEsvUiACJkibOk13_8av_XfQ3',
  'author': 'Sourav Biswas',
  'file name': 'Project Task 23rd Feb.pdf',
  'mime type': 'application/pdf',
  'created at': '2024-02-29T06:53:19.717Z',
  'modified at': '2024-02-26T07:46:27.000Z',
  'document_title': 'Intelligent Document Finder with Llama Index: Seamless Document Indexing and Retrieval'},
 '20a5030f-9b4e-4667-8d3a-4849d323a736': {'page_label': '2',
  'file_name': 'C:\\Users\\promact\\AppData\\Local\\Temp\\tmpew1bm4gn\\Project Task 23rd Feb.pdf',
  'file id': '1Qa_DAmJcEsvUiACJkibOk13_8av_XfQ3',
  'author': 'Sourav Biswas',
  'file name': 'Project Task 23rd Feb.pdf',
  'mime type': 'application/pdf',
  'created at': '2024-02-29T06:53:19.717Z',
  'modified at': '2024-02-26T07:46:27.000Z',
  'document_title': 'Intelligent Document Finder with Llama Index: Seamless D

In [122]:
metadata=res.source_nodes[0].node.metadata

In [123]:
metadata

{'page_label': '2',
 'file_name': 'C:\\Users\\promact\\AppData\\Local\\Temp\\tmpew1bm4gn\\Project Task 23rd Feb.pdf',
 'file id': '1Qa_DAmJcEsvUiACJkibOk13_8av_XfQ3',
 'author': 'Sourav Biswas',
 'file name': 'Project Task 23rd Feb.pdf',
 'mime type': 'application/pdf',
 'created at': '2024-02-29T06:53:19.717Z',
 'modified at': '2024-02-26T07:46:27.000Z',
 'document_title': 'Intelligent Document Finder with Llama Index: Seamless Document Indexing and Retrieval'}

In [124]:
print("Sources :")
print("File Name -",metadata['file name'])
print("Title -",metadata['document_title'])
print("Page number -",metadata['page_label'])
print(metadata['author'])

Sources :
File Name - Project Task 23rd Feb.pdf
Title - Intelligent Document Finder with Llama Index: Seamless Document Indexing and Retrieval
Page number - 2
Sourav Biswas


In [88]:
type(res.source_nodes[1].node.metadata)

dict

In [54]:
page = res.source_nodes[0].node.metadata['page_label']
filename = res.source_nodes[0].node.metadata['file name']
author = res.source_nodes[0].node.metadata['author']

In [55]:
print(page,filename)

16 nlp.pdf


In [89]:
meta_data="\n"
meta_data=meta_data+"File Name - "+metadata['file name']+"\n"
meta_data=meta_data+"Title - "+metadata['document_title']+"\n"
meta_data=meta_data+"Page number - "+metadata['page_label']+"\n"

In [90]:
print(meta_data)


File Name - nlp.pdf
Title - Natural Language Processing: A Comprehensive Overview
Page number - 16



In [2]:
import os
os.getenv("API")

# OneDrive Integration

In [10]:
pip install llama-index-readers-microsoft-onedrive

Collecting llama-index-readers-microsoft-onedriveNote: you may need to restart the kernel to use updated packages.

  Downloading llama_index_readers_microsoft_onedrive-0.1.3-py3-none-any.whl.metadata (7.2 kB)
Collecting msal<2.0.0,>=1.26.0 (from llama-index-readers-microsoft-onedrive)
  Downloading msal-1.27.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting cryptography<45,>=0.6 (from msal<2.0.0,>=1.26.0->llama-index-readers-microsoft-onedrive)
  Using cached cryptography-42.0.5-cp39-abi3-win_amd64.whl.metadata (5.4 kB)
Collecting cffi>=1.12 (from cryptography<45,>=0.6->msal<2.0.0,>=1.26.0->llama-index-readers-microsoft-onedrive)
  Using cached cffi-1.16.0-cp311-cp311-win_amd64.whl.metadata (1.5 kB)
Collecting pycparser (from cffi>=1.12->cryptography<45,>=0.6->msal<2.0.0,>=1.26.0->llama-index-readers-microsoft-onedrive)
  Using cached pycparser-2.21-py2.py3-none-any.whl.metadata (1.1 kB)
Downloading llama_index_readers_microsoft_onedrive-0.1.3-py3-none-any.whl (9.0 kB)
Downloading ms

In [47]:
from llama_index.readers.microsoft_onedrive import OneDriveReader

# User Authentication flow: Replace client id with your own id
loader = OneDriveReader(client_id="180534ca-f401-430c-8ba7-7fdd9dc56f56", tenant_id="e308fb86-a4c4-424f-9ca0-d94b1d9424e2", client_secret="4Ey8Q~kMVUUqwdQoPbjCbmND_J6WmWtfq2EqjbWb")

In [52]:
documents = loader.load_data(userprincipalname="souravbiswas19april@outlook.com")

An error occurred while loading the data: API request to download root failed with status code: 404, message: b'{"error":{"code":"ResourceNotFound","message":"User not found","innerError":{"date":"2024-03-19T06:45:40","request-id":"50b81f7d-9da2-4d24-99e0-68b40b128c3f","client-request-id":"50b81f7d-9da2-4d24-99e0-68b40b128c3f"}}}'
Traceback (most recent call last):
  File "d:\SOURAV\GenAI\IDF\Intelligent-Document-Finder-with-LlamaIndex\myenv\Lib\site-packages\llama_index\readers\microsoft_onedrive\base.py", line 493, in load_data
    self._downloaded_files_metadata = self._init_download_and_get_metadata(
                                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\SOURAV\GenAI\IDF\Intelligent-Document-Finder-with-LlamaIndex\myenv\Lib\site-packages\llama_index\readers\microsoft_onedrive\base.py", line 431, in _init_download_and_get_metadata
    root_folder_metadata = self._connect_download_and_return_metadata(
                           ^^^^^^^^^^^^^^^^^^^^^^^^

In [53]:
from llama_index.readers.microsoft_onedrive import OneDriveReader

# User Authentication flow: Replace client id with your own id
loader = OneDriveReader(client_id="82ee706e-2439-47fa-877a-95048ead9318")

# APP Authentication flow: NOT SUPPORTED By Microsoft

#### Get all documents including subfolders.
documents = loader.load_data()

In [61]:

from llama_index.readers.microsoft_onedrive import OneDriveReader

# User Authentication flow: Replace client id with your own id
loader = OneDriveReader(client_id="c876be5f-b00b-4755-9806-62635a04196d")


In [62]:

# APP Authentication flow: NOT SUPPORTED By Microsoft

#### Get all documents including subfolders.
documents = loader.load_data()

In [64]:
documents[0]

Document(id_='cf9a9637-415a-490f-8779-0d6b70c60b6e', embedding=None, metadata={'page_label': '1', 'file_name': 'nlp.pdf', 'file_id': '9A1D95F4E1C16B76!7801', 'created_by_user': 'Sourav Biswas', 'created_by_app': 'OneDrive', 'created_dateTime': '2024-03-18T07:32:23.15Z', 'last_modified_by_user': 'Sourav Biswas', 'last_modified_by_app': 'OneDrive', 'last_modified_datetime': '2024-03-18T20:57:53.18Z'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Natural Language Processing        \n   1 \n \nLanguage is a method of communication with the help of which we can  speak, read and \nwrite. For example, we think, we make decisions, plans and more in natural language ; \nprecisely,  in words. However,  the big question that confronts us in this AI era is that c

# Index Loading

In [4]:
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage, Settings

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [6]:
def load_index():
        PERSIST_DIR = "./storage"
        Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
        Settings.chunk_size = 1024
        print("Index Loading Started...") # print statement before fetching Index
        # Loading the index from PERSIST_DIR
        storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
        index = load_index_from_storage(storage_context)
        print("Index Loading Done")
        return index

In [9]:
index = load_index()

Index Loading Started...
Index Loading Done
