# Loading from Google Drive

In [1]:
from llama_index.readers.google import GoogleDriveReader
# Shared reader instance; load_data() below calls loader.load_data(folder_id=...).
loader = GoogleDriveReader()

  from .autonotebook import tqdm as notebook_tqdm


In [161]:
def load_data(folder_id: str, reader=None):
    """Load the documents a reader returns for one Google Drive folder.

    Args:
        folder_id: ID of the Google Drive folder to read.
        reader: Optional object exposing ``load_data(folder_id=...)``.
            When omitted, the notebook-level ``loader`` is used, so
            existing ``load_data(folder_id=...)`` calls behave as before.

    Returns:
        Whatever ``reader.load_data`` returns, passed through unchanged.
    """
    # Resolve the default lazily: the global is only needed when no reader
    # is supplied, which keeps this helper usable (and testable) on its own.
    active_reader = loader if reader is None else reader
    return active_reader.load_data(folder_id=folder_id)

In [162]:
from dotenv import load_dotenv, dotenv_values
# Settings read from the local .env file; later cells look up
# FOLDER_ID and GOOGLE_API_KEY in this mapping.
config = dotenv_values('.env')


In [163]:
# Sanity check: show the Drive folder id that was read from .env.
# NOTE(review): the displayed value is saved with the notebook; clear this
# output before sharing if the folder id should stay private.
config['FOLDER_ID']

'1cxqK_bHH5qdUrWSP8nx1s6_EwtFhcFRa'

In [164]:
# Load the documents from the Drive folder configured in .env.
docs = load_data(folder_id=config['FOLDER_ID'])



In [165]:
# Peek at the text of the first loaded document.
docs[0].text

'Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.comNoam Shazeer∗\nGoogle Brain\nnoam@google.comNiki Parmar∗\nGoogle Research\nnikip@google.comJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.comAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.eduŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to\nbe superior in quality while being more parallelizable and requiring signiﬁcantly\nless tim

In [166]:
# Display the loaded documents.
# NOTE(review): this prints the repr of every document, full text included;
# consider showing len(docs) or docs[:1] to keep the saved output small.
docs

[Document(id_='1EjdirSpyG7l2PLn81lLMlgQW3Brqd_NX', embedding=None, metadata={'page_label': '1', 'file_name': 'C:\\Users\\promact\\AppData\\Local\\Temp\\tmpu4ozyjy9\\attention.pdf', 'file id': '1EjdirSpyG7l2PLn81lLMlgQW3Brqd_NX', 'author': 'Sourav Biswas', 'file name': 'attention.pdf', 'mime type': 'application/pdf', 'created at': '2024-02-23T04:50:29.772Z', 'modified at': '2024-02-20T08:50:12.000Z'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.comNoam Shazeer∗\nGoogle Brain\nnoam@google.comNiki Parmar∗\nGoogle Research\nnikip@google.comJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.comAidan N. Gomez∗†\nUniversity of Toronto\naidan

# Prompt Engineering

In [178]:
from llama_index.core.prompts.prompts import SimpleInputPrompt
# System instruction for the LLM (assigned to llm.system_prompt in a later cell).
system_prompt = """
You are a Q&A assistant. Your goal is to answer questions as
accurately as possible based on the instruction and context provided
"""

# Wraps each user query in <|USER|> ... <|ASSISTANT|> tags.
# NOTE(review): this was originally labelled "default format supported by
# Llama2"; confirm the tags are still wanted now that the LLM is Gemini.
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

# Google Gemini

In [179]:
import os

# Copy the API key from .env into the process environment instead of
# hardcoding it (presumably where the Gemini client looks for it; TODO confirm).
os.environ["GOOGLE_API_KEY"] = config['GOOGLE_API_KEY']

In [180]:
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Gemini LLM built with default constructor arguments (presumably it picks up
# the GOOGLE_API_KEY environment variable set earlier; TODO confirm).
llm = Gemini()
llm.system_prompt = system_prompt
llm.query_wrapper_prompt = query_wrapper_prompt

# Embeddings come from the Hugging Face model named below.
# An earlier draft had GeminiEmbedding(model_name="models/embedding-001")
# commented out as an alternative.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [181]:
from llama_index.core import Settings

# Register the LLM and the embedding model on the global Settings object.
Settings.llm = llm
Settings.embed_model = embed_model
# Chunk size used when splitting documents (presumably in tokens; TODO confirm).
Settings.chunk_size=1024

In [182]:
# Directory the vector index is persisted to and later reloaded from.
PERSIST_DIR = "./storage"

In [199]:
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
import asyncio

In [202]:
# Build the index only when no persisted copy exists; otherwise reuse the
# copy on disk so the documents are not re-embedded on every run.
# NOTE: delete PERSIST_DIR to force a rebuild after the Drive folder changes.
if not os.path.exists(PERSIST_DIR):
    # Load the documents and create the index.
    index = VectorStoreIndex.from_documents(docs, embed_model=embed_model)
    # Store it for later.
    index.storage_context.persist(persist_dir=PERSIST_DIR)
    # Keep the name defined on both paths, as it was before this guard existed.
    storage_context = index.storage_context
else:
    # Load the existing index.
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

In [185]:
# Confirm the index object exists by showing its repr.
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x173849d3f90>

In [186]:
# Query engine over the index, created with default options.
query_engine = index.as_query_engine()

In [187]:
# Ask a sample question; later cells read the response's source_nodes.
res = query_engine.query("What is Natural Language Processing?")

In [190]:
# Metadata of the first source node behind the answer. Fall back to an empty
# dict when the response has no source nodes, so the `if metadata:` check that
# follows reports "No metadata found" instead of this line raising IndexError.
metadata = res.source_nodes[0].node.metadata if res.source_nodes else {}

In [192]:
# Build a one-line provenance note for the answer from the source metadata.
if metadata:
    # A single f-string avoids the stray double space the old concatenation
    # produced ("from  page"); .get() keeps the note printable when a key is
    # absent from the metadata.
    meta_data = (
        f"The above answer has been extracted from page "
        f"{metadata.get('page_label', 'unknown')} of "
        f"{metadata.get('file name', 'unknown')} file which is uploaded in the "
        f"Google Drive folder of '{metadata.get('author', 'unknown')}' Google Account"
    )
else:
    meta_data = "No metadata found"
print(meta_data)

The above answer has been extracted from  page 16 of nlp.pdf file which is uploaded in the Google Drive folder of 'Sourav Biswas' Google Account


In [142]:
# Debug check: type of the second source node's metadata.
# NOTE(review): indexes [1], so this assumes at least two source nodes came back.
type(res.source_nodes[1].node.metadata)

dict

In [137]:
# Pull the page label, file name and author of the first source node
# in a single unpacking step.
page, filename, author = (
    res.source_nodes[0].node.metadata[field]
    for field in ('page_label', 'file name', 'author')
)

In [138]:
# Show the page label and file name extracted from the first source node.
print(page,filename)

16 nlp.pdf
