<a href="https://colab.research.google.com/github/shazhao57/Retrieve-Emails-Llama-index/blob/main/Vector_databases_Retrieve_email_using_secret_manager.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pip Installation

In [None]:
!pip install llama-index

In [None]:
!pip install google-cloud-secret-manager

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Retrieve emails

In [3]:
from google.colab import auth
# NOTE(review): `drive` is imported but never used in the visible cells (the
# drive.mount call in the cell above is commented out).
from google.colab import drive

# Authenticate the Colab user before any Google client is created.
# Presumably this is what lets the Secret Manager clients in later cells pick
# up credentials — TODO confirm against the Colab auth documentation.
auth.authenticate_user()

In [4]:
from google.cloud import secretmanager_v1 as secretmanager
from google.auth import default as google_auth

# Google API client library; GmailReader uses it to build the Gmail service
import googleapiclient.discovery
import googleapiclient.errors



In [5]:
"""Google Mail reader."""
import email
from typing import Any, List
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
from pydantic import BaseModel
import base64

SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]


class GmailReader(BaseReader, BaseModel):
    """Gmail reader.

    Reads the emails matching a Gmail search query and returns one
    ``Document`` per message.

    Args:
        query (str): Gmail query. Defaults to None.
        use_iterative_parser (bool): If True, walk the MIME tree and keep only
            the ``text/plain`` parts; otherwise return the whole raw message
            (headers included) as text. Defaults to False.
        max_results (int): Max number of results. Defaults to 10.
        service (Any): Pre-built Gmail API resource. When left as None, one is
            built in ``load_data`` from the token stored in Secret Manager.
        secret_name (str): Secret Manager secret holding the authorized-user
            token JSON. Defaults to 'token'.
        project_id (str): Google Cloud project that owns the secret.
            Defaults to 'vivid-kite-315522'.
    """
    query: str = None
    use_iterative_parser: bool = False
    max_results: int = 10
    # Explicit default so the field is optional regardless of how the
    # installed pydantic version treats a bare ``Any`` annotation.
    service: Any = None
    secret_name: str = 'token'
    project_id: str = 'vivid-kite-315522'

    def load_data(
        self
    ) -> List[Document]:
        """Load emails from the user's account.

        Returns:
            List[Document]: one document per message that has a non-empty body.

        Raises:
            Exception: if fetching or parsing any message fails.
        """
        # Only touch Secret Manager when no Gmail service was supplied.
        if not self.service:
            self.service = self._build_service()

        return [Document(message['body']) for message in self.search_messages()]

    def _build_service(self) -> Any:
        """Build a Gmail API resource from the token stored in Secret Manager."""
        import json
        from google.oauth2.credentials import Credentials
        from googleapiclient.discovery import build

        # The secret payload is the authorized-user token as a JSON string.
        credentials_info = json.loads(self._get_credentials())
        credentials = Credentials.from_authorized_user_info(credentials_info)
        return build('gmail', 'v1', credentials=credentials)

    def _get_credentials(self) -> Any:
        """Return the latest version of the configured secret, decoded as UTF-8."""
        client = secretmanager.SecretManagerServiceClient()
        secret_path = (
            f"projects/{self.project_id}/secrets/{self.secret_name}/versions/latest"
        )
        response = client.access_secret_version(request={"name": secret_path})
        return response.payload.data.decode("UTF-8")

    def search_messages(self):
        """List the messages matching ``self.query`` and fetch each one's body.

        Returns:
            list[dict]: ``{"body": <text>}`` for every message with a non-empty body.

        Raises:
            Exception: if any message cannot be fetched or parsed; the original
                error is kept as ``__cause__``.
        """
        messages = self.service.users().messages().list(
            userId='me',
            q=self.query,
            maxResults=int(self.max_results)
        ).execute().get('messages', [])

        result = []
        try:
            for message in messages:
                message_data = self.get_message_data(message)
                if not message_data:
                    continue
                result.append(message_data)
        except Exception as e:
            raise Exception("Can't get message data" + str(e)) from e

        return result

    def get_message_data(self, message):
        """Fetch one message in raw format and extract its text.

        Args:
            message (dict): an entry of the list response; only ``'id'`` is read.

        Returns:
            dict | None: ``{"body": <text>}``, or None when the extracted text is empty.
        """
        message_data = self.service.users().messages().get(
            format="raw",
            userId='me',
            id=message['id']).execute()

        if self.use_iterative_parser:
            body = self.extract_message_body_iterative(message_data)
        else:
            body = self.extract_message_body(message_data)

        if not body:
            return None

        return {
            "body": body,
        }

    def extract_message_body_iterative(self, message: dict):
        """Concatenate the ``text/plain`` parts of a message.

        Args:
            message: either the API response dict (its ``'raw'`` entry is the
                base64url-encoded message) or, when recursing, an already
                parsed MIME part.

        Returns:
            str: the decoded plain-text content; '' when there is none.
        """
        from email.message import Message

        # Distinguish by type: indexing a MIME part with ['raw'] would be a
        # header lookup, not a check for the API payload.
        if isinstance(message, Message):
            mime_msg = message
        else:
            raw_bytes = base64.urlsafe_b64decode(message['raw'].encode('utf8'))
            mime_msg = email.message_from_bytes(raw_bytes)

        if mime_msg.get_content_type() == 'text/plain':
            payload = mime_msg.get_payload(decode=True) or b''
            charset = mime_msg.get_content_charset('utf-8')
            try:
                return payload.decode(charset, errors='replace')
            except LookupError:
                # The declared charset is unknown to Python; fall back to UTF-8.
                return payload.decode('utf-8', errors='replace')

        if mime_msg.get_content_maintype() == 'multipart':
            return ''.join(
                self.extract_message_body_iterative(part)
                for part in mime_msg.get_payload()
            )

        return ''

    def extract_message_body(self, message: dict):
        """Return the whole raw message (headers and payload) as text.

        When the message has a ``text/html`` part, markup is stripped from the
        payload with BeautifulSoup. The header block is left untouched so that
        angle-bracketed values such as ``<user@example.com>`` are not mistaken
        for tags and dropped.

        Args:
            message (dict): API response whose ``'raw'`` entry is the
                base64url-encoded message.

        Returns:
            str: the message text; undecodable bytes are replaced, not fatal.

        Raises:
            Exception: if the message cannot be decoded or parsed; the original
                error is kept as ``__cause__``.
        """
        import re
        from bs4 import BeautifulSoup
        try:
            raw_bytes = base64.urlsafe_b64decode(message['raw'].encode('ASCII'))
            mime_msg = email.message_from_bytes(raw_bytes)
            text = raw_bytes.decode("utf-8", errors="replace")

            # `'text/html' in mime_msg` would test header names; inspect the
            # content type of every part instead.
            has_html = any(
                part.get_content_type() == 'text/html' for part in mime_msg.walk()
            )
            if has_html:
                # The first blank line separates the headers from the payload.
                separator = re.search(r"\r?\n\r?\n", text)
                split_at = separator.end() if separator else len(text)
                headers, payload = text[:split_at], text[split_at:]
                text = headers + BeautifulSoup(payload, 'html.parser').get_text()
            return text
        except Exception as e:
            raise Exception("Can't parse message body" + str(e)) from e



# Store emails in a vector database and run a query

In [6]:

import openai
import os
import json

from google.cloud import secretmanager

# Read the OpenAI API key from Secret Manager rather than hard-coding it.
client1 = secretmanager.SecretManagerServiceClient()
name1 = "projects/339902708114/secrets/openai_api_key/versions/latest"
response1 = client1.access_secret_version(request={"name": name1})

# Secrets created from a file or a shell pipe often end with a newline;
# strip it so the key is not rejected when sent to the API.
api_key1 = response1.payload.data.decode("UTF-8").strip()

# Expose the key both to the openai module and to libraries that read the
# OPENAI_API_KEY environment variable.
openai.api_key = api_key1
os.environ["OPENAI_API_KEY"] = api_key1

from llama_index import VectorStoreIndex
from langchain.embeddings.huggingface import HuggingFaceInstructEmbeddings
from llama_index import LangchainEmbedding,ServiceContext

if __name__ == "__main__":
    # Load the emails matching the Gmail query (GmailReader is defined in an
    # earlier cell) and index them.
    reader = GmailReader(query="from:me after:2023-06-14")
    documents = reader.load_data()
    index = VectorStoreIndex.from_documents(documents)

    print(index)

    # Ask a question against the indexed emails.
    query_engine = index.as_query_engine()
    response = query_engine.query("What is the subject of the recent email")
    print(response)

<llama_index.indices.vector_store.base.VectorStoreIndex object at 0x7f49981c1000>

The subject of the recent email is "Learning LlamaIndex".


# Another way to store the emails in a vector database and run a query

In [None]:
!pip install sentence_transformers

In [None]:
!pip install InstructorEmbedding

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting InstructorEmbedding
  Downloading InstructorEmbedding-1.0.1-py2.py3-none-any.whl (19 kB)
Installing collected packages: InstructorEmbedding
Successfully installed InstructorEmbedding-1.0.1


In [None]:
# This cell uses `os` and `openai`; import them here so it does not depend on
# another cell having been run first.
import os

import openai
from google.cloud import secretmanager

# Read the OpenAI API key from Secret Manager rather than hard-coding it.
client1 = secretmanager.SecretManagerServiceClient()
name1 = "projects/339902708114/secrets/openai_api_key/versions/latest"
response1 = client1.access_secret_version(request={"name": name1})

# Secrets created from a file or a shell pipe often end with a newline;
# strip it so the key is not rejected when sent to the API.
api_key1 = response1.payload.data.decode("UTF-8").strip()

# Expose the key both to the openai module and to libraries that read the
# OPENAI_API_KEY environment variable.
openai.api_key = api_key1
os.environ["OPENAI_API_KEY"] = api_key1

In [None]:
import openai
import os
import json

from llama_index import VectorStoreIndex

from langchain.embeddings.huggingface import HuggingFaceInstructEmbeddings
from llama_index import LangchainEmbedding, ServiceContext, VectorStoreIndex

# Same load -> index -> query flow as the previous section, except that the
# index is built with an explicit service context carrying a HuggingFace
# Instructor embedding model.
# NOTE(review): depends on GmailReader from an earlier cell, and presumably on
# the OPENAI_API_KEY set by the previous cell for answering the query — confirm.
if __name__ == "__main__":
    # Wrap the LangChain embedding class so LlamaIndex can use it
    embed_model = LangchainEmbedding(HuggingFaceInstructEmbeddings())

    # Service context that carries the custom embedding model
    service_context = ServiceContext.from_defaults(embed_model=embed_model)

    # Load the emails matching the Gmail query
    reader = GmailReader(query="from:me after:2023-06-14")
    documents = reader.load_data()

    # Build the vector index using the custom service context
    index = VectorStoreIndex.from_documents(documents, service_context=service_context)

    # Print the index object, then query it
    print(index)
    query_engine = index.as_query_engine()
    response = query_engine.query("What is the subject of the recent email")
    print(response)


load INSTRUCTOR_Transformer
max_seq_length  512
<llama_index.indices.vector_store.base.VectorStoreIndex object at 0x7f474acf5810>

The subject of the recent email is "Learning LlamaIndex".


# Check that OpenAI works

In [None]:
from langchain.llms import OpenAI
# Smoke test: send a single prompt through the LangChain OpenAI wrapper to
# check that the API key set in an earlier cell is accepted.
# NOTE(review): the recorded output below is '39,0625', whereas 5**8 is 390625,
# so treat this as a connectivity check rather than a correct answer.
llm= OpenAI(model_name="gpt-3.5-turbo")
llm("What's 5 to the 8th power")  # the cell's last expression, so the reply is displayed

'39,0625'