In [1]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [2]:
import sys, os

print(f"Installing packages into environment {sys.executable}")

Installing packages into environment c:\anaconda3\envs\cogsearch02\python.exe


In [None]:
# !{sys.executable} -m pip install llama-index openai langchain azure-identity

# End to end process to create a download and index a single document

In [3]:
import logging

logger = logging.getLogger(__name__)


In [None]:
# get_token()

In [4]:
from llama_index import download_loader
from langchain.llms import AzureOpenAI
from llama_index import LLMPredictor
from langchain.embeddings import OpenAIEmbeddings
from llama_index import LangchainEmbedding
from azure.search.documents import SearchClient 
from azure.core.credentials import AzureKeyCredential  
from llama_index.vector_stores import CognitiveSearchVectorStore
from llama_index import VectorStoreIndex, StorageContext, ServiceContext
from llama_index import load_index_from_storage
from typing import Dict, Any 
import re
import base64 
import os
from azure.identity import DefaultAzureCredential
from pypdf import PdfReader, PdfWriter
from azure.storage.blob import BlobServiceClient
import io

INFO:numexpr.utils:NumExpr defaulting to 8 threads.
NumExpr defaulting to 8 threads.


In [5]:
storage_account_url = "https://aoaist002.blob.core.windows.net"
loader_hub_fork_url = "https://raw.githubusercontent.com/rivms/llama-hub/azblobmetadata/llama_hub"

# aoai_base = "https://demoaoai002.openai.azure.com/"
aoai_base = "https://demofcaoai004.openai.azure.com/"
azure_kwargs={"api_type": "azure_ad", "api_version": "2023-03-15-preview", "api_base": aoai_base}

In [6]:
# Search Client Parameters
key = "2UCPzVfO4hVvxvRFFTbreE6MFSUTEmSgWYGpniC6pXAzSeBiVDiN"
credential = AzureKeyCredential(key)

service_endpoint = "https://gptkb-gdm7wgiiihc5y.search.windows.net"


#index_name = "aoaientsearch-bhp"

#index_name = "aoaientsearchsynergy"
index_name = "aoaientsearchinpex"

In [7]:
AzStorageBlobReader = download_loader("AzStorageBlobReader", loader_hub_fork_url, refresh_cache = True)

In [10]:
def load_documents(container_name: str, 
                blob: str,
                account_url: str,
                loader_hub_url: str): 
    default_credential = DefaultAzureCredential()

    loader = AzStorageBlobReader(container_name=container_name, 
        blob=blob,
        account_url=account_url, 
        credential=default_credential, 
        loader_hub_url=loader_hub_url) #, num_files_limit = 1)

    documents = loader.load_data()

    return documents


def index_document(documents: Any,
                azure_kwargs: Any,
                service_endpoint: str,
                index_name: str,
                persist_dir: str,
                credential: AzureKeyCredential
                ):
    
    default_credential = DefaultAzureCredential()
    token = default_credential.get_token("https://cognitiveservices.azure.com/.default")

    logger.info('Number of documents: {}'.format(len(documents)))


    llm_predictor = get_llm(openai_api_version=azure_kwargs["api_version"], 
                        openai_api_key=token.token, 
                        model_kwargs=azure_kwargs) 

    embeddings = get_embedding(openai_api_key=token.token, model_kwargs=azure_kwargs)


    vector_store = get_vector_store(service_endpoint, index_name, credential)

    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, embed_model=embeddings) # using default chunk limit

    try:
        logger.info(f"Loading storage context from {persist_dir}")
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir, vector_store=vector_store)
    except FileNotFoundError as e:
        logger.info(f"Failed to load storage context from {persist_dir}, create default")
        storage_context = StorageContext.from_defaults(vector_store=vector_store)


    logger.info(f"Parsing nodes")
    nodes = service_context.node_parser.get_nodes_from_documents(documents)
    try:
        logger.info(f"Trying to load index from storage")
        cog_index = load_index_from_storage(storage_context=storage_context, service_context = service_context)
        cog_index.insert_nodes(nodes)
    except ValueError as e:
        logger.info(f"Creating a new index")
        cog_index = VectorStoreIndex(
        nodes=nodes, storage_context=storage_context, service_context=service_context)

    logger.info(f"Saving storage context to {persist_dir}")
    storage_context.persist(persist_dir=persist_dir)

    

In [14]:
def get_token():
    # Request credential
    default_credential = DefaultAzureCredential()
    token = default_credential.get_token("https://cognitiveservices.azure.com/.default")
    return token.token

def get_vector_store(service_endpoint: str, index_name: str, credential: Any) -> CognitiveSearchVectorStore:
    search_client = SearchClient(service_endpoint, index_name, credential=credential) 
    vector_store = CognitiveSearchVectorStore(search_client,
                                          id_field_key = "li_id",
                                          chunk_field_key= "content",
                                          embedding_field_key = "embedding",
                                          metadata_field_key= "li_jsonMetadata",
                                          doc_id_field_key = "li_doc_id",
                                          index_mapping = cogsearch_ent_index_mapping
    )
    return vector_store

def get_llm(openai_api_version: str, openai_api_key: str, model_kwargs: Any) -> LLMPredictor:
    llm = AzureOpenAI(temperature=0.9, 
                  deployment_name="text-davinci-003", 
                  model_name="text-davinci-003", 
                  openai_api_version= openai_api_version, 
                  openai_api_key=openai_api_key, 
                  model_kwargs=model_kwargs)

    # define LLM
    llm_predictor = LLMPredictor(llm)   
    return llm_predictor


def get_embedding(openai_api_key: str, model_kwargs: Any) -> LangchainEmbedding:
    # load in AOAI embedding model from langchain
    oai_embeddings = OpenAIEmbeddings(model="text-embedding-ada-002",
                              deployment="text-embedding-ada-002",
                              openai_api_key=openai_api_key,
                              openai_api_base=model_kwargs["api_base"],
                              openai_api_type=model_kwargs["api_type"],
                              openai_api_version=model_kwargs["api_version"],
                              chunk_size=1)
    embeddings = LangchainEmbedding(oai_embeddings)
    return embeddings


def filename_to_id(filename):
    filename_ascii = re.sub("[^0-9a-zA-Z_-]", "_", filename)
    filename_hash = base64.b16encode(filename.encode('utf-8')).decode('ascii')
    return f"file-{filename_ascii}-{filename_hash}"

def blob_name_from_file_page(filename, page = 0):
    if os.path.splitext(filename)[1].lower() == ".pdf":
        return os.path.splitext(os.path.basename(filename))[0] + f"-{page}" + ".pdf"
    else:
        return os.path.basename(filename)

def cogsearch_ent_index_mapping(
        enriched_doc: Dict[str, str], metadata: Dict[str, Any]
    ) -> Dict[str, str]:
        index_doc: Dict[str, str] = {}
        field_mapping = { "id": "id", "chunk": "content", "embedding": "contentVector", "doc_id": "document_id", "metadata": "jsonMetadata"}

        page = metadata.get("page_number", 0)
        sourcefile = metadata["file_name"]
        file_id = filename_to_id(sourcefile)
        # id, content, embedding, sourcepage, sourcefile

        index_doc["id"] = f"{file_id}-page-{page}"
        index_doc["content"] = enriched_doc["chunk"]
        index_doc["embedding"] = enriched_doc["embedding"]
        index_doc["sourcepage"] = blob_name_from_file_page(sourcefile, page)
        index_doc["sourcefile"] = sourcefile
        index_doc["li_id"] = enriched_doc["id"]
        index_doc["li_jsonMetadata"] = enriched_doc["metadata"]
        index_doc["li_doc_id"] = enriched_doc["doc_id"]

        return index_doc

def upload_blobs(filename,split_pdf_storage_account, split_pdf_container):
    default_credential = DefaultAzureCredential()
    print(f"Starting upload: {filename}")
    blob_service = BlobServiceClient(account_url=f"https://{split_pdf_storage_account}.blob.core.windows.net", credential=default_credential)
    print(f"Starting upload: {filename}")
    blob_container = blob_service.get_container_client(split_pdf_container)
    if not blob_container.exists():
        blob_container.create_container()

    # if file is PDF split into pages and upload each page as a separate blob
    if os.path.splitext(filename)[1].lower() == ".pdf":
        logger.info(f"Processing pdf")
        reader = PdfReader(filename)
        pages = reader.pages
        logger.info(f"Processing {len(pages)} pages")
        for i in range(len(pages)):
            blob_name = blob_name_from_file_page(filename, i)
            logger.info(f"Creating blob for page {i} -> {blob_name}")
            f = io.BytesIO()
            writer = PdfWriter()
            writer.add_page(pages[i])
            writer.write(f)
            f.seek(0)
            logger.info(f"Uploading blob for page {i} of {len(pages)} -> {blob_name}")
            blob_container.upload_blob(blob_name, f, overwrite=True)
            logger.info(f"Upload complete for page {i} of {len(pages)}-> {blob_name}")
    else:
        logger.info(f"Not a pdf")
        blob_name = blob_name_from_file_page(filename)
        with open(filename,"rb") as data:
            blob_container.upload_blob(blob_name, data, overwrite=True)
    

In [None]:
annual_reports = ["200915_BHPAnnualReport2020.pdf", "210914_bhpannualreport2021.pdf",
                "220906_bhpannualreport2022.pdf", "bhpannualreport2017.pdf",
                "bhpannualreport2018.pdf", "bhpannualreport2019.pdf",
                "bhpBillitonAnnualReport2010.pdf", "BHPBillitonAnnualReport2013.pdf",
                "BHPBillitonAnnualReport2014_interactive.pdf", "bhpbillitonannualreport2015.pdf",
                "bhpbillitonannualreport2016.pdf", "BHPBillitonAnnualReport2011.pdf",
                "BHPBillitonAnnualReport2012.pdf", "annualReport2009.pdf"]

In [20]:
annual_reports_inpex = ["inpex_single_annualreport202112_en.pdf", "inpex_annualreport202112_en.pdf",
                  "inpex_annualreport202012_en.pdf", "inpex_annualreport2018_en.pdf"]
persist_dir = "./llama_store_synergy"
split_pdf_storage_account = "stgdm7wgiiihc5y"
split_pdf_container = "aoaientsearchinpex"

In [None]:
annual_reports_synergy = ["Synergy 2020 Annual Report.pdf", "Synergy 2021 Annual Report.pdf", "Synergy 2022 Annual Report.pdf"]
persist_dir = "./llama_store_synergy"
split_pdf_storage_account = "stgdm7wgiiihc5y"
split_pdf_container = "aoaientsearchsynergy"

In [None]:
persist_dir = "./llama_store_bhp"
split_pdf_storage_account = "stgdm7wgiiihc5y"
split_pdf_container = "aoaientsearchbhp"

In [17]:
logger.setLevel(logging.INFO)

In [36]:
pdf = annual_reports_inpex[3]

documents = load_documents(container_name='annualreportsinpex', blob=pdf, account_url=storage_account_url,
            loader_hub_url=loader_hub_fork_url)

INFO:azure.identity._credentials.environment:No environment configuration found.
No environment configuration found.
INFO:azure.identity._credentials.managed_identity:ManagedIdentityCredential will use IMDS
ManagedIdentityCredential will use IMDS
INFO:custom_loader:Adding metadata for C:\Users\rijumna\AppData\Local\Temp\tmpa7rxo96k\mxuecv4n.pdf
Adding metadata for C:\Users\rijumna\AppData\Local\Temp\tmpa7rxo96k\mxuecv4n.pdf
INFO:custom_loader:Start download of inpex_annualreport2018_en.pdf
Start download of inpex_annualreport2018_en.pdf
INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'
Request method: 'GET'
Request headers:
    'User-Agent': 'azsdk-python-identity/1.14.0 Python/3.11.4 (Windows-10-10.0.22621-SP0)'
No body was attached to the request
Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'
Request method: 'GET'

In [37]:
len(documents)

98

In [38]:
documents[0]

Document(id_='738481fa-3ee8-4e31-879c-64b2d86a01fe', embedding=None, metadata={'page_label': '1', 'file_name': 'inpex_annualreport2018_en.pdf', 'page_number': 0}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='0f5ad843995bff169e4a864bb0450c4b722e9bfe6fcd4de4d6faed7cb4e5a58a', text='Annual Report 2018\nYear ended March 31, 2018\nAkasaka Biz Tower \n5-3-1 Akasaka, Minato-ku, Tokyo 107-6332, JapanPhone: +81-3-5572-0200\nhttps://www.inpex.co.jp/english', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [39]:
index_document(documents,
            azure_kwargs,
            service_endpoint,
            index_name,
            persist_dir,
            credential)

INFO:azure.identity._credentials.environment:No environment configuration found.
No environment configuration found.
INFO:azure.identity._credentials.managed_identity:ManagedIdentityCredential will use IMDS
ManagedIdentityCredential will use IMDS
INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'
Request method: 'GET'
Request headers:
    'User-Agent': 'azsdk-python-identity/1.14.0 Python/3.11.4 (Windows-10-10.0.22621-SP0)'
No body was attached to the request
Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'
Request method: 'GET'
Request headers:
    'User-Agent': 'azsdk-python-identity/1.14.0 Python/3.11.4 (Windows-10-10.0.22621-SP0)'
No body was attached to the request
INFO:azure.identity._credentials.chained:DefaultAzureCredential acquired a token from AzureCliCredential
DefaultAzureCredential acquired a token from A

In [None]:
index_name

In [None]:
pdf = annual_reports_synergy[2]

documents = load_documents(container_name='aoaientsearchsynergy', blob=pdf, account_url=storage_account_url,
            loader_hub_url=loader_hub_fork_url)

In [None]:
len(documents)

In [None]:
documents[5]

In [None]:
index_document(documents,
            azure_kwargs,
            service_endpoint,
            index_name,
            persist_dir,
            credential)

In [42]:
local_base_dir = "C:\\Users\\rijumna\\Downloads\\"

In [40]:
split_pdf_container

'aoaientsearchinpex'

In [46]:
report = annual_reports_inpex[3]
logger.info(f"Uploading report {report}")
upload_blobs(local_base_dir + report, split_pdf_storage_account, split_pdf_container)

INFO:__main__:Uploading report inpex_annualreport2018_en.pdf
Uploading report inpex_annualreport2018_en.pdf
INFO:azure.identity._credentials.environment:No environment configuration found.
No environment configuration found.
INFO:azure.identity._credentials.managed_identity:ManagedIdentityCredential will use IMDS
ManagedIdentityCredential will use IMDS
Starting upload: C:\Users\rijumna\Downloads\inpex_annualreport2018_en.pdf
Starting upload: C:\Users\rijumna\Downloads\inpex_annualreport2018_en.pdf
INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'
Request method: 'GET'
Request headers:
    'User-Agent': 'azsdk-python-identity/1.14.0 Python/3.11.4 (Windows-10-10.0.22621-SP0)'
No body was attached to the request
Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'
Request method: 'GET'
Request headers:
    'User-Agent': 'azs

In [None]:
upload_blobs(local_base_dir + annual_reports[4], split_pdf_storage_account, split_pdf_container)

In [None]:
upload_blobs(local_base_dir + annual_reports[12], split_pdf_storage_account, split_pdf_container)