### Install Python Packages

In [None]:
# Install the third-party packages used by this notebook into the kernel's environment.
# %pip (rather than !pip) targets the environment of the running kernel.
# NOTE(review): only azure-search-documents is version-pinned below; every other
# package installs at its latest release, so results may drift over time.
%pip install azure-functions
%pip install azure-core
%pip install azure-cosmos
%pip install openai
%pip install numpy
%pip install requests
%pip install pandas
%pip install azure-storage-blob 
%pip install azure-identity
%pip install smart_open
%pip install tenacity
%pip install pinecone-client
%pip install redis
%pip install tiktoken
%pip install azure-storage-file-share
%pip install python-dotenv
# Pinned to a specific beta build.
%pip install azure-search-documents==11.4.0b3
%pip install azure-ai-formrecognizer
%pip install beautifulsoup4
%pip install lxml
%pip install azure-ai-textanalytics
%pip install langchain

### Import Packages

In [4]:
import os
from dotenv import load_dotenv
# Load the .env file into os.environ before anything else reads configuration.
# override=True lets values from .env replace variables already set in the environment.
load_dotenv(override=True)

import shutil
import sys
# Add ./utils to the module search path (relative to the kernel's working directory).
sys.path.append('./utils')

### Uncomment below imports as needed -- make sure that all relevant values and keys in the .env file are properly populated
# from utils import redis_helpers
# from utils import helpers
# from utils import language
# from utils import openai_helpers
# from utils import storage
# from utils import bot_helpers

# Reload imported modules automatically before each cell runs, so edits to the
# utils modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Activate Cognitive Search Ingestion 
#### First Run - Create Index and Indexer 
##### Caution: this will destroy any data you might already have in your index

In [6]:
#### Ingest all knowledge base documents
# Per the run log recorded under this cell, ingest_kb deletes and recreates the
# search indexes, skillset, data source and indexer, and then starts the indexer.
from utils import cogsearch_helpers

# Name of the blob container holding the knowledge base; raises KeyError if the
# variable is missing from the environment / .env file.
KB_BLOB_CONTAINER = os.environ["KB_BLOB_CONTAINER"]

cogsearch_helpers.ingest_kb(container = KB_BLOB_CONTAINER)


Index km-openai-sem Deleted
Index km-openai-sem created
Index km-openai Deleted
Index km-openai created
Deleted Skillset - km-openai-skills
Created new Skillset - km-openai-skills
Deleted Indexer - km-openai-indexer
Deleted Data Source - km-openai-skills
Created new Data Source Connection - km-openai-docs
Created new Indexer - km-openai-indexer
Running Indexer km-openai-indexer


#### Additional Runs - Re-indexing with delta documents

In [4]:
### Re-index additional documents
# Starts the existing indexer again; per the recorded output it only runs the
# indexer and does not delete or recreate any index.
from utils import cogsearch_helpers

cogsearch_helpers.run_indexer()

Running Indexer km-openai-indexer


### Activate Form Recognizer Ingestion

In [2]:
#### Ingest all form documents

# NOTE(review): `storage` is imported but not referenced in this cell.
from utils import storage
from utils import fr_helpers

# Input and output blob container names come from the environment / .env file;
# a missing variable raises KeyError.
FR_CONTAINER = os.environ['FR_CONTAINER']
OUTPUT_BLOB_CONTAINER = os.environ['OUTPUT_BLOB_CONTAINER']


# Forms are read from FR_CONTAINER and results are written to OUTPUT_BLOB_CONTAINER.
fr_helpers.process_forms(in_container = FR_CONTAINER, out_container = OUTPUT_BLOB_CONTAINER)



### Scrape Web Pages

In [None]:
### Scrape data from web pages
### This saves the text from the web pages into the processed folder, and any files which are found are saved
### to the demo folder. For the files to be processed, you need to run the cognitive search re-indexer

# TODO: Some files may be better parsed by form recognizer. Need to decide how to know which is more suitable,
# and place them in the correct folder.

# Enter the URLs (only enter the root url, the crawler traverses the hierarchy of the webpages below the root)
# that you want to scrape in the urls list. Each url should be a string
urls = []

from utils import web_crawler

# Storage connection string and container names come from the environment / .env file;
# a missing variable raises KeyError.
KB_BLOB_CONN_STR = os.environ["KB_BLOB_CONN_STR"]
KB_BLOB_CONTAINER = os.environ["KB_BLOB_CONTAINER"]
OUTPUT_BLOB_CONTAINER = os.environ['OUTPUT_BLOB_CONTAINER']

# Crawl each root URL once. The loop variable `url` is passed, not the whole `urls`
# list: passing the list handed the crawler a list instead of a string on every
# iteration.
# NOTE(review): assumes crawl() expects a single root-URL string as its first
# argument -- confirm against utils/web_crawler.py.
for url in urls:
    web_crawler.crawl(url, KB_BLOB_CONN_STR, KB_BLOB_CONTAINER, OUTPUT_BLOB_CONTAINER)


### Interrogate the APIs with the sample Knowledge Base

In [None]:
### Use this cell to query Redis with the below queries
import json
import os
from utils import bot_helpers

 
# Questions to send, one call per entry.
queries = [
        "where does the arabian oryx live?"
    ]


for q in queries:
    # NOTE(review): the meaning of the second (None) and third ('orig_lang:en')
    # arguments is defined in utils/bot_helpers -- confirm there.
    output = bot_helpers.openai_interrogate_text(q, None, 'orig_lang:en')
    # The helper's return value is parsed as JSON; only the 'answer' field is printed.
    output = json.loads(output)
    print("\n\n", output['answer'], '\n\n\n###############################')
    


In [39]:
### other containers could be used as the sources of documents to index

# Import here, as the other Cognitive Search cells do, so this cell also runs on a
# fresh kernel without relying on an earlier cell having imported the module.
from utils import cogsearch_helpers

# Point the indexer at a different blob container, then run it. Per the recorded
# output this recreates the data source connection and the indexer.
# NOTE(review): 'kmoaidemo2' is a hard-coded container name -- replace it with your own.
cogsearch_helpers.create_indexer('kmoaidemo2')
cogsearch_helpers.run_indexer()

Deleted Indexer - km-openai-indexer
Deleted Data Source - km-openai-skills
Created new Data Source Connection - km-openai-docs
Created new Indexer - km-openai-indexer
Running Indexer km-openai-indexer


## Experimentation Code Below - NO NEED TO RUN 
### For your reference only

In [1]:
#### Reset Index in Redis
from utils import redis_helpers

# Safety switch: set to False to run this cell without touching the index.
reset_index = True

if reset_index:
    # Open a new Redis connection and reset the index through it.
    redis_conn = redis_helpers.get_new_conn()
    redis_helpers.redis_reset_index(redis_conn)

In [None]:
### Use this cell to load embeddings directly into Redis from this notebook

import json

# `helpers` is only present as a commented-out import in the notebook's import cell,
# so import it here; otherwise this cell raises NameError on a fresh kernel.
from utils import helpers


# Embedding model name and the chunk sizes (in tokens) come from the environment / .env file.
CHOSEN_EMB_MODEL   = os.environ['CHOSEN_EMB_MODEL']
SMALL_EMB_TOKEN_NUM  = int(os.environ['SMALL_EMB_TOKEN_NUM'])
MEDIUM_EMB_TOKEN_NUM  = int(os.environ['MEDIUM_EMB_TOKEN_NUM'])
LARGE_EMB_TOKEN_NUM  = int(os.environ['LARGE_EMB_TOKEN_NUM'])


# Accumulates the embedding documents generated for every file in ./dump.
emb_documents = []


for item in os.listdir("dump"):
    path = os.path.join("dump", item)

    # Each entry in ./dump is read as a JSON document.
    with open(path, 'r') as openfile:
        data = json.load(openfile)

    # Small chunks are always generated.
    emb_documents += helpers.generate_embeddings(data, CHOSEN_EMB_MODEL, SMALL_EMB_TOKEN_NUM,  text_suffix = 'S')

    # A token count of 0 disables the medium / large chunk size.
    if MEDIUM_EMB_TOKEN_NUM != 0:
        emb_documents += helpers.generate_embeddings(data, CHOSEN_EMB_MODEL, MEDIUM_EMB_TOKEN_NUM, text_suffix = 'M')

    if LARGE_EMB_TOKEN_NUM != 0:
        emb_documents += helpers.generate_embeddings(data, CHOSEN_EMB_MODEL, LARGE_EMB_TOKEN_NUM,  text_suffix = 'L')


# Push everything that was generated into Redis in a single call.
helpers.load_embedding_docs_in_redis(emb_documents)

In [None]:
# Experimental: generate embeddings for the JSON documents under 'dump' and save them to test.pkl.
# NOTE(review): `helpers`, ADA_002_EMBEDDING_MODEL and ADA_002_MODEL_MAX_TOKENS are not
# defined anywhere in the visible notebook (the `helpers` import in the import cell is
# commented out) -- this cell raises NameError unless they are defined first.
emb_documents = []

# NOTE(review): limit=-1 presumably means "no limit" -- confirm in utils/helpers.
emb_documents += helpers.generate_embeddings_from_json_docs('dump', ADA_002_EMBEDDING_MODEL, ADA_002_MODEL_MAX_TOKENS, text_suffix='XL', limit=-1)

print(f"Generated {len(emb_documents)} embeddings.")
# The helper name is spelled "embdding" here; it must match the name defined in utils/helpers.
helpers.save_embdding_docs_to_pkl(emb_documents, "test.pkl")

In [24]:
# `helpers` is only present as a commented-out import in the notebook's import cell,
# so import it here; otherwise this cell raises NameError on a fresh kernel.
from utils import helpers

# Read previously saved embedding documents from test.pkl and load them into Redis.
# NOTE(review): the .pkl name suggests a pickle file -- only load files you created yourself.
emb_documents = helpers.load_embedding_docs_from_pkl("test.pkl")
helpers.load_embedding_docs_in_redis(emb_documents)

Loading 141 embeddings into Redis


In [None]:
# Experimental query loop.
# NOTE(review): DAVINCI_003_COMPLETIONS_MODEL and ADA_002_EMBEDDING_MODEL are not defined
# anywhere in the visible notebook, and `bot_helpers` is only imported by an earlier cell.
queries = [
        "in which classes did the Danish sailors qualify?",
        "what are the reviews of the Lost City hotel?"
    ]


for q in queries:
    # NOTE(review): this call passes five positional arguments, whereas the earlier
    # query cell calls the same function as (q, None, 'orig_lang:en') -- confirm which
    # form matches the current signature in utils/bot_helpers.
    output = bot_helpers.openai_interrogate_text(q, DAVINCI_003_COMPLETIONS_MODEL, ADA_002_EMBEDDING_MODEL, 5, False)
    print("\n\n", output, '\n\n\n###############################')
    # Unconditional break: only the first query in the list is ever sent.
    break

