Initializing Necessary Azure SDK Clients for Cognitive Search and OpenAI

In [None]:
"""This program utilizes Azure Cognitive Search and Azure OpenAI to answer questions
from uploaded PDF documents in Azure Blob Storage.
It leverages Python virtual environment for development.
"""
import os
import openai
from azure.identity import DefaultAzureCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import QueryType
from azure.search.documents.models import QueryLanguage
from azure.core.credentials import AzureKeyCredential
from azure.storage.blob import BlobServiceClient

# Deployment settings. Each value is taken from the environment variable of the
# same name; when that variable is unset or empty, the literal on the right is
# used instead (replace the placeholders with your own resource names).
AZURE_STORAGE_ACCOUNT = os.getenv("AZURE_STORAGE_ACCOUNT") or "your-storage-account-name"
AZURE_STORAGE_CONTAINER = os.getenv("AZURE_STORAGE_CONTAINER") or "your-container-name"
AZURE_SEARCH_SERVICE = os.getenv("AZURE_SEARCH_SERVICE") or "your-cg-search-service-name"
AZURE_SEARCH_INDEX = os.getenv("AZURE_SEARCH_INDEX") or "your-cg-search-index-name"
AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE") or "your-openai-service-name"
AZURE_OPENAI_GPT_DEPLOYMENT = os.getenv("AZURE_OPENAI_GPT_DEPLOYMENT") or "your-model-name"
AZURE_OPENAI_CHATGPT_DEPLOYMENT = os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT") or "chat"

# Names of the search-index fields read from each search result
KB_FIELDS_CONTENT = os.getenv("KB_FIELDS_CONTENT") or "content"
KB_FIELDS_CATEGORY = os.getenv("KB_FIELDS_CATEGORY") or "category"
KB_FIELDS_SOURCEPAGE = os.getenv("KB_FIELDS_SOURCEPAGE") or "metadata_storage_name"

# Used by the OpenAI SDK
openai.api_type = "azure"
openai.api_base = f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com"
openai.api_version = "2022-12-01"

# Prefer the OPENAI_API_KEY environment variable. Assigning the placeholder
# unconditionally would discard a key supplied through the environment, so the
# literal below is only a fallback for when the variable is unset or empty.
openai.api_key = os.environ.get("OPENAI_API_KEY") or 'YOUR OPENAI API KEY'
# Cognitive Search admin/query key, looked up as:
# az search admin-key show --resource-group <myresourcegroup> --service-name <myservice>
# Read from AZURE_SEARCH_KEY when set, so the secret need not be written here.
search_key = os.environ.get("AZURE_SEARCH_KEY") or "YOUR Cognitive Search Admin/Query KEY"

# Two credential objects are created here:
#   * az_credential    - DefaultAzureCredential, i.e. the current identity
#                        ('az login' locally, managed identity when deployed on
#                        Azure); it is handed to the Blob Storage client below.
#   * azure_credential - AzureKeyCredential wrapping search_key; it is handed to
#                        the Cognitive Search client below.
# If DefaultAzureCredential resolution blocks on one credential type, exclude
# that type with a constructor flag
# (e.g. exclude_shared_token_cache_credential=True).
azure_credential = AzureKeyCredential(search_key)
az_credential = DefaultAzureCredential()

# Blob Storage: account-level client, then a client scoped to the container
blob_client = BlobServiceClient(
    credential=az_credential,
    account_url=f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net",
)
blob_container = blob_client.get_container_client(AZURE_STORAGE_CONTAINER)

# Cognitive Search: query client bound to a single index
search_client = SearchClient(
    credential=azure_credential,
    index_name=AZURE_SEARCH_INDEX,
    endpoint=f"https://{AZURE_SEARCH_SERVICE}.search.windows.net",
)

Custom prompt template for querying your enterprise data

In [None]:
# Few-shot completion prompt: an instruction preamble, one worked
# question/sources/answer example, then the slots filled by str.format():
#   {q}         - the user's question
#   {retrieved} - the source text the answer must be based on
template = (
    "You are an intelligent assistant helping users with their Diabetes related questions. "
    "Use 'you' to refer to the individual asking the questions even if they ask with 'I'. "
    "Answer the following question using only the data provided in the sources below. "
    """

###
Question: 'What is Type 1 Diabetes?'

Sources:
info1.txt: Type 1 diabetes, also known as insulin-dependent diabetes or juvenile-onset diabetes, typically develops in childhood or adolescence. 
info2.pdf: Type 1 diabetes occurs when the immune system mistakenly attacks and destroys the insulin-producing beta cells in the pancreas.

Answer:
Type 1 diabetes, also known as insulin-dependent diabetes or juvenile-onset diabetes, typically develops in childhood or adolescence. It occurs when the immune system mistakenly attacks and destroys the insulin-producing beta cells in the pancreas. The exact cause of this autoimmune response is not fully understood, but genetic and environmental factors are thought to play a role.

###
Question: '{q}'?

Sources:
{retrieved}

Answer:
"""
)

Fetching results from Cognitive Search based on user query. Results are fetched via text search. Semantic search is not applied.

In [None]:
user_input = "What is type1 Diabetes?"

# Exclude category, to simulate scenarios where there's a set of docs you can't see
exclude_category = None
search = user_input
print("Searching:", search)
print("-------------------")
# OData filter expression that drops documents of the excluded category, or None
# for no filtering. The field name comes from KB_FIELDS_CATEGORY so it follows the
# configured index schema; single quotes in the value are doubled to escape them.
# NOTE: the name `filter` shadows the builtin; it is kept so that other cells
# referring to it keep working.
filter = (
    "{} ne '{}'".format(KB_FIELDS_CATEGORY, exclude_category.replace("'", "''"))
    if exclude_category
    else None
)
# Perform the search using Azure Cognitive Search (simple text query, top 3 hits).
# The filter has to be passed explicitly, otherwise exclude_category has no effect.
r = search_client.search(search,
                         filter=filter,
                         query_type=QueryType.SIMPLE,
                         top=3)
# One "<source page>: <content>" entry per hit. Line breaks become spaces (not
# empty strings) so the words on either side of a break are not fused together.
results = [doc[KB_FIELDS_SOURCEPAGE] + ": " + doc[KB_FIELDS_CONTENT].replace("\n", " ").replace("\r", " ") for doc in r]
content = "\n".join(results)
print("***********************")
print(content)


Basic text summarization function. If the documents are large and the text is complex, pre-trained models
such as Pegasus or BART from Hugging Face `transformers` can be used instead.

Note: Model execution time varies with the number of input tokens, which in turn depends on the length of the text provided.

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

def summarize(text, per):
    """Return an extractive, frequency-based summary of *text*.

    Each word that is neither a spaCy stop word nor punctuation is counted
    case-insensitively and its count is normalised by the highest count.
    A sentence's score is the sum of the normalised counts of its words, and
    the best-scoring sentences are returned.

    Args:
        text: The text to summarise.
        per: Fraction of the sentences to keep. The number kept is
            ``int(number_of_sentences * per)``, which truncates, so a short
            text combined with a small ``per`` selects no sentence at all.

    Returns:
        The selected sentences, highest score first, separated by single
        spaces. An empty string when nothing is selected or when *text*
        contains no countable word.
    """
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)

    # Count under the lower-cased form: the scoring loop below looks words up
    # lower-cased, so mixed-case keys would make capitalised words score zero.
    word_frequencies = {}
    for word in doc:
        key = word.text.lower()
        if key in STOP_WORDS or key in punctuation:
            continue
        word_frequencies[key] = word_frequencies.get(key, 0) + 1

    # Nothing to score (empty text, or only stop words / punctuation);
    # max() below would raise ValueError on an empty sequence.
    if not word_frequencies:
        return ''

    # Normalise every count by the largest one
    max_frequency = max(word_frequencies.values())
    for key in word_frequencies:
        word_frequencies[key] = word_frequencies[key] / max_frequency

    # Weigh each sentence by the normalised frequencies of the words it contains
    sentence_tokens = list(doc.sents)
    sentence_scores = {}
    for sent in sentence_tokens:
        for word in sent:
            key = word.text.lower()
            if key in word_frequencies:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[key]

    select_length = int(len(sentence_tokens) * per)
    # Highest-scoring sentences; each element is a spaCy Span
    summary = nlargest(select_length, sentence_scores, key=sentence_scores.get)
    # Span.text carries no trailing whitespace, so join with a space to keep
    # neighbouring sentences from running into each other.
    return ' '.join(sent.text for sent in summary)

Defining Parameters for text cleaning

In [None]:
# Lookup table from an English contraction to its expanded form.
# text_cleaner() looks up each space-separated token of its (lower-cased) input
# here and substitutes the expansion when the token is a key.
# Both "I..." and "i..." spellings of the first-person entries are present.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                           "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                           "you're": "you are", "you've": "you have"}

Text Cleaning Functionality

In [None]:
from bs4 import BeautifulSoup 

import re
def text_cleaner(text):
    """Reduce *text* to a lower-case, space-separated string of content words.

    Steps, in order: lower-case the input; extract the text via BeautifulSoup's
    "lxml" parser; delete parenthesised spans; delete double quotes; expand
    tokens (split on single spaces) found in ``contraction_mapping``; delete
    "'s"; turn every non-letter into a space; drop words that are in
    ``STOP_WORDS`` or shorter than three characters.

    Args:
        text: The raw text to clean.

    Returns:
        The surviving words joined by single spaces.
    """
    lowered = text.lower()
    plain = BeautifulSoup(lowered, "lxml").text
    plain = re.sub(r'\([^)]*\)', '', plain)
    plain = re.sub('"', '', plain)

    # Contractions are matched per token, splitting on single spaces only
    expanded = ' '.join(contraction_mapping.get(tok, tok) for tok in plain.split(" "))
    expanded = re.sub(r"'s\b", "", expanded)

    letters_only = re.sub("[^a-zA-Z]", " ", expanded)
    kept = [
        w for w in letters_only.split()
        if w not in STOP_WORDS and len(w) >= 3
    ]
    return " ".join(kept)

Run the text summarization, then post-process the summarized text with the text cleaner before passing it to the GPT prompt.

In [None]:
# Summarise the search results, keeping int(sentence_count * 0.05) sentences,
# then normalise the extract with text_cleaner for use in the prompt.
summary = summarize(text=content, per=0.05)
result = text_cleaner(text=summary)

Now that the text has been summarized, the token count is significantly reduced, so even cheaper OpenAI models such as DaVinci can be used instead of GPT-3.5 Turbo or GPT-4.

In [None]:
# Fill the prompt template with the user's question and the cleaned summary
prompt = template.format(retrieved=result, q=user_input)

# Request a single completion from the configured deployment; generation is
# cut at the first newline (stop=["\n"]).
completion = openai.Completion.create(
    engine=AZURE_OPENAI_GPT_DEPLOYMENT,
    prompt=prompt,
    n=1,
    max_tokens=1024,
    temperature=0.3,
    stop=["\n"],
)

# Show the raw API response, then a dict with the retrieved sources, the answer
# text and the prompt (newlines rendered as <br>).
print(completion)
print({
    "data_points": results,
    "answer": completion.choices[0].text,
    "thoughts": f"Question:<br>{user_input}<br><br>Prompt:<br>" + prompt.replace('\n', '<br>'),
})


Bonus: Using Google's Pegasus model from Hugging Face.
Note: The model is constrained to 1024 input tokens, and summarizing longer texts will result in higher execution time.

In [None]:
from transformers import PegasusForConditionalGeneration
from transformers import PegasusTokenizer
# Hugging Face checkpoint shared by the tokenizer and the model
model_name = "google/pegasus-xsum"

# Load the pretrained weights and the matching tokenizer from that checkpoint
pegasus_model = PegasusForConditionalGeneration.from_pretrained(model_name)
pegasus_tokenizer = PegasusTokenizer.from_pretrained(model_name)

Due to the model's token limit, we divide the text into batches and pass each one to the model.
We then extract a summary from each batch and join them together.

Note: this method may not be computationally efficient.

In [None]:
# Width of each chunk. The slices below are plain string slices, so this is a
# number of characters of `content`, not a number of tokens.
section_size = 1024

# Cut the Cognitive Search `content` into consecutive fixed-width chunks
sections = [content[i:i+section_size] for i in range(0, len(content), section_size)]

# One summary string per chunk, in chunk order
summaries = []

for section in sections:
    # Encode the chunk as a PyTorch tensor, truncated to at most 1024 tokens
    input_ids = pegasus_tokenizer.encode(
        section,
        return_tensors='pt',
        max_length=1024,
        truncation=True,
    )

    # Beam search (4 beams) for a summary of at most 100 tokens
    summary_ids = pegasus_model.generate(
        input_ids,
        max_length=100,
        num_beams=4,
        early_stopping=True,
    )

    # Turn the generated ids back into text, dropping special tokens
    summary = pegasus_tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
    summaries.append(summary)

# Stitch the per-chunk summaries together and show the result
final_summary = " ".join(summaries)
print(final_summary)