In [1]:
import os
import json
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, BlobSasPermissions, generate_blob_sas
import datetime

from dotenv import load_dotenv
load_dotenv()

True

Upload raw data (research papers) from "data/unstructured/raw" folder to an Azure Blob Storage container "raw-research-papers" before running this notebook. That will be your data store for your source data.

Azure Forms Recognizer will be used to extract text and tables from research papers. 

Azure Cognitive Search is used to index the extracted JSON documents, and retreive relevant information using semantic search.

Azure OpenAI service is used to generate the answers and summaries based on user prompts.

Add necessary credentials of your Azure Resources in .env file before proceeding. 

In [2]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

In [18]:
def extract_content_from_url(document_url):
    """Returns the text content of the file at the given URL."""
    #print("Analyzing", document_url)
    
    poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-layout", document_url)
    result = poller.result()
    return result

def get_page_content(result):
    page_content = []
    for page in result.pages:
        all_lines_content = []
        for line_idx, line in enumerate(page.lines):
            all_lines_content.append(' '.join([word.content for word in line.get_words()]))
        page_content.append({'page_number':page.page_number, 
                                'page_content':' '.join(all_lines_content)})
    return page_content
    


In [4]:
def get_authenticated_urls(container_name):
    """Returns a list of tuple of (document name, authenticated URLs) for
    documents in the given container."""

    urls = []
    # Connect to the storage account
    blob_service_client = BlobServiceClient.from_connection_string(os.environ['AZURE_BLOB_STORAGE_CONNECTION_STRING'])
    container_client = blob_service_client.get_container_client(container_name)

    # Iterate over the blobs in the container
    blob_list = container_client.list_blobs()
    for blob in blob_list:
        # Retrieve the URL of the blob
        blob_client = container_client.get_blob_client(blob.name)
        blob_url = blob_client.url

        #print(f"Generating authenticated URL for: {blob.name}")

        blob_sas = generate_blob_sas(
            account_name=container_client.account_name,
            account_key=container_client.credential.account_key,
            container_name=container_name,
            blob_name=blob.name,
            permission=BlobSasPermissions(read=True),
            expiry=datetime.datetime.utcnow() + datetime.timedelta(hours=1))

        authenticated_url = f"{blob_url}?{blob_sas}"
        urls.append((blob.name, authenticated_url))
    return urls

In [13]:
import html

def table_to_html(table):
    table_html = "<table>"
    rows = [sorted([cell for cell in table.cells if cell.row_index == i], key=lambda cell: cell.column_index) for i in range(table.row_count)]
    for row_cells in rows:
        table_html += "<tr>"
        for cell in row_cells:
            tag = "th" if (cell.kind == "columnHeader" or cell.kind == "rowHeader") else "td"
            cell_spans = ""
            if cell.column_span > 1: cell_spans += f" colSpan={cell.column_span}"
            if cell.row_span > 1: cell_spans += f" rowSpan={cell.row_span}"
            table_html += f"<{tag}{cell_spans}>{html.escape(cell.content)}</{tag}>"
        table_html +="</tr>"
    table_html += "</table>"
    return table_html

def get_document_text(result):
    offset = 0
    page_map = []

    for page_num, page in enumerate(result.pages):
        tables_on_page = [table for table in result.tables if table.bounding_regions[0].page_number == page_num + 1]

        # mark all positions of the table spans in the page
        page_offset = page.spans[0].offset
        page_length = page.spans[0].length
        table_chars = [-1]*page_length
        for table_id, table in enumerate(tables_on_page):
            for span in table.spans:
                # replace all table spans with "table_id" in table_chars array
                for i in range(span.length):
                    idx = span.offset - page_offset + i
                    if idx >=0 and idx < page_length:
                        table_chars[idx] = table_id

        # build page text by replacing charcters in table spans with table html
        page_text = ""
        added_tables = set()
        for idx, table_id in enumerate(table_chars):
            if table_id == -1:
                page_text += result.content[page_offset + idx]
            elif not table_id in added_tables:
                page_text += table_to_html(tables_on_page[table_id])
                added_tables.add(table_id)

        page_text += " "
        #page_map.append((page_num, offset, page_text))
        page_map.append({'page_number':page_num, 
                         'offset':offset, 
                         'page_text':page_text}) 
        offset += len(page_text)
        
    return page_map

In [6]:
raw_container_name = 'raw-research-papers'
extracted_container_name = 'extracted-research-papers'

blob_service_client = BlobServiceClient.from_connection_string(os.environ['AZURE_BLOB_STORAGE_CONNECTION_STRING'])
extracted_container_client = blob_service_client.get_container_client(container=extracted_container_name)
    
if not extracted_container_client.exists():
    extracted_container_client.create_container()

In [33]:
document_urls = get_authenticated_urls(raw_container_name)

documents = []
for document_name, document_url in document_urls:
    result = extract_content_from_url(document_url)
    page_map = get_document_text(result)
    doc = {'filename':document_name, 'file_path':document_url, 'content':page_map}
    
    documents.extend(
        [
            {
                'id': document_name.split('.')[0] + '-' + str(page['page_number']),
                'file_name': document_name,
                'file_path': document_url,
                'page_number': page['page_number'],
                'page_text': page['page_text']
            }
            for page in doc['content']
        ]
    )
                                
    blob_client = extracted_container_client.get_blob_client(blob=document_name[:-3] +'json')
    blob_client.upload_blob(json.dumps(doc), overwrite=True)
    print(f"Uploaded extracted content for: {document_name}")


Uploaded extracted content for: AutoPrompt_Eliciting_Knowledge_From_LanguageModels.pdf

Uploaded extracted content for: Chain-of-Thought_Prompting_Elicits_Reasoning_in_LLMs.pdf

Uploaded extracted content for: Generated_Knowledge_Prompting_for_Commonsense_Reasoning.pdf

Uploaded extracted content for: LLMs_are_Human-Level_Prompt_Engineers.pdf

Uploaded extracted content for: Power_of_Scale_for_Parameter-Efficient_Prompt_Tuning.pdf

Uploaded extracted content for: Precise_Zero-Shot_Dense_Retrieval_without_Relevance_Labels.pdf

Uploaded extracted content for: Prefix-Tuning_Optimizing_Continuous_Prompts_for_Generation.pdf

Uploaded extracted content for: Self-Consistency_Improves_Chain-of-Thought_Reasonsing_in_LLMs.pdf



In [34]:
#Example of a single page of research paper file that will be indexed in Azure Cognitive Search
documents[5]

{'id': 'AutoPrompt_Eliciting_Knowledge_From_LanguageModels-5',
 'file_name': 'AutoPrompt_Eliciting_Knowledge_From_LanguageModels.pdf',
 'file_path': 'https://shivaaistorage.blob.core.windows.net/raw-research-papers/AutoPrompt_Eliciting_Knowledge_From_LanguageModels.pdf?se=2023-06-18T21%3A14%3A18Z&sp=r&sv=2022-11-02&sr=b&sig=M2u0l3Kq47hOcAObXsM7%2B05DV5Drfo957hyGoYpFZ5I%3D',
 'page_number': 5,
 'page_text': '<table><tr><th rowSpan=2>Model</th><th colSpan=3>SICK-E Datasets</th></tr><tr><th>standard</th><th>3-way</th><th>2-way</th></tr><tr><td>Majority</td><td>56.7</td><td>33.3</td><td>50.0</td></tr><tr><td>BERT (finetuned)</td><td>86.7</td><td>84.0</td><td>95.6</td></tr><tr><td>BERT (linear probing)</td><td>68.0</td><td>49.5</td><td>91.9</td></tr><tr><td>RoBERTa (linear probing)</td><td>72.6</td><td>49.4</td><td>91.1</td></tr><tr><td>BERT (AUTOPROMPT)</td><td>62.3</td><td>55.4</td><td>85.7</td></tr><tr><td>RoBERTa (AUTOPROMPT)</td><td>65.0</td><td>69.3</td><td>87.3</td></tr></table>\nTab

In [35]:
# Import Azure Forms Recognizer, Azure Cognitive Search, OpenAI, and other python modules

import os, json
import requests
from pprint import pprint
import pandas as pd
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient 
from azure.search.documents import SearchClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SemanticSettings
)


import openai
import numpy as np
from openai.embeddings_utils import get_embedding, cosine_similarity

In [36]:
# Create an SDK client
service_endpoint = os.environ["AZURE_COGNITIVE_SEARCH_ENDPOINT"]  
key = os.environ["AZURE_COGNITIVE_SEARCH_KEY"]
credential = AzureKeyCredential(key)

index_name = "research-paper-blob-index"

index_client = SearchIndexClient(
    endpoint=service_endpoint, credential=credential)
index_client

<azure.search.documents.indexes._search_index_client.SearchIndexClient at 0x228588fe390>

In [37]:
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SimpleField(name="page_number", type=SearchFieldDataType.Int64),
    SimpleField(name="file_path", type=SearchFieldDataType.String),
    SearchableField(name="file_name", type=SearchFieldDataType.String,
                searchable=True, retrievable=True),
    SearchableField(name="page_text", type=SearchFieldDataType.String,
                filterable=True, searchable=True, retrievable=True),
]

semantic_config = SemanticConfiguration(
    name="defualt",
    prioritized_fields=PrioritizedFields(
        title_field=SemanticField(field_name="file_name"),
        prioritized_content_fields=[SemanticField(field_name="page_text")]
    )
)


# Create the semantic settings with the configuration
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields, semantic_settings=semantic_settings)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

 research-paper-blob-index created


In [38]:
len(documents)

179

In [39]:
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
result = search_client.upload_documents(documents)  
print(f"Uploaded {len(result)} documents") 

Uploaded 179 documents


In [40]:
openai.api_key = os.environ['OPENAI_API_KEY']
openai.api_base = os.environ['OPENAI_API_BASE']
openai.api_type = os.environ['OPENAI_API_TYPE']
openai.api_version = os.environ['OPENAI_API_VERSION']

text_model = os.environ['TEXT_DAVINCI_NAME']
chat_model = os.environ['CHAT_MODEL_NAME']
embedding_model=os.environ['EMBEDDING_MODEL_NAME']

In [42]:
# Handling Rate Limits

from tenacity import retry, stop_after_attempt, wait_random_exponential # for exponential backoff
from openai.error import RateLimitError
from time import sleep


def get_embedding(text: str, engine: str = "text-embedding-ada-002"):
    while True:
        try:
            embedding = openai.Embedding().create(input=[text], engine=engine)["data"][0]["embedding"]
            break;
        except RateLimitError:
            sleep(2)            
    return np.array(embedding).astype(np.float32)

@retry(wait=wait_random_exponential(min=1, max=30), stop=stop_after_attempt(10))
def get_completion(prompt, model="gpt-35-turbo"): # Andrew mentioned that the prompt/ completion paradigm is preferable for this class
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        engine=model,
        messages=messages,
        temperature=0, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message["content"]


In [49]:
query = "What is automated prompt engineering?"
count = 10
results = search_client.search(search_text=query, top=count, include_total_count=True)
page_chunks = []
citations = []
for result in results:
    page_chunks.append(result['page_text'])
    citations.append(result['file_name'])
    
embed_df = pd.DataFrame({"retrieved_page":page_chunks, "file_name":citations}) #datframe with document chunks and citations
embed_df['embedding'] = embed_df["retrieved_page"].apply(lambda page_text : get_embedding(page_text, engine = embedding_model))
embed_df

Unnamed: 0,retrieved_page,file_name,embedding
0,Translation en-es\nInstruction Only\nIn-contex...,LLMs_are_Human-Level_Prompt_Engineers.pdf,"[-0.028809441, 0.010793041, 0.017176475, -0.01..."
1,p(y|xprompt) = 3 p([MASK] = w|xprompt)\nHoweve...,AutoPrompt_Eliciting_Knowledge_From_LanguageMo...,"[-0.03455013, -0.008062171, -0.0059134425, -0...."
2,arXiv:2211.01910v1 [cs.LG] 3 Nov 2022\nLARGE L...,LLMs_are_Human-Level_Prompt_Engineers.pdf,"[-0.023766506, -0.004874184, 0.008511266, -0.0..."
3,A PROMPT ENGINEERING IN THE WILD\nLarge models...,LLMs_are_Human-Level_Prompt_Engineers.pdf,"[-0.023554025, 0.004547, 0.006089732, -0.00838..."
4,Original Input Cinp a real joy.\nAUTOPROMPT Ip...,AutoPrompt_Eliciting_Knowledge_From_LanguageMo...,"[-0.028821949, -0.01375198, 0.005047462, -0.01..."
5,"Tianyu Gao, Adam Fisch, and Danqi Chen. Making...",LLMs_are_Human-Level_Prompt_Engineers.pdf,"[-0.013933532, 0.0027675957, 0.015302562, -0.0..."
6,how to write the chain of thought annotations ...,Chain-of-Thought_Prompting_Elicits_Reasoning_i...,"[-0.02224152, 0.003555452, -0.002306708, -0.02..."
7,Task NumerSense\nPrompt\nGenerate some numeric...,Generated_Knowledge_Prompting_for_Commonsense_...,"[-0.0053810575, 0.010437038, 0.028689751, 0.00..."
8,AUTOPROMPT: Eliciting Knowledge from Language ...,AutoPrompt_Eliciting_Knowledge_From_LanguageMo...,"[-0.017496426, 0.0055379565, 0.0054711495, -0...."
9,Question\nTracy used a piece of wire 4 feet lo...,Chain-of-Thought_Prompting_Elicits_Reasoning_i...,"[0.007523509, 0.012140825, 0.0124328025, -0.03..."


In [63]:
query_embedding = get_embedding(query, engine=embedding_model)
embed_df["similarities"] = embed_df['embedding'].apply(lambda page_embedding: cosine_similarity(page_embedding, query_embedding))

top_results = (
    embed_df.sort_values("similarities", ascending=False)
    .reset_index(drop=True)
    .head(3)
)
top_results

Unnamed: 0,retrieved_page,file_name,embedding,similarities
0,A PROMPT ENGINEERING IN THE WILD\nLarge models...,LLMs_are_Human-Level_Prompt_Engineers.pdf,"[-0.023554025, 0.004547, 0.006089732, -0.00838...",0.885291
1,arXiv:2211.01910v1 [cs.LG] 3 Nov 2022\nLARGE L...,LLMs_are_Human-Level_Prompt_Engineers.pdf,"[-0.023766506, -0.004874184, 0.008511266, -0.0...",0.847408
2,AUTOPROMPT: Eliciting Knowledge from Language ...,AutoPrompt_Eliciting_Knowledge_From_LanguageMo...,"[-0.017496426, 0.0055379565, 0.0054711495, -0....",0.835019


In [77]:
prompt = f"""
    You are a research assistant and can help summarize and answer questions on research papers.
    Answer ONLY with the facts listed in the sources below. If there isn't enough information below, say you don't know. 
    Each source is a dictionary with file_name and information available in retrieved_page, always include the source name for each fact you use in the response. Use square brakets to reference the source, e.g. [info1.pdf]. Don't combine sources, list each source separately, e.g. [info1.pdf][info2.pdf].
    Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question. 
    The question and sources are delimited by triple backticks.
    Sourcers contain both text and tables. Tables in presented in HTML format. Parse the HTML step-by-step to extract the information you need.

    Question: ```{query}``` \n
    Sources: ```{top_results[['retrieved_page', 'file_name']].to_dict('records')}``` \n

    Answer:
"""

response = get_completion(prompt)
print(response)

Automated Prompt Engineering (APE) is a method proposed in the paper "Large Language Prompt Engineers Models are Human-Level" [LLMs_are_Human-Level_Prompt_Engineers.pdf] for automatic instruction generation and selection. APE treats the instruction as the "program," optimized by searching over a pool of instruction candidates proposed by a large language model (LLM) in order to maximize a chosen score function. The goal of APE is to reduce the human effort involved in creating and validating effective instructions. APE-engineered prompts can be applied to steer models toward truthfulness and/or informativeness, as well as to improve few-shot learning performance by simply prepending them to standard in-context learning prompts.


In [102]:

def query_search(query, count=10):
    results = search_client.search(search_text=query, top=count, include_total_count=True)
    page_chunks = []
    citations = []
    for result in results:
        page_chunks.append(result['page_text'])
        citations.append(result['file_name'])
                
    embed_df = pd.DataFrame({"retrieved_page":page_chunks, "file_name":citations}) #datframe with document chunks and citations
        
    #Create an embedding vector for each chunk that will capture the semantic meaning and overall topic of that chunk
    embed_df["embedding"] = embed_df["retrieved_page"].apply(lambda page_text : get_embedding(page_text, engine = embedding_model))

    query_embedding = get_embedding(query, engine=embedding_model)
    embed_df["similarities"] = embed_df["embedding"].apply(lambda page_embedding: cosine_similarity(page_embedding, query_embedding))

    top_results = (
        embed_df.sort_values("similarities", ascending=False)
        .reset_index(drop=True)
        .head(3)
    )
    
    prompt = f"""
        You are a research assistant and can help summarize and answer questions on research papers.
        Answer ONLY with the facts listed in the sources below. If there isn't enough information below, say you don't know. 
        Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question. 
        Each source is a dictionary with actual information available in retrieved_page key and source name is the file_name key. Always include the source name separately at end of the response in a new line. Use square brakets to reference the source name, e.g. [Research_Paper.pdf]. Don't combine sources, list each source separately, e.g. [ResarchPaper1.pdf][Research_Paper.pdf].
        The question and sources are delimited by triple backticks.
        Sourcers contain both text and tables. Tables in presented in HTML format. Parse the HTML step-by-step to extract the information you need.

        Question: ```{query}``` \n
        Sources: ```{top_results[['retrieved_page', 'file_name']].to_dict('records')}``` \n

        Answer:
    """
    
    response = get_completion(prompt)
    return response

In [103]:
answer = query_search("what is the conclusion from the paper - AUTOPROMPT: Eliciting Knowledge from Language Models with Automatically Generated Prompts?", 10)
print(answer)

The conclusion from the paper "AUTOPROMPT: Eliciting Knowledge from Language Models with Automatically Generated Prompts" is that automatically generated prompts are a viable parameter-free alternative to existing probing methods, and as pretrained LMs become more sophisticated and capable, potentially a replacement for finetuning. The paper shows that masked language models (MLMs) have an inherent capability to perform sentiment analysis and natural language inference without additional parameters or finetuning, sometimes achieving performance on par with recent state-of-the-art supervised models. The prompts generated by AUTOPROMPT elicit more accurate factual knowledge from MLMs than the manually created prompts on the LAMA benchmark, and MLMs can be used as relation extractors more effectively than supervised relation extraction models. [AutoPrompt_Eliciting_Knowledge_From_LanguageModels.pdf]


In [104]:
answer = query_search("What are the examples of Knowledge Generation methods?", 5)
print(answer)

Examples of Knowledge Generation methods include:
- Generating numerical facts about objects [Task NumerSense][Generated_Knowledge_Prompting_for_Commonsense_Reasoning.pdf]
- Generating knowledge about concepts in the input [Task Prompt CSQA][Generated_Knowledge_Prompting_for_Commonsense_Reasoning.pdf]
- Generating knowledge about the input [Task QASC][Generated_Knowledge_Prompting_for_Commonsense_Reasoning.pdf]
- Providing explanations or definitions for given inputs [Task Prompt CSQA2][Generated_Knowledge_Prompting_for_Commonsense_Reasoning.pdf]

[Task NumerSense][Generated_Knowledge_Prompting_for_Commonsense_Reasoning.pdf]
[Task Prompt CSQA][Generated_Knowledge_Prompting_for_Commonsense_Reasoning.pdf]
[Task QASC][Generated_Knowledge_Prompting_for_Commonsense_Reasoning.pdf]
[Task Prompt CSQA2][Generated_Knowledge_Prompting_for_Commonsense_Reasoning.pdf]


In [89]:
answer = query_search("List all the research papers that have info about automated prompt engineering.", 5)
print(answer)

There is only one research paper that mentions automated prompt engineering, and it is "LLMs are Human-Level Prompt Engineers" [LLMs_are_Human-Level_Prompt_Engineers.pdf].


In [95]:
answer = query_search("Expand CoT Prompting and explain what it is.", 10)
print(answer)

CoT Prompting stands for Chain of Thought Prompting. It is a type of prompting that enables length generalization to longer inference examples on two symbolic manipulation tasks and elicits reasoning in large language models (LLMs). It is an emergent ability of model scale and does not positively impact performance until used with a model of sufficient scale. [Chain-of-Thought_Prompting_Elicits_Reasoning_in_LLMs.pdf]


In [97]:
answer = query_search("I am looking for Natural Language Inference performance on the SICK-E test set and variants. Could you list the observations in a tabular format?", 10)
print(answer)

The Natural Language Inference (NLI) performance on the SICK-E test set and variants is presented in Table 2 of [AutoPrompt_Eliciting_Knowledge_From_LanguageModels.pdf]. The table shows the performance of various models on the standard, 3-way, and 2-way datasets of SICK-E. The models include Majority, BERT (finetuned), BERT (linear probing), RoBERTa (linear probing), BERT (AUTOPROMPT), and RoBERTa (AUTOPROMPT). The table shows that BERT (finetuned) has the highest performance on all three datasets, followed by RoBERTa (AUTOPROMPT) on the 3-way dataset and BERT (linear probing) on the 2-way dataset. The observations are as follows:

| Model                   | Standard | 3-way | 2-way |
|-------------------------|----------|-------|-------|
| Majority                | 56.7     | 33.3  | 50.0  |
| BERT (finetuned)        | 86.7     | 84.0  | 95.6  |
| BERT (linear probing)   | 68.0     | 49.5  | 91.9  |
| RoBERTa (linear probing)| 72.6     | 49.4  | 91.1  |
| BERT (AUTOPROMPT)       | 62

In [111]:
answer = query_search("what is BEIR? Show low resource tasks from BEIR in a table.", 10)
print(answer)

BEIR stands for "Benchmarking Indexing and Retrieval" and is a dataset for low-resource retrieval tasks. Table 2 in [Precise_Zero-Shot_Dense_Retrieval_without_Relevance_Labels.pdf] shows the low resource tasks from BEIR and their results.
