In [1]:
import os, json, requests, sys, re
import requests
from pprint import pprint
import pandas as pd
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient 
from azure.search.documents import SearchClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SemanticSettings
)


import openai
import numpy as np
from openai.embeddings_utils import get_embedding, cosine_similarity

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
openai.api_key = os.environ['OPENAI_API_KEY']
openai.api_base = os.environ['OPENAI_API_BASE']
openai.api_type = os.environ['OPENAI_API_TYPE']
openai.api_version = os.environ['OPENAI_API_VERSION']

text_model = os.environ['TEXT_DAVINCI_NAME']
chat_model = os.environ['CHAT_MODEL_NAME']
embedding_model=os.environ['EMBEDDING_MODEL_NAME']

In [14]:
# -- raw data
RAW_DATA_FOLDER= './data/unstructured/raw'
# -- extracted json file 
EXTRACTED_DATA_FOLDER = './data/unstructured/extracted'

In [15]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

In [16]:
def extract_local_single_file(file_name: str):
    not_completed = True
    while not_completed:
        with open(file_name, "rb") as f:
            poller = document_analysis_client.begin_analyze_document(
                "prebuilt-layout", document=f
            )
            not_completed=False
    result = poller.result()
    return get_page_content(file_name, result)

def extract_files( folder_name: str, destination_folder_name: str):
    os.makedirs(destination_folder_name, exist_ok=True)
    for file in os.listdir(folder_name):
        if file[-3:].upper() in ['PDF','JPG','PNG']:
            print('Processing file:', file, end='')
        
            page_content = extract_local_single_file(os.path.join(folder_name, file))
            output_file = os.path.join(destination_folder_name, file[:-3] +'json')
            print(f'  write output to {output_file}')
            with open(output_file, "w") as f:
                f.write(json.dumps(page_content))


def get_page_content(file_name:str, result):
    page_content = []
    for page in result.pages:
        all_lines_content = []
        for line_idx, line in enumerate(page.lines):
            all_lines_content.append(' '.join([word.content for word in line.get_words()]))
        page_content.append({'page_number':page.page_number, 
                                'page_content':' '.join(all_lines_content)})
    return {'filename':file_name, 'content':page_content}





In [17]:
extract_files(RAW_DATA_FOLDER, EXTRACTED_DATA_FOLDER)

Processing file: AutoPrompt_Eliciting_Knowledge_From_LanguageModels.pdf  write output to ./data/unstructured/extracted\AutoPrompt_Eliciting_Knowledge_From_LanguageModels.json
Processing file: Chain-of-Thought_Prompting_Elicits_Reasoning_in_LLMs.pdf  write output to ./data/unstructured/extracted\Chain-of-Thought_Prompting_Elicits_Reasoning_in_LLMs.json
Processing file: Generated_Knowledge_Prompting_for_Commonsense_Reasoning.pdf  write output to ./data/unstructured/extracted\Generated_Knowledge_Prompting_for_Commonsense_Reasoning.json
Processing file: LLMs_are_Human-Level_Prompt_Engineers.pdf  write output to ./data/unstructured/extracted\LLMs_are_Human-Level_Prompt_Engineers.json
Processing file: Power_of_Scale_for_Parameter-Efficient_Prompt_Tuning.pdf  write output to ./data/unstructured/extracted\Power_of_Scale_for_Parameter-Efficient_Prompt_Tuning.json
Processing file: Precise_Zero-Shot_Dense_Retrieval_without_Relevance_Labels.pdf  write output to ./data/unstructured/extracted\Precis

In [22]:
documents=[]
for file in os.listdir(EXTRACTED_DATA_FOLDER):
    with open(os.path.join(EXTRACTED_DATA_FOLDER, file)) as f:
        page_content= json.loads(f.read())
    documents.extend([{'document_id':page_content['filename'].split('\\')[-1].split('.')[0] + '-' + str(page['page_number']),\
                        'document_name':page_content['filename'].split('\\')[-1],\
                        'file_path':page_content['filename'],\
                        'page_number': page['page_number'],\
                        'page_text': page['page_content'] } \
                        for page in page_content['content'] ])

In [23]:
#Example of a single page of research paper file that will be indexed in Azure Cognitive Search
documents[0]

{'document_id': 'AutoPrompt_Eliciting_Knowledge_From_LanguageModels-1',
 'document_name': 'AutoPrompt_Eliciting_Knowledge_From_LanguageModels.pdf',
 'file_path': './data/unstructured/raw\\AutoPrompt_Eliciting_Knowledge_From_LanguageModels.pdf',
 'page_number': 1,
 'page_text': 'AUTOPROMPT: Eliciting Knowledge from Language Models with Automatically Generated Prompts Taylor Shin*♦ Yasaman Razeghi∗♦ Eric Wallace♠ ♦University of California, Irvine Robert L. Logan IV∗♦ Sameer Singh♦ ♠University of California, Berkeley {tshin1, yrazeghi, rlogan, sameer}@uci.edu ericwallace@berkeley.edu arXiv:2010.15980v2 [cs.CL] 7 Nov 2020 Abstract The remarkable success of pretrained lan- guage models has motivated the study of what kinds of knowledge these models learn dur- ing pretraining. Reformulating tasks as fill- in-the-blanks problems (e.g., cloze tests) is a natural approach for gauging such knowledge, however, its usage is limited by the manual effort and guesswork required to write suit- able pr

In [24]:
# Create an SDK client
service_endpoint = os.getenv("AZURE_COGNITIVE_SEARCH_ENDPOINT")   
key = os.getenv("AZURE_COGNITIVE_SEARCH_KEY")
credential = AzureKeyCredential(key)

index_name = "research-paper-index"

index_client = SearchIndexClient(
    endpoint=service_endpoint, credential=credential)
index_client

<azure.search.documents.indexes._search_index_client.SearchIndexClient at 0x25edce808b0>

In [25]:
fields = [
    SimpleField(name="document_id", type=SearchFieldDataType.String, key=True),
    SimpleField(name="page_number", type=SearchFieldDataType.Int64),
    SimpleField(name="file_path", type=SearchFieldDataType.String),
    SearchableField(name="document_name", type=SearchFieldDataType.String,
                searchable=True, retrievable=True),
    SearchableField(name="page_text", type=SearchFieldDataType.String,
                filterable=True, searchable=True, retrievable=True),
]

semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=PrioritizedFields(
        title_field=SemanticField(field_name="document_id"),
        prioritized_keywords_fields=[SemanticField(field_name="document_name")],
        prioritized_content_fields=[SemanticField(field_name="page_text")]
    )
)

# Create the semantic settings with the configuration
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields, semantic_settings=semantic_settings)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

 research-paper-index created


In [26]:
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
result = search_client.upload_documents(documents)  
print(f"Uploaded {len(documents)} documents") 

Uploaded 179 documents


In [28]:
len(result)

179

In [36]:
query = "What is automated prompt engineering?"
count = 10
results = search_client.search(search_text=query, top=count, include_total_count=True)
page_chunks = []
for result in results:
    page_chunks.append(result['page_text'])
    

In [39]:
embed_df = pd.DataFrame(page_chunks, columns = ["chunks"]) #datframe with document chunks
embed_df

Unnamed: 0,chunks
0,A PROMPT ENGINEERING IN THE WILD Large models ...
1,arXiv:2211.01910v1 [cs.LG] 3 Nov 2022 LARGE LA...
2,AUTOPROMPT: Eliciting Knowledge from Language ...
3,Question Tracy used a piece of wire 4 feet lon...
4,Translation en-es Instruction Only In-context ...
5,Original Input Cinp a real joy. AUTOPROMPT Ipr...
6,p(y|xprompt) = 3 p([MASK] = w|xprompt) However...
7,Table 24: Few-shot exemplars for full chain of...
8,Task Prompt CSQA2 Generate some knowledge abou...
9,Task NumerSense Prompt Generate some numerical...


In [63]:
from openai.error import RateLimitError
from time import sleep


def get_embedding(text: str, engine: str = "text-embedding-ada-002"):
    count=0
    while True:
        try:
            embedding = openai.Embedding().create(input=[text], engine=engine)["data"][0]["embedding"]
            break;
        except RateLimitError:
            count+=1
            #print(f'RateLimitError Count: {count}')
            sleep(2)            
    return np.array(embedding).astype(np.float32)

def get_completion(prompt, model="gpt-35-turbo"): # Andrew mentioned that the prompt/ completion paradigm is preferable for this class
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        engine=model,
        messages=messages,
        temperature=0, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message["content"]


In [44]:
#Create an embedding vector for each chunk that will capture the semantic meaning and overall topic of that chunk
embed_df['embedding'] = embed_df["chunks"].apply(lambda page_text : get_embedding(page_text, engine = embedding_model))

RateLimitError Count: 1
RateLimitError Count: 2
RateLimitError Count: 3
RateLimitError Count: 4
RateLimitError Count: 5
RateLimitError Count: 1
RateLimitError Count: 2
RateLimitError Count: 3
RateLimitError Count: 4
RateLimitError Count: 5
RateLimitError Count: 1
RateLimitError Count: 2
RateLimitError Count: 3
RateLimitError Count: 4
RateLimitError Count: 5
RateLimitError Count: 1
RateLimitError Count: 2
RateLimitError Count: 3
RateLimitError Count: 4
RateLimitError Count: 5


In [46]:
embed_df

Unnamed: 0,chunks,embedding
0,A PROMPT ENGINEERING IN THE WILD Large models ...,"[-0.024585616, -0.00022426769, 0.00631925, -0...."
1,arXiv:2211.01910v1 [cs.LG] 3 Nov 2022 LARGE LA...,"[-0.021359596, -0.007799931, -0.00046711107, -..."
2,AUTOPROMPT: Eliciting Knowledge from Language ...,"[-0.01975671, 0.0031453124, 0.0019787818, -0.0..."
3,Question Tracy used a piece of wire 4 feet lon...,"[0.007719166, 0.007380606, 0.0143549405, -0.03..."
4,Translation en-es Instruction Only In-context ...,"[-0.028066656, 0.0065712705, 0.016445307, -0.0..."
5,Original Input Cinp a real joy. AUTOPROMPT Ipr...,"[-0.031109842, -0.018427676, 0.0044843014, -0...."
6,p(y|xprompt) = 3 p([MASK] = w|xprompt) However...,"[-0.03274989, -0.011971078, -0.0063749263, -0...."
7,Table 24: Few-shot exemplars for full chain of...,"[0.013763685, 0.012710237, 0.03043296, -0.0045..."
8,Task Prompt CSQA2 Generate some knowledge abou...,"[0.017677374, 0.031847112, 0.037832364, -0.007..."
9,Task NumerSense Prompt Generate some numerical...,"[-0.0027567996, 0.012855793, 0.03290082, -0.00..."


In [50]:
query_embedding = get_embedding(query, engine=embedding_model)
embed_df["similarities"] = embed_df['embedding'].apply(lambda page_embedding: cosine_similarity(page_embedding, query_embedding))

top_results = (
    embed_df.sort_values("similarities", ascending=False)
    .reset_index(drop=True)
    .head(3)
)
top_results

Unnamed: 0,chunks,embedding,similarities
0,A PROMPT ENGINEERING IN THE WILD Large models ...,"[-0.024585616, -0.00022426769, 0.00631925, -0....",0.89053
1,arXiv:2211.01910v1 [cs.LG] 3 Nov 2022 LARGE LA...,"[-0.021359596, -0.007799931, -0.00046711107, -...",0.866601
2,AUTOPROMPT: Eliciting Knowledge from Language ...,"[-0.01975671, 0.0031453124, 0.0019787818, -0.0...",0.842


In [56]:
prompt = f"""
Provided below are user query and list of extracted pages from research papers separated by triple backticks.
Your task is to extract key pieces of information from that list based on the user query and phrase that as a comprehensive answer. 

User Query: ```{query}```
List of Extracted Pages: ```{top_results['chunks'].to_list()}```

Answer:
"""

print(prompt)


Provided below are user query and list of extracted pages from research papers separated by triple backticks.
Your task is to extract key pieces of information from that list based on the user query and phrase that as answer. 

User Query: ```What is automated prompt engineering?```
List of Extracted Pages: ```['A PROMPT ENGINEERING IN THE WILD Large models with natural language interfaces, including models for text generation and image synthesis, have seen an increasing amount of public usage in recent years. As finding the right prompt can be difficult for humans, a number of guides on prompt engineering as well as tools to aid in prompt discovery have been developed. Among others, see, for example: • https://blog.andrewcantino.com/blog/2021/04/21/prompt-engineering-tips-and-tricks/ • https://techcrunch.com/2022/07/29/a-startup-is-charging-1-99-for-strings-of-text-to-feed-to-dall-e-2/ • https://news.ycombinator.com/item?id=32943224 • https://promptomania.com/stable-diffusion-prompt-

In [57]:
response = get_completion(prompt)
print(response)

Automated prompt engineering is a method for automatically generating and selecting natural language instructions to steer large language models (LLMs) towards desired behaviors. It involves searching over a pool of instruction candidates proposed by an LLM in order to maximize a chosen score function. A number of guides on prompt engineering as well as tools to aid in prompt discovery have been developed. Some examples include: https://blog.andrewcantino.com/blog/2021/04/21/prompt-engineering-tips-and-tricks/, https://techcrunch.com/2022/07/29/a-startup-is-charging-1-99-for-strings-of-text-to-feed-to-dall-e-2/, https://news.ycombinator.com/item?id=32943224, https://promptomania.com/stable-diffusion-prompt-builder/, and https://huggingface.co/spaces/Gustavosta/MagicPrompt-Stable-Diffusion.


In [60]:

def query_search(query, count=10):
    results = search_client.search(search_text=query, top=count, include_total_count=True)
    page_chunks = []
    for result in results:
        page_chunks.append(result['page_text'])
        
    #Create an embedding vector for each chunk that will capture the semantic meaning and overall topic of that chunk
    embed_df['embedding'] = embed_df["chunks"].apply(lambda page_text : get_embedding(page_text, engine = embedding_model))

    query_embedding = get_embedding(query, engine=embedding_model)
    embed_df["similarities"] = embed_df['embedding'].apply(lambda page_embedding: cosine_similarity(page_embedding, query_embedding))

    top_results = (
        embed_df.sort_values("similarities", ascending=False)
        .reset_index(drop=True)
        .head(3)
    )
    
    prompt = f"""
    Provided below are user query and list of extracted pages from research papers separated by triple backticks.
    Your task is to extract key pieces of information from that list based on the user query and phrase that as a comprehensive answer. 

    User Query: ```{query}```
    List of Extracted Pages: ```{top_results['chunks'].to_list()}```

    Answer:
    """
    
    response = get_completion(prompt)
    return response

In [65]:
answer = query_search("How does automated prompt engineering work?", 5)
print(answer)

Automated prompt engineering involves using natural language instructions to steer large language models (LLMs) towards desired behaviors. This is achieved through the use of algorithms such as Automatic Prompt Engineer (APE) which generates and selects instructions automatically by searching over a pool of instruction candidates proposed by an LLM in order to maximize a chosen score function. The quality of the selected instruction is evaluated by evaluating the zero-shot performance of another LLM following the selected instruction. A number of guides on prompt engineering as well as tools to aid in prompt discovery have been developed to assist in finding the right prompt. These include resources such as https://blog.andrewcantino.com/blog/2021/04/21/prompt-engineering-tips-and-tricks/, https://techcrunch.com/2022/07/29/a-startup-is-charging-1-99-for-strings-of-text-to-feed-to-dall-e-2/, https://news.ycombinator.com/item?id=32943224, https://promptomania.com/stable-diffusion-prompt-

In [67]:
answer = query_search("what is prompt tuning?", 10)
print(answer)

Prompt tuning refers to the process of finding the right prompt for natural language interfaces, including models for text generation and image synthesis. It can be difficult for humans to find the right prompt, so a number of guides on prompt engineering as well as tools to aid in prompt discovery have been developed. AUTOPROMPT is an automated method for generating prompts for any task, based on a gradient-guided search. It creates a prompt by combining the original task inputs with a collection of trigger tokens according to a template. The same set of trigger tokens is used for all inputs, and is learned using a variant of the gradient-based search strategy. The LM predictions for the prompt are converted to class probabilities by marginalizing over a set of associated label tokens, which can either be learned or specified ahead of time, enabling the LM to be evaluated the same as one would any other classifier.
