In [1]:
# Test that your OpenAI API key is correctly set as an environment variable
# Note. if you run this notebook locally, you will need to reload your terminal and the notebook for the env variables to be live.
import os, json
import openai
import numpy as np

from dotenv import load_dotenv
load_dotenv()

openai.api_key = os.environ['OPENAI_API_KEY']
openai.api_base = os.environ['OPENAI_API_BASE']
openai.api_type = os.environ['OPENAI_API_TYPE']
openai.api_version = os.environ['OPENAI_API_VERSION']

text_model = os.environ['TEXT_DAVINCI_NAME']
chat_model = os.environ['CHAT_MODEL_NAME']
embedding_model=os.environ['EMBEDDING_MODEL_NAME']

In [2]:
# -- raw data
RAW_DATA_FOLDER= '../data/unstructured/raw'
# -- extracted json file 
EXTRACTED_DATA_FOLDER = '../data/unstructured/extracted'

In [3]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

In [4]:
def extract_local_single_file(file_name: str):
    not_completed = True
    while not_completed:
        with open(file_name, "rb") as f:
            poller = document_analysis_client.begin_analyze_document(
                "prebuilt-layout", document=f
            )
            not_completed=False
    result = poller.result()
    return get_page_content(file_name, result)

def extract_files( folder_name: str, destination_folder_name: str):
    os.makedirs(destination_folder_name, exist_ok=True)
    for file in os.listdir(folder_name):
        if file[-3:].upper() in ['PDF','JPG','PNG']:
            print('Processing file:', file, end='')
        
            page_content = extract_local_single_file(os.path.join(folder_name, file))
            output_file = os.path.join(destination_folder_name, file[:-3] +'json')
            print(f'  write output to {output_file}')
            with open(output_file, "w") as f:
                f.write(json.dumps(page_content))


def get_page_content(file_name:str, result):
    page_content = []
    for page in result.pages:
        all_lines_content = []
        for line_idx, line in enumerate(page.lines):
            all_lines_content.append(' '.join([word.content for word in line.get_words()]))
        page_content.append({'page_number':page.page_number, 
                                'page_content':' '.join(all_lines_content)})
    return {'filename':file_name, 'content':page_content}


In [5]:
extract_files(RAW_DATA_FOLDER, EXTRACTED_DATA_FOLDER)

Processing file: 2010.15980.pdf  write output to ../data/unstructured/extracted\2010.15980.json
Processing file: 2101.00190.pdf  write output to ../data/unstructured/extracted\2101.00190.json
Processing file: 2104.08691.pdf  write output to ../data/unstructured/extracted\2104.08691.json
Processing file: 2110.08387.pdf  write output to ../data/unstructured/extracted\2110.08387.json
Processing file: 2201.11903.pdf  write output to ../data/unstructured/extracted\2201.11903.json
Processing file: 2203.11171.pdf  write output to ../data/unstructured/extracted\2203.11171.json
Processing file: 2211.01910.pdf  write output to ../data/unstructured/extracted\2211.01910.json
Processing file: 2212.10496.pdf  write output to ../data/unstructured/extracted\2212.10496.json


In [3]:
from redis import Redis
from redis.commands.search.indexDefinition import (
    IndexDefinition,
    IndexType
)
from redis.commands.search.query import Query
from redis.commands.search.field import (
    TextField,
    TagField,
    VectorField
)

host=os.environ['REDIS_HOST']
port=os.environ['REDIS_PORT']
username=os.environ['REDIS_USERNAME']
password=os.environ['REDIS_PASSWORD']
ssl=True if os.getenv('REDIS_SSL').upper() == 'TRUE' else False
if host=='localhost':
    redis_client = Redis(host = host, port = port)
else:
    redis_client = Redis(host = host, port = port, username=username, password=password, ssl=ssl)
    
VECTOR_FIELD_NAME = 'vector'
VECTOR_NUMBER = 10000
VECTOR_DIM = 4096
DISTANCE_METRIC = 'COSINE'
INDEX_NAME = "research-papers-index"

redis_client.ping()


True

In [4]:
from openai.error import RateLimitError
from time import sleep


def get_embedding(text: str, engine: str, tobytes=True):
    count=0
    while True:
        try:
            embedding = openai.Embedding().create(input=[text], engine=engine)["data"][0]["embedding"]
            break;
        except RateLimitError:
            count+=1
            print(f'RateLimitError Count: {count}')
            sleep(2)
    if tobytes:
        return np.array(embedding).astype(np.float32).tobytes()
    else:
        return embedding

In [5]:
# display key names for each page to index that page content
for file in os.listdir(EXTRACTED_DATA_FOLDER):
    with open(os.path.join(EXTRACTED_DATA_FOLDER, file)) as f:
        page_content= json.loads(f.read())
    for page in page_content['content']:
        print(page_content['filename'].split('/')[-1] + '-' + str(page['page_number']))
        

raw\2010.15980.pdf-1
raw\2010.15980.pdf-2
raw\2010.15980.pdf-3
raw\2010.15980.pdf-4
raw\2010.15980.pdf-5
raw\2010.15980.pdf-6
raw\2010.15980.pdf-7
raw\2010.15980.pdf-8
raw\2010.15980.pdf-9
raw\2010.15980.pdf-10
raw\2010.15980.pdf-11
raw\2010.15980.pdf-12
raw\2010.15980.pdf-13
raw\2010.15980.pdf-14
raw\2010.15980.pdf-15
raw\2101.00190.pdf-1
raw\2101.00190.pdf-2
raw\2101.00190.pdf-3
raw\2101.00190.pdf-4
raw\2101.00190.pdf-5
raw\2101.00190.pdf-6
raw\2101.00190.pdf-7
raw\2101.00190.pdf-8
raw\2101.00190.pdf-9
raw\2101.00190.pdf-10
raw\2101.00190.pdf-11
raw\2101.00190.pdf-12
raw\2101.00190.pdf-13
raw\2101.00190.pdf-14
raw\2101.00190.pdf-15
raw\2104.08691.pdf-1
raw\2104.08691.pdf-2
raw\2104.08691.pdf-3
raw\2104.08691.pdf-4
raw\2104.08691.pdf-5
raw\2104.08691.pdf-6
raw\2104.08691.pdf-7
raw\2104.08691.pdf-8
raw\2104.08691.pdf-9
raw\2104.08691.pdf-10
raw\2104.08691.pdf-11
raw\2104.08691.pdf-12
raw\2104.08691.pdf-13
raw\2104.08691.pdf-14
raw\2104.08691.pdf-15
raw\2110.08387.pdf-1
raw\2110.08387.p

In [6]:
# load documents into redis
documents=[]
use_redis = True
verbose = True
   
tobytes = True if use_redis else False

for file in os.listdir(EXTRACTED_DATA_FOLDER):
    if verbose:
        print('\n\nLoading file:', file)
    with open(os.path.join(EXTRACTED_DATA_FOLDER, file)) as f:
        page_content= json.loads(f.read())
    documents.extend([{'document_key':page_content['filename'].split('/')[-1] + '-' + str(page['page_number']),\
                        'document_name':page_content['filename'].split('/')[-1],\
                        'file_path':page_content['filename'],\
                        'page_number': page['page_number'],\
                        'page_text': page['page_content'],\
                        VECTOR_FIELD_NAME: get_embedding(page['page_content'], \
                                                                    embedding_model, \
                                                                    tobytes=tobytes) } \
                        for page in page_content['content'] ])



Loading file: 2010.15980.json
RateLimitError Count: 1
RateLimitError Count: 2
RateLimitError Count: 3
RateLimitError Count: 4
RateLimitError Count: 5
RateLimitError Count: 1
RateLimitError Count: 2
RateLimitError Count: 3
RateLimitError Count: 4
RateLimitError Count: 1
RateLimitError Count: 1
RateLimitError Count: 2
RateLimitError Count: 3
RateLimitError Count: 4
RateLimitError Count: 1
RateLimitError Count: 1
RateLimitError Count: 2
RateLimitError Count: 3
RateLimitError Count: 4
RateLimitError Count: 1
RateLimitError Count: 1
RateLimitError Count: 2
RateLimitError Count: 3
RateLimitError Count: 4
RateLimitError Count: 1
RateLimitError Count: 1
RateLimitError Count: 2
RateLimitError Count: 3
RateLimitError Count: 4
RateLimitError Count: 1
RateLimitError Count: 1
RateLimitError Count: 2
RateLimitError Count: 3
RateLimitError Count: 4


Loading file: 2101.00190.json
RateLimitError Count: 1
RateLimitError Count: 1
RateLimitError Count: 2
RateLimitError Count: 3
RateLimitError Count: 4


In [7]:
# Json example of each page of each research paper file 
documents[0]

{'document_key': 'raw\\2010.15980.pdf-1',
 'document_name': 'raw\\2010.15980.pdf',
 'file_path': '../data/unstructured/raw\\2010.15980.pdf',
 'page_number': 1,
 'page_text': 'AUTOPROMPT: Eliciting Knowledge from Language Models with Automatically Generated Prompts Taylor Shin*♦ Yasaman Razeghi∗♦ Eric Wallace♠ ♦University of California, Irvine Robert L. Logan IV∗♦ Sameer Singh♦ ♠University of California, Berkeley {tshin1, yrazeghi, rlogan, sameer}@uci.edu ericwallace@berkeley.edu arXiv:2010.15980v2 [cs.CL] 7 Nov 2020 Abstract The remarkable success of pretrained lan- guage models has motivated the study of what kinds of knowledge these models learn dur- ing pretraining. Reformulating tasks as fill- in-the-blanks problems (e.g., cloze tests) is a natural approach for gauging such knowledge, however, its usage is limited by the manual effort and guesswork required to write suit- able prompts. To address this, we develop AUTOPROMPT, an automated method to cre- ate prompts for a diverse set

## Flat Index

In [32]:
#redis_client.flushall()

# later we will create HNSW index
flat_index_name = INDEX_NAME + '_flat'

# Check if index exists
try:
    redis_client.ft(flat_index_name).info()
    print(f"Index: {flat_index_name} already exists")
except:
    # Create RediSearch Index
    redis_client.ft(flat_index_name).create_index([
        VectorField(VECTOR_FIELD_NAME,
            "FLAT", {
                "TYPE": "FLOAT32",
                "DIM": VECTOR_DIM, 
                "DISTANCE_METRIC": DISTANCE_METRIC,
                "INITIAL_CAP": VECTOR_NUMBER,
                "BLOCK_SIZE":VECTOR_NUMBER,
            }
        ),
        TagField("document_name"),
        TagField("page_number"),
        TextField("page_text")
    ])


In [36]:
for page in documents:
    #hash key
    key="document:"+page['document_key']
    # HSET
    redis_client.hset(key,mapping=page)
    
print(f"Loaded {redis_client.info()['db0']['keys']} documents in Redis search index with name: {INDEX_NAME}")
        

        


Loaded 179 documents in Redis search index with name: research-papers-index


In [33]:
# loading vectors (of each page) as key value pairs

pipeline = redis_client.pipeline(transaction=False)
for page in documents:
    #hash key
    key="document:"+page['document_key']
    # HSET
    pipeline.hset(key,mapping=page)
    
pipeline.execute()
print(f"Loaded {redis_client.info()['db0']['keys']} documents in Redis search index with name: {flat_index_name}")

Loaded 179 documents in Redis search index with name: research-papers-index_flat


In [37]:
redis_client.ft(flat_index_name).dropindex()

b'OK'

In [35]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.llms import AzureOpenAI
from langchain.chat_models import AzureChatOpenAI
from langchain.docstore.document import Document

query_text = "what is automated prompt engineering?"
topK = 2

query_vector = get_embedding(query_text, engine=embedding_model, tobytes=True)

base_query = f"*=>[KNN {topK} @{VECTOR_FIELD_NAME} $vec_param AS vector_score]"
query = (
    Query(base_query)
        .sort_by("vector_score")
        .paging(0, topK)
        .return_fields('vector_score','document_name','page_number','page_text')
        .dialect(2)
)
params_dict = {"vec_param": query_vector}

#Execute the query
query_results = redis_client.ft(flat_index_name).search(query, query_params = params_dict)
print(query_results)

for i, article in enumerate(query_results.docs):
    score = 1 - float(article.vector_score)
    print(f"{i}. {article.title} (Score: {round(score ,3) })")
    print(f"\t{article.content[:300]}...\n")



Result{0 total, docs: []}


In [65]:
def convert_redis_query_result_to_document(query_results):
    docs = []
    for r in query_results:
        source = r.document_name + ":"+str(r.page_number)
        docs.append(Document(page_content=r.page_text,lookup_str="", metadata={"source":source}))
    return docs  

def query(query_text:str, index_name, topK=5):
    query_vector = get_embedding(query_text, engine=embedding_model)
    #prepare the query
    q = Query(f'*=>[KNN {topK} @{VECTOR_FIELD_NAME} $vec_param AS vector_score]').\
                sort_by('vector_score').\
                paging(0,topK).\
                return_fields('vector_score','document_name','page_number','page_text').\
                dialect(2)
    params_dict = {"vec_param": query_vector}

    #Execute the query
    query_results = redis_client.ft(index_name).search(q, query_params = params_dict)
    print(query_results)
    
    # Convert results to Document type in langchain
    docs = convert_redis_query_result_to_document(query_results.docs)
    print(docs)
    
    text_llm = AzureOpenAI(temperature=0.0, deployment_name=text_model)
    
    chat_llm = AzureChatOpenAI(temperature=0.0,
                          openai_api_base=openai.api_base,
                          openai_api_version=openai.api_version,
                          deployment_name=chat_model,
                          openai_api_key=openai.api_key,
                          openai_api_type = openai.api_type)

    # Call qna_source_chain
    chain = load_qa_with_sources_chain(text_llm,
                                       chain_type="map_rerank",
                                       metadata_keys=['source'], 
                                       return_intermediate_steps=True)
    results = chain({"input_documents": docs, "question": query_text}, return_only_outputs=False)
    #Print similar products found
    # for r in results.docs:
    #     print("Score: ", r.vector_score, " "* 5, "Document Name: ", r.document_name, " "*5, "Page Number: ", r.page_number)
    #     print("  Text: ", r.page_text)
    return results

In [66]:
query("what is automated prompt engineering?", flat_index_name, topK=2)

Result{0 total, docs: []}
[]


IndexError: list index out of range