# iLykei Lecture Series   
# Generative AI App Client   

### Y. Balasanov, A. Kobyshev, M. Tselishchev, &copy; iLykei 2023

In [1]:
# Install necessary libs & compile proto-files
!protoc --python_out=./ *.proto

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from scipy.spatial.distance import cdist
import openai
from openai import OpenAI
import os
import json
import numpy as np
import time

### Load PDF

In [2]:
# Load document
loader = PyPDFLoader("./Generative_AI_App_Doc.pdf")
data = loader.load()

### Clean Document

In [3]:
# Split document into smaller texts
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=50,
    length_function=len, 
    add_start_index=True
    )
texts = text_splitter.split_documents(data)

### Get Embeddings

In [4]:
# Setup OpenAI API Key
with open("OPENAI_KEY.txt",'r') as f:
    openai_api_key = f.readline().strip()
openai.api_key = openai_api_key
os.environ['OPENAI_API_KEY'] = openai_api_key

In [5]:
# Initialize Embeddings Model
embeddings_model = OpenAIEmbeddings(disallowed_special=())

### Cache

In [6]:
# Cache Functions
def save_cache_to_file(cache, filename='embedding_cache.json'):
    with open(filename, 'w') as file:
        json.dump(cache, file)

def load_cache_from_file(filename='embedding_cache.json'):
    try:
        with open(filename, 'r') as file:
            return json.load(file)
    except FileNotFoundError:
        return {}

def get_embeddings(texts, embeddings_model, cache):
    embeddings = []
    for text in texts:
        text_hash = hash(text)
        if text_hash in cache:
            # Directly use the cached embedding
            embedding = cache[text_hash]
        else:
            # Compute embedding and add to cache
            embedding = embeddings_model.embed_documents([text])[0]
            # Since embedding is already a list, directly add it to the cache
            cache[text_hash] = embedding
        embeddings.append(embedding)
    return embeddings

In [7]:
# Load existing cache and get embeddings
embedding_cache = load_cache_from_file()
embeddings = get_embeddings([doc.page_content for doc in texts], embeddings_model, embedding_cache)
# save_cache_to_file(embedding_cache)

In [8]:
# Convert embeddings to list format if necessary
embeddings_as_lists = [embedding.tolist() if isinstance(embedding, np.ndarray) else embedding for embedding in embeddings]

# Ensure 'text_contents' and 'my_OpenAI_key' are defined in the global scope
text_contents = [doc.page_content for doc in texts]

Define the event handler responding to test questions.

In [9]:
from scipy.spatial.distance import cdist

def find_similar_documents(question_embedding, embeddings, k=3):
    similarities = cdist([question_embedding], embeddings, 'cosine')[0]
    most_similar_indices = similarities.argsort()[:k]
    return most_similar_indices

def create_qa_prompt(content):
    instructions = f"""
    Please generate a SUCCINCT answer based on the context you are given. 
    Please answer as if you are answering a test question. 
    Please answer the question as it is asked, if it requires technicalities, please use them.

    Here is the content of the document: {content}
    """
    return instructions

client = OpenAI()
def get_openai_qa_response(prompt):

    # Sleep so api doesn't hang
    time.sleep(1)

    try:
        # Get the response from OpenAI
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            # model="gpt-4-1106-preview",
            messages=[{"role": "system", "content": "Do EXACTLY as the instructions in the prompt say."},
                      {"role": "user", "content": prompt}],
            temperature=0,
            
        )

        return response.choices[0].message.content
    except Exception as e:
        print(f"Error in getting response: {e}")
        return None

In [10]:
import openai

def question_handler(question_id, question):
    global embeddings_model, embeddings_as_lists, text_contents

    print(f'{question_id}) Q: {question}')

    # Generate embedding for the enhanced question
    question_embedding = embeddings_model.embed_documents([question])[0]

    # Find indices of k most similar documents
    similar_docs_indices = find_similar_documents(question_embedding, embeddings_as_lists, k=1)

    # Retrieve the texts of the similar documents
    context_texts = [text_contents[idx] for idx in similar_docs_indices]

    # Combine context texts into a single string
    combined_context = "\n".join(context_texts)

    # Create a QA prompt based on the combined context
    qa_prompt = create_qa_prompt(combined_context)

    # Get the Q&A response from OpenAI
    qa_response = get_openai_qa_response(qa_prompt)

    # Print and return the response
    print(f'A: {qa_response}\n')
    return qa_response

Instead of the line `answer = "I don't know"` insert the code denerating the answer by your app.

In [11]:
from Generative_AI_App_connection import connect

with open("my_credentials.txt",'r') as f:
    lines = f.readlines()
login, password = map(str.strip, lines)

# server options
host = 'datastream.ilykei.com' # do not change
port = 30095   # do not change
stream_name = 'Generative_AI_App'   # do not change
catch_handler_errors = False  # we recommend using TRUE during the test and FALSE during workshop

# make connection with your handler
result = connect(host, port, login, password, stream_name,
                 question_handler, catch_handler_errors)

Connecting to datastream.ilykei.com:30095
Sending login message
Logged in successfully as  samuelswain2023@u.northwestern.edu
0) Q: What is the specific cellular pathway targeted by PancreXcel in combatting pancreatic cancer?
[0.08825718 0.09524833 0.09886445 0.12005709 0.12662714 0.10782266
 0.10922715 0.15196166 0.22523841 0.12412656 0.13257508 0.1640101
 0.17311522 0.17965879 0.22264847 0.15054686 0.15366384 0.14284379
 0.14638702 0.1721993  0.17312325 0.18662241 0.18476963 0.17832097
 0.12003877 0.17644506 0.15382626 0.27926297 0.26621417 0.14933377
 0.10905529 0.14747703 0.14996883 0.17984852 0.14755263]
A: The fusion protein in PancreXcel is designed to target the TGF-beta receptor, which is overproduced in pancreatic cancer. This allows PancreXcel to combat the disease by inhibiting the tumor's growth and evading the immune system.

1) Q: How does PancreXcel disrupt the TGF-beta signaling pathway in pancreatic cancer?
[0.10215161 0.08469624 0.11176082 0.14822202 0.15778702 0.125

In [None]:
# check results
result

{'problems': [], 'n_signals': 10, 'penalty': 28, 'score': 72}