## ALEPH
a PoC for conversational research environment with ADA and GPT3.5

Sources:
- How_to_format_inputs_to_ChatGPT_models: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb
- Question_answering_using_embeddings: https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb

In [90]:
import pandas as pd
import numpy as np
import openai, pickle, tiktoken, ast

from pathlib import Path
from tqdm.notebook import tqdm

import re, sqlite3

from pdfminer.high_level import extract_text

from collections import defaultdict

from datetime import date, datetime

import pprint

import utils as utl
import params as prm

In [91]:
#Use your own key
# read secret key from secret file 
with open('secret.txt', 'r') as f:
    openai.api_key = f.read()

In [92]:
# Returns the current local date
today = date.today()
print("Today date is: ", today)

Today date is:  2023-03-16


In [93]:
session_name = 'good_incontext_examples_for_gpt3'
session_date = today

In [94]:
# Connect to user's database
conn = utl.create_connection(prm.DB_PTH) # Create connection to the database

utl.create_table(conn, prm.SESSION_TABLE_SQL) # Create table if it does not exist
# Create interaction table for the session
utl.create_table(conn, f"CREATE TABLE IF NOT EXISTS interaction_{session_name} (session_name, interaction_type, text, embedding, num_tokens_oai, time_signature)")


tables = utl.parse_tables(conn) # Grab tables to check if the table was created
conn.close()

In [96]:
# DB INTERACTION
# Write session data to the database
conn = utl.create_connection(prm.DB_PTH) # Create connection to the database
utl.insert_session(conn, session_name, session_date) # Insert session data  
conn.close()

Session: "good_incontext_examples_for_gpt3" inserted into the database


## 1. Prepare document for processing

In [97]:
# Logic for chapter capture
# digit_word_rgx = r'^\d+(\.\d+)*\s+[A-Z].*$|Abstract'
digit_word_rgx = r'^\d+(\.\d+)*\s[a-zA-Z]+.*$|Abstract'

In [98]:
# Load context document and grab chapter names
doc_name = 'What Makes Good InContext Examples for GPT3.pdf'

pdf_text = extract_text(prm.IN_PTH / doc_name)

# TODO: user should decide which split logic to use: random split (text be be cut into equal parts containing more-less the same number of tokens) or split by chapters (chapter inference is not always correct as pdfs are not always formated nicely). 
# TODO: Implement interface for split choice + equal chunking function
chapters = utl.grab_chapters(pdf_text, matching_logic=2)

In [99]:
# See what chapters got captured from the pdf
chapters

['Abstract',
 '3 keeps generating tokens until there is a special char-',
 '2 Method',
 '2.1 GPT-3 for In-Context Learning',
 '2.2 The Impact of In-Context Examples',
 '2.3 kNN-augmented In-Context Example',
 '3 Experimental Setup',
 '3.1 Sentence Embeddings for Retrieval',
 '3 on open-domain QA tasks. The EM score is',
 '3.2 Baseline Methods',
 '4 Experimental Results',
 '4.1 Sentiment Analysis',
 '2 dataset, the accuracy of kNNsst-2 is 92.46, which',
 '4.2 Table-to-text Generation',
 '4.3 Questing Answering',
 '64 nearest neighbors (10 for TriviaQA) to deter-',
 '3 to misinterpret the question as asking for a spe-',
 '64 examples.',
 '5 Analysis and Ablation Study',
 '5.1 Number of In-context Examples',
 '5.2 Size of Training Set for Retrieval',
 '3 to better answer the questions.',
 '5.3 Order of In-context Examples',
 '6 Related Work',
 '7 Conclusion']

In [100]:
# TODO: interface for selecting chapters to be used for text fragmentation. 
chapters = [
    'Abstract',
    '1 Introduction',
    '2 Method',
    '2.1 GPT-3 for In-Context Learning',
    '2.2 The Impact of In-Context Examples',
    '2.3 kNN-augmented In-Context Example',
    '3 Experimental Setup',
    '3.1 Sentence Embeddings for Retrieval',
    '3.2 Baseline Methods',
    '4 Experimental Results',
    '4.1 Sentiment Analysis',
    '4.2 Table-to-text Generation',
    '4.3 Questing Answering',
    '5 Analysis and Ablation Study',
    '5.1 Number of In-context Examples',
    '5.2 Size of Training Set for Retrieval',
    '5.3 Order of In-context Examples',
    '6 Related Work',
    '7 Conclusion'
]
chapters = [x.lower().strip() for x in chapters]
print(f"There are {len(chapters)} chapters in the document")


# Set a specific starting point of the document
# start = 'abstract'
start = chapters[0]

start = start.lower().strip()

There are 19 chapters in the document


In [101]:
# TODO: make this text pre-processing function
# define the text to be parsed
text = [] 
for line in pdf_text.split('\n'):
    # if len(line)<=1: #get rid of junk lines
    #     continue
    line = line.replace('\t', ' ')
    line = line.strip().lower()
    text.append(line)

text = ' '.join(text)
# replace newline and tab characters with space
text = text.replace('\n', '')
text = text.replace('\t', '')
text = re.sub(r'\s+', ' ', text)

In [102]:
# TODO: turn it into a function that will get triggered whe user decides to chop the document by chapter names

# Fragment the text according to the logic the user defined (currently - by chapters)

# join the end strings with | to form chapter end regex pattern
end_pattern = "|".join(chapters)

# match text between chapter and any end string
chapters_contents = {}
for string in chapters:

    pattern = rf"{string}(.*?)(" + end_pattern + "|$)"
    pattern = re.compile(pattern)



    # search for the pattern in the text
    match = pattern.search(text)

    # if there is a match, extract the text between string and any end-string
    if match:
        # get the first group of the match object, which is the text between given chapter and any end string
        result = match.group(1)

        # print or save or do whatever you want with the result
        chapters_contents[string] = result

In [103]:
#TODO: come up with test that checks if I grabbed all the chapters
fetched_chapters = [x for x in chapters_contents.keys()] 
# compare element wise fetched_chapters with chapters
[x for x in chapters if x not in fetched_chapters]

[]

In [104]:
# Manually inspect some chapters
chapters_contents['1 introduction']

' gpt-3 (brown et al., 2020) is a new breakthrough in nlp research. previously, nlp models are pre- trained on large quantities of data and ﬁne-tuned ∗work was done during an internship at microsoft dy- namics 365 ai. trial accuracy 1 94.6 2 95.0 3 95.8 4 93.9 5 86.9 table 1: results of gpt-3 on the task of sentiment analysis on the sst-2 dataset. five different in-context examples are randomly selected from the training set. we observe different contexts induce different accura- cies on the test set. on a speciﬁc task and dataset. what sets gpt-3 apart from other pre-trained language models is its impressive “in-context” few-shot learning ability. provided with a few in-context examples, gpt-3 is able to generalize to unseen cases without fur- ther ﬁne-tuning. this opens up many new tech- nological possibilities that are previously consid- ered unique to human. for example, nlp systems can be developed to expand emails, extract entities from text, generate code based on natural langua

In [105]:
# Grab contents into a dataframe
chapter_contents_df = pd.DataFrame(chapters_contents, index=['contents'])
chapter_contents_df = chapter_contents_df.T
chapter_contents_df

Unnamed: 0,contents
abstract,"gpt-3 (brown et al., 2020) has attracted lots..."
1 introduction,"gpt-3 (brown et al., 2020) is a new breakthro..."
2 method,
2.1 gpt-3 for in-context learning,the in-context learning scenario of gpt-3 can...
2.2 the impact of in-context examples,given the observation that the empirical resu...
2.3 knn-augmented in-context example,"selection based on the ﬁndings above, we prop..."
3 experimental setup,we apply the knn in-context selection method ...
3.1 sentence embeddings for retrieval,to retrieve semantically-similar training ins...
3.2 baseline methods,"random sampling for each test sentence, we ra..."
4 experimental results,


In [106]:
# Create a token count column
chapter_contents_df['num_tokens_oai'] = chapter_contents_df['contents'].apply(
    lambda x: utl.num_tokens_from_messages([{'message': x}])
)

In [107]:
# For instances with token count > token_thres, split them so they fit model threshold so we could get their embeddings
# TODO: make it split actually by tokens, not by characters

token_thres = 500

chapter_contents_df['split_factor'] = 1
chapter_contents_df.loc[chapter_contents_df['num_tokens_oai'] > token_thres, 'split_factor'] = round(chapter_contents_df['num_tokens_oai']/token_thres,0)


chapter_contents_df['contents_split'] = chapter_contents_df.apply(
    lambda x: utl.split_contents(x), axis=1)


chapter_contents_long_df = chapter_contents_df.explode(
    column='contents_split'
)[['contents_split']]

In [108]:
# Create a token count column
chapter_contents_long_df['num_tokens_oai'] = chapter_contents_long_df['contents_split'].apply(
    lambda x: utl.num_tokens_from_messages([{'message': x}])
)

chapter_contents_long_df

Unnamed: 0,contents_split,num_tokens_oai
abstract,"gpt-3 (brown et al., 2020) has attracted lots...",338
1 introduction,"gpt-3 (brown et al., 2020) is a new breakthro...",549
1 introduction,he recent success of retrieval-augmented model...,511
2 method,,7
2.1 gpt-3 for in-context learning,the in-context learning scenario of gpt-3 can...,489
2.2 the impact of in-context examples,given the observation that the empirical resu...,377
2.3 knn-augmented in-context example,"selection based on the ﬁndings above, we prop...",415
2.3 knn-augmented in-context example,number of in-context examples k (hyperparamete...,425
3 experimental setup,we apply the knn in-context selection method ...,246
3.1 sentence embeddings for retrieval,to retrieve semantically-similar training ins...,440


In [109]:
chapter_contents_long_df['text'] = "CHAPTER: " +  chapter_contents_long_df.index + " CONTENT: " + chapter_contents_long_df['contents_split']

In [110]:
# Drop contents_split column
chapter_contents_long_df = chapter_contents_long_df.drop(columns=['contents_split'])
# Reset index so chapter names are stored in columns
chapter_contents_long_df = chapter_contents_long_df.reset_index()
# Rename index column to chapter
chapter_contents_long_df = chapter_contents_long_df.rename(columns={'index': 'chapter'})
# Add session_name column
chapter_contents_long_df['session_name'] = session_name
## Add interaction type column
chapter_contents_long_df['interaction_type'] = 'source'
## Drop rows where num_tokens_oai is less than 100
chapter_contents_long_df = chapter_contents_long_df[chapter_contents_long_df['num_tokens_oai'] > 50].copy()

In [111]:
chapter_contents_long_df

Unnamed: 0,chapter,num_tokens_oai,text,session_name,interaction_type
0,abstract,338,CHAPTER: abstract CONTENT: gpt-3 (brown et al...,good_incontext_examples_for_gpt3,source
1,1 introduction,549,CHAPTER: 1 introduction CONTENT: gpt-3 (brown...,good_incontext_examples_for_gpt3,source
2,1 introduction,511,CHAPTER: 1 introduction CONTENT: he recent suc...,good_incontext_examples_for_gpt3,source
4,2.1 gpt-3 for in-context learning,489,CHAPTER: 2.1 gpt-3 for in-context learning CON...,good_incontext_examples_for_gpt3,source
5,2.2 the impact of in-context examples,377,CHAPTER: 2.2 the impact of in-context examples...,good_incontext_examples_for_gpt3,source
6,2.3 knn-augmented in-context example,415,CHAPTER: 2.3 knn-augmented in-context example ...,good_incontext_examples_for_gpt3,source
7,2.3 knn-augmented in-context example,425,CHAPTER: 2.3 knn-augmented in-context example ...,good_incontext_examples_for_gpt3,source
8,3 experimental setup,246,CHAPTER: 3 experimental setup CONTENT: we app...,good_incontext_examples_for_gpt3,source
9,3.1 sentence embeddings for retrieval,440,CHAPTER: 3.1 sentence embeddings for retrieval...,good_incontext_examples_for_gpt3,source
10,3.1 sentence embeddings for retrieval,436,CHAPTER: 3.1 sentence embeddings for retrieval...,good_incontext_examples_for_gpt3,source


## 2. Get embeddings for each content piece

In [112]:
contents_for_embed_df = chapter_contents_long_df[['text']]

In [113]:
# Calculate the cost of running the model to get embeddings
(chapter_contents_long_df['num_tokens_oai'].sum() / 1000) * 0.0004

0.0039588

In [114]:
# Embed each chapter
rng = tqdm(range(0,len(contents_for_embed_df)))

contents_embedded = {}

for i in rng:
    txt_chapter = contents_for_embed_df.index[i]
    txt_list = contents_for_embed_df.iloc[i].to_list()

    txt_embed = utl.get_embedding(txt_list)



    # Join embeddings with context table
    contents_embedded[txt_chapter] = txt_embed
embeded_s = pd.Series(contents_embedded, index=contents_embedded.keys())

# Merge embeddings with chapter contents
chapter_contents_long_df['embedding'] = embeded_s
chapter_contents_long_df.head()

# Save embeddings
chapter_contents_long_df.to_csv(f'./data/{session_name}_embeded.csv')

## 3. Find most similar document embeddings to the question embedding

In [115]:
# Read embeddings
chapter_contents_long_df = pd.read_csv(prm.D_PTH  / f'{session_name}_embeded.csv', index_col=0)

In [116]:
# DB INTERACTION
# Insert context into DB
conn = utl.create_connection(prm.DB_PTH)
utl.insert_context(conn, session_name, chapter_contents_long_df)
conn.close()

Context table for session: "good_incontext_examples_for_gpt3" created


In [127]:
# Buiild seed conversation context for summarization
# TODO: no need to get embedding each time the script is run. Optimize it.

summary_ctx_usr = "How would you act when I'd ask you what this document is about. Can you summarize it for me?"

summary_ctxt_asst = "When a user asks me to summarize the source material or explain what it is about, I would look for the best text fragment that provides general information about the document's contents. To find a text fragment for summarization, I suggest starting by scanning the abstract and conclusion sections, and also checking the table of contents."

utl.bulk_insert_interaction(conn, summary_ctx_usr, summary_ctxt_asst, session_name)

Interaction type: "user" inserted into the database for session: "good_incontext_examples_for_gpt3"
Interaction type: "assistant" inserted into the database for session: "good_incontext_examples_for_gpt3"


In [129]:
# POC - conversational interface

## Define how many samples you want to get from the recall table
num_samples = 5
## Get unix time
query_time = int(datetime.now().timestamp())
## User input
question = input(">>> ")
# question = prmt


## Fetch recal table so we can compare user input to embeddings saved in it and fetch the right context.
recal_table = utl.fetch_recall_table(session_name)

## Chop recall table to only include contexts for sources, user, or assistant
recal_table_source = recal_table[recal_table['interaction_type'] == 'source']
recal_table_user = recal_table[recal_table['interaction_type'] == 'user']
recal_table_assistant = recal_table[recal_table['interaction_type'] == 'assistant']

recal_embed_source = utl.convert_table_to_dct(recal_table_source)
recal_embed_user = utl.convert_table_to_dct(recal_table_user)
recal_embed_assistant = utl.convert_table_to_dct(recal_table_assistant)

## Get the context from recal table that is the most similar to user input
if recal_table_source.shape[0]<num_samples:
    num_samples = recal_table_source.shape[0]
    print('Source material is shorter than number of samples you want to get. Setting number of samples to the number of source material sections.')

## Get SRC context
if len(recal_embed_source) == 0:
    recal_source = 'No context found'
else:
    recal_source_id = utl.order_document_sections_by_query_similarity(question, recal_embed_source)[0:num_samples]
    # If recal source id is a list, join the text from the list
    if len(recal_source_id)>1:
        idxs = [x[1] for x in recal_source_id]
        recal_source = recal_table.loc[idxs]['text'].to_list()
        recal_source = '| '.join(recal_source)
    else: 
        recal_source = recal_table.loc[recal_source_id[1]]['text']
## GET QRY context
if len(recal_embed_user) == 0:
    recal_user = 'No context found'
else:
    recal_user_id = utl.order_document_sections_by_query_similarity(question, recal_embed_user)[0][1]
    recal_user = recal_table.loc[recal_user_id]['text']
## GET RPL context
if len(recal_embed_assistant) == 0:
    recal_assistant = 'No context found'
else:
    recal_assistant_id = utl.order_document_sections_by_query_similarity(question, recal_embed_assistant)[0][1]
    recal_assistant = recal_table.loc[recal_assistant_id]['text']


# Look for assistant and user messages in the interaction table that have the latest time_signature
last_usr_max = recal_table_user['time_signature'].astype(int).max()
last_asst_max = recal_table_assistant['time_signature'].astype(int).max()
if last_usr_max == 0:
    latest_user = 'No context found'
else:
    latest_user = recal_table_user.loc[recal_table_user['time_signature']==str(last_usr_max)]['text'].values[0]

if last_asst_max == 0:
    latest_assistant = 'No context found'
else:
    latest_assistant = recal_table_assistant.loc[recal_table_assistant['time_signature']==str(last_asst_max)]['text'].values[0]


## Grab chapter name if it exists, otherwise use session name
## It will become handy when user wants to know from which chapter the context was taken
if len(recal_source_id)>1:
    recal_source_chapter = recal_table.loc[idxs]['chapter'].to_list()
else:
    recal_source_chapter = recal_table.loc[recal_source_id[1]]['chapter']

print(f'I will answer your question basing on the following context: {set(recal_source_chapter)}')

###############################################################################
# Set-up system prompts. This is done once for the whole session and the setup depends on the type of assistant chosen by the user.
# I comnment this out as this needs to be saved in the interaction table and I will implement this after user-assistant interactions are implemented.

sys_message = {
        'role': 'system', 
        'content': "You are a helpful assistant. You provide only factual information. If you do not know the answer, you say it. I provide my input after INP tag. I will pass the context you will use in your answer. I encode it with following tags: SRC - sources we are talking about; QRY - one of previous inputs I passed to you in the conversation; RPL - one of your previous replies to my questions from the conversation."
        }
prev_user = {"role": "user", "content": f"{latest_user}"}
prev_assistant = {"role": "assistant", "content": f"{latest_assistant}"}
user_message = {
        "role": "user", 
        "content": f"SRC: {recal_source}. QRY: {recal_user}. RPL: {recal_assistant}. INP: {question}"
        }

###############################################################################

## Form user message based on recaled context and user's input
usr_message = [
    sys_message,
    prev_user,
    prev_assistant,
    user_message
    ]

# Count number of tokens in user message and display it to the user
token_passed = utl.num_tokens_from_messages(usr_message)
context_capacity =  4096 - token_passed
print(f"Number of tokens passed to the model: {token_passed}")
print(f"Number of tokens left in the context: {context_capacity}")

# Grab call user content from messages alias
usr_message_content = usr_message[0]['content']


## Make API call with formed user message
api_response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=usr_message
)


# Open DB so the assistant can remember the conversation
conn = utl.create_connection(prm.DB_PTH)
# Insert user message into DB so we can use it for another user's input
utl.insert_interaction(
    conn, 
    session_name, 
    'user', 
    question,
    query_time
    )
# Insert model's response into DB so we can use it for another user's input
utl.insert_interaction(
    conn,
    session_name,
    'assistant',
    api_response['choices'][0]['message']['content'],
    api_response['created']
    )
conn.close()




## Print CALL and RESPONSE
pprint.pprint('USER:')
pprint.pprint(question)
pprint.pprint('---------------------------------')
pprint.pprint('ASSISTANT:')
pprint.pprint(api_response['choices'][0]['message']['content'])
## Print additional context data:
pprint.pprint('---------------------------------')
pprint.pprint('CONTEXT:')
pprint.pprint(f"SOURCE: {recal_source}")
pprint.pprint(f"USER: {recal_user}")
pprint.pprint(f"ASSISTANT: {recal_assistant}")


I will answer your question basing on the following context: {'3.1 sentence embeddings for retrieval', '3.2 baseline methods', '2.1 gpt-3 for in-context learning', 'abstract', '1 introduction'}
Number of tokens passed to the model: 2618
Number of tokens left in the context: 1478
Interaction type: "user" inserted into the database for session: "good_incontext_examples_for_gpt3"
Interaction type: "assistant" inserted into the database for session: "good_incontext_examples_for_gpt3"
'USER:'
'tell me what zero shot learning is? '
'---------------------------------'
'ASSISTANT:'
('I apologize, but the information you are requesting is not directly related '
 'to the context of the sources provided. However, zero-shot learning refers '
 'to the ability of a machine learning model to perform a task without any '
 'training examples specifically for that task. Instead, the model uses its '
 'prior knowledge to complete the task based on a few examples or a general '
 'understanding of the prob