## ALEPH
a PoC for conversational research environment with ADA and GPT3.5

Sources:
- How_to_format_inputs_to_ChatGPT_models: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb
- Question_answering_using_embeddings: https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb

In [56]:
# TODO: come up with complexity control of the API call (count the number of tokens, manage contexts and so on)
# TODO: implement chat memory (sqlite3)

In [57]:
import pandas as pd
import numpy as np
import openai, pickle, tiktoken, ast

from pathlib import Path
from tqdm.notebook import tqdm

import re, sqlite3

from pdfminer.high_level import extract_text

from collections import defaultdict

from datetime import date, datetime

import pprint

import utils as utl
import params as prm

In [58]:
#Use your own key
# read secret key from secret file 
with open('secret.txt', 'r') as f:
    openai.api_key = f.read()

In [59]:
# Returns the current local date
today = date.today()

print("Today date is: ", today)

Today date is:  2023-03-14


In [60]:
session_name = 'good_incontext_examples_for_gpt3'
session_date = today

In [61]:
# Connect to user's database
conn = utl.create_connection(prm.DB_PTH) # Create connection to the database

utl.create_table(conn, prm.SESSION_TABLE_SQL) # Create table if it does not exist
# Create interaction table for the session
utl.create_table(conn, f"CREATE TABLE IF NOT EXISTS interaction_{session_name} (session_name, interaction_type, text, embedding, num_tokens_oai)")

tables = utl.parse_tables(conn) # Grab tables to check if the table was created
conn.close()

In [62]:
tables

[('session',), ('interaction_good_incontext_examples_for_gpt3',)]

In [63]:
# DB INTERACTION
# Write session data to the database
conn = utl.create_connection(prm.DB_PTH) # Create connection to the database
utl.insert_session(conn, session_name, session_date) # Insert session data  
conn.close()

Session: "good_incontext_examples_for_gpt3" inserted into the database


## 1. Prepare document for processing

In [64]:
# Logic for chapter capture
# digit_word_rgx = r'^\d+(\.\d+)*\s+[A-Z].*$|Abstract'
digit_word_rgx = r'^\d+(\.\d+)*\s[a-zA-Z]+.*$|Abstract'

In [65]:
# Load context document and grab chapter names
doc_name = 'What Makes Good InContext Examples for GPT3.pdf'

pdf_text = extract_text(prm.IN_PTH / doc_name)

# TODO: user should decide which split logic to use: random split (text be be cut into equal parts containing more-less the same number of tokens) or split by chapters (chapter inference is not always correct as pdfs are not always formated nicely). 
# TODO: Implement interface for split choice + equal chunking function
chapters = utl.grab_chapters(pdf_text, matching_logic=2)

In [66]:
# See what chapters got captured from the pdf
chapters

['Abstract',
 '3 keeps generating tokens until there is a special char-',
 '2 Method',
 '2.1 GPT-3 for In-Context Learning',
 '2.2 The Impact of In-Context Examples',
 '2.3 kNN-augmented In-Context Example',
 '3 Experimental Setup',
 '3.1 Sentence Embeddings for Retrieval',
 '3 on open-domain QA tasks. The EM score is',
 '3.2 Baseline Methods',
 '4 Experimental Results',
 '4.1 Sentiment Analysis',
 '2 dataset, the accuracy of kNNsst-2 is 92.46, which',
 '4.2 Table-to-text Generation',
 '4.3 Questing Answering',
 '64 nearest neighbors (10 for TriviaQA) to deter-',
 '3 to misinterpret the question as asking for a spe-',
 '64 examples.',
 '5 Analysis and Ablation Study',
 '5.1 Number of In-context Examples',
 '5.2 Size of Training Set for Retrieval',
 '3 to better answer the questions.',
 '5.3 Order of In-context Examples',
 '6 Related Work',
 '7 Conclusion']

In [67]:
# TODO: interface for selecting chapters to be used for text fragmentation. 
chapters = [
    'Abstract',
    '1 Introduction',
    '2 Method',
    '2.1 GPT-3 for In-Context Learning',
    '2.2 The Impact of In-Context Examples',
    '2.3 kNN-augmented In-Context Example',
    '3 Experimental Setup',
    '3.1 Sentence Embeddings for Retrieval',
    '3.2 Baseline Methods',
    '4 Experimental Results',
    '4.1 Sentiment Analysis',
    '4.2 Table-to-text Generation',
    '4.3 Questing Answering',
    '5 Analysis and Ablation Study',
    '5.1 Number of In-context Examples',
    '5.2 Size of Training Set for Retrieval',
    '5.3 Order of In-context Examples',
    '6 Related Work',
    '7 Conclusion'
]
chapters = [x.lower().strip() for x in chapters]
print(f"There are {len(chapters)} chapters in the document")


# Set a specific starting point of the document
# start = 'abstract'
start = chapters[0]

start = start.lower().strip()

There are 19 chapters in the document


In [68]:
# TODO: make this text pre-processing function
# define the text to be parsed
text = [] 
for line in pdf_text.split('\n'):
    # if len(line)<=1: #get rid of junk lines
    #     continue
    line = line.replace('\t', ' ')
    line = line.strip().lower()
    text.append(line)

text = ' '.join(text)
# replace newline and tab characters with space
text = text.replace('\n', '')
text = text.replace('\t', '')
text = re.sub(r'\s+', ' ', text)

In [69]:
# TODO: turn it into a function that will get triggered whe user decides to chop the document by chapter names

# Fragment the text according to the logic the user defined (currently - by chapters)

# join the end strings with | to form chapter end regex pattern
end_pattern = "|".join(chapters)

# match text between chapter and any end string
chapters_contents = {}
for string in chapters:

    pattern = rf"{string}(.*?)(" + end_pattern + "|$)"
    pattern = re.compile(pattern)



    # search for the pattern in the text
    match = pattern.search(text)

    # if there is a match, extract the text between string and any end-string
    if match:
        # get the first group of the match object, which is the text between given chapter and any end string
        result = match.group(1)

        # print or save or do whatever you want with the result
        chapters_contents[string] = result

In [70]:
#TODO: come up with test that checks if I grabbed all the chapters
fetched_chapters = [x for x in chapters_contents.keys()] 
# compare element wise fetched_chapters with chapters
[x for x in chapters if x not in fetched_chapters]

[]

In [71]:
# Manually inspect some chapters
chapters_contents['1 introduction']

' gpt-3 (brown et al., 2020) is a new breakthrough in nlp research. previously, nlp models are pre- trained on large quantities of data and ﬁne-tuned ∗work was done during an internship at microsoft dy- namics 365 ai. trial accuracy 1 94.6 2 95.0 3 95.8 4 93.9 5 86.9 table 1: results of gpt-3 on the task of sentiment analysis on the sst-2 dataset. five different in-context examples are randomly selected from the training set. we observe different contexts induce different accura- cies on the test set. on a speciﬁc task and dataset. what sets gpt-3 apart from other pre-trained language models is its impressive “in-context” few-shot learning ability. provided with a few in-context examples, gpt-3 is able to generalize to unseen cases without fur- ther ﬁne-tuning. this opens up many new tech- nological possibilities that are previously consid- ered unique to human. for example, nlp systems can be developed to expand emails, extract entities from text, generate code based on natural langua

In [72]:
# Grab contents into a dataframe
chapter_contents_df = pd.DataFrame(chapters_contents, index=['contents'])
chapter_contents_df = chapter_contents_df.T
chapter_contents_df

Unnamed: 0,contents
abstract,"gpt-3 (brown et al., 2020) has attracted lots..."
1 introduction,"gpt-3 (brown et al., 2020) is a new breakthro..."
2 method,
2.1 gpt-3 for in-context learning,the in-context learning scenario of gpt-3 can...
2.2 the impact of in-context examples,given the observation that the empirical resu...
2.3 knn-augmented in-context example,"selection based on the ﬁndings above, we prop..."
3 experimental setup,we apply the knn in-context selection method ...
3.1 sentence embeddings for retrieval,to retrieve semantically-similar training ins...
3.2 baseline methods,"random sampling for each test sentence, we ra..."
4 experimental results,


In [73]:
# Create a token count column
chapter_contents_df['num_tokens_oai'] = chapter_contents_df['contents'].apply(
    lambda x: utl.num_tokens_from_messages([{'message': x}])
)

In [74]:
# For instances with token count > token_thres, split them so they fit model threshold so we could get their embeddings
# TODO: make it split actually by tokens, not by characters

token_thres = 1000

chapter_contents_df['split_factor'] = 1
chapter_contents_df.loc[chapter_contents_df['num_tokens_oai'] > token_thres, 'split_factor'] = round(chapter_contents_df['num_tokens_oai']/token_thres,0)


chapter_contents_df['contents_split'] = chapter_contents_df.apply(
    lambda x: utl.split_contents(x), axis=1)


chapter_contents_long_df = chapter_contents_df.explode(
    column='contents_split'
)[['contents_split']]

In [75]:
# Create a token count column
chapter_contents_long_df['num_tokens_oai'] = chapter_contents_long_df['contents_split'].apply(
    lambda x: utl.num_tokens_from_messages([{'message': x}])
)

chapter_contents_long_df

Unnamed: 0,contents_split,num_tokens_oai
abstract,"gpt-3 (brown et al., 2020) has attracted lots...",338
1 introduction,"gpt-3 (brown et al., 2020) is a new breakthro...",1053
2 method,,7
2.1 gpt-3 for in-context learning,the in-context learning scenario of gpt-3 can...,489
2.2 the impact of in-context examples,given the observation that the empirical resu...,377
2.3 knn-augmented in-context example,"selection based on the ﬁndings above, we prop...",833
3 experimental setup,we apply the knn in-context selection method ...,246
3.1 sentence embeddings for retrieval,to retrieve semantically-similar training ins...,869
3.2 baseline methods,"random sampling for each test sentence, we ra...",416
4 experimental results,,7


In [76]:
chapter_contents_long_df['text'] = "CHAPTER: " +  chapter_contents_long_df.index + " CONTENT: " + chapter_contents_long_df['contents_split']

In [77]:
# Drop contents_split column
chapter_contents_long_df = chapter_contents_long_df.drop(columns=['contents_split'])
# Reset index so chapter names are stored in columns
chapter_contents_long_df = chapter_contents_long_df.reset_index()
# Rename index column to chapter
chapter_contents_long_df = chapter_contents_long_df.rename(columns={'index': 'chapter'})
# Add session_name column
chapter_contents_long_df['session_name'] = session_name
## Add interaction type column
chapter_contents_long_df['interaction_type'] = 'source'
## Drop rows where num_tokens_oai is less than 100
chapter_contents_long_df = chapter_contents_long_df[chapter_contents_long_df['num_tokens_oai'] > 50].copy()

In [78]:
chapter_contents_long_df

Unnamed: 0,chapter,num_tokens_oai,text,session_name,interaction_type
0,abstract,338,CHAPTER: abstract CONTENT: gpt-3 (brown et al...,good_incontext_examples_for_gpt3,source
1,1 introduction,1053,CHAPTER: 1 introduction CONTENT: gpt-3 (brown...,good_incontext_examples_for_gpt3,source
3,2.1 gpt-3 for in-context learning,489,CHAPTER: 2.1 gpt-3 for in-context learning CON...,good_incontext_examples_for_gpt3,source
4,2.2 the impact of in-context examples,377,CHAPTER: 2.2 the impact of in-context examples...,good_incontext_examples_for_gpt3,source
5,2.3 knn-augmented in-context example,833,CHAPTER: 2.3 knn-augmented in-context example ...,good_incontext_examples_for_gpt3,source
6,3 experimental setup,246,CHAPTER: 3 experimental setup CONTENT: we app...,good_incontext_examples_for_gpt3,source
7,3.1 sentence embeddings for retrieval,869,CHAPTER: 3.1 sentence embeddings for retrieval...,good_incontext_examples_for_gpt3,source
8,3.2 baseline methods,416,CHAPTER: 3.2 baseline methods CONTENT: random...,good_incontext_examples_for_gpt3,source
10,4.1 sentiment analysis,373,CHAPTER: 4.1 sentiment analysis CONTENT: we ﬁ...,good_incontext_examples_for_gpt3,source
11,4.2 table-to-text generation,424,CHAPTER: 4.2 table-to-text generation CONTENT:...,good_incontext_examples_for_gpt3,source


## 2. Get embeddings for each content piece

In [79]:
contents_for_embed_df = chapter_contents_long_df[['text']]

In [80]:
# Calculate the cost of running the model to get embeddings
(chapter_contents_long_df['num_tokens_oai'].sum() / 1000) * 0.0004

0.0039412

In [81]:
# Commented out because I already ran this
# If you want to get embeddings on a different file, 
# uncomment this and run it

# Embed each chapter


# rng = tqdm(range(0,len(contents_for_embed_df)))

# contents_embedded = {}

# for i in rng:
#     txt_chapter = contents_for_embed_df.index[i]
#     txt_list = contents_for_embed_df.iloc[i].to_list()

#     txt_embed = utl.get_embedding(txt_list)



#     # Join embeddings with context table
#     contents_embedded[txt_chapter] = txt_embed
# embeded_s = pd.Series(contents_embedded, index=contents_embedded.keys())

# # Merge embeddings with chapter contents
# chapter_contents_long_df['embedding'] = embeded_s
# chapter_contents_long_df.head()

# # Save embeddings
# chapter_contents_long_df.to_csv(f'./data/{session_name}_embeded.csv')

## 3. Find most similar document embeddings to the question embedding

In [82]:
# Read embeddings
chapter_contents_long_df = pd.read_csv(prm.D_PTH  / f'{session_name}_embeded.csv', index_col=0)

In [83]:
# DB INTERACTION
# Insert context into DB
conn = utl.create_connection(prm.DB_PTH)
utl.insert_context(conn, session_name, chapter_contents_long_df)
conn.close()

Context table for session: "good_incontext_examples_for_gpt3" created


In [84]:
# To recognize previous interactions and to provide a summary of text we will implement a pre-call to gpt turbo that will classify the interaction type

"""
SYSTEM INSTRUCTION: classify user query either as RECALL REQUEST, SOURCE QUERY or SUMMARY REQUEST. Recall request means that user asks you to recall some information from the conversation. Source query is when user asks a specific question about the source text. Summary request is when user asks general question about the text (for example - what's this text about). I want your response being either RECALL REQUEST, SOURCE QUERY, SUMMARY REQUEST
"""

# Hold previous model response in context

"\nSYSTEM INSTRUCTION: classify user query either as RECALL REQUEST, SOURCE QUERY or SUMMARY REQUEST. Recall request means that user asks you to recall some information from the conversation. Source query is when user asks a specific question about the source text. Summary request is when user asks general question about the text (for example - what's this text about). I want your response being either RECALL REQUEST, SOURCE QUERY, SUMMARY REQUEST\n"

In [93]:
# POC - conversational interface

## User input
question = input("> ")



## Fetch recal table so we can compare user input to embeddings saved in it and fetch the right context.
recal_table = utl.fetch_recall_table(session_name)

## Chop recall table to only include contexts for sources, user, or assistant
recal_table_source = recal_table[recal_table['interaction_type'] == 'source']
recal_table_user = recal_table[recal_table['interaction_type'] == 'user']
recal_table_assistant = recal_table[recal_table['interaction_type'] == 'assistant']

recal_embed_source = utl.convert_table_to_dct(recal_table_source)
recal_embed_user = utl.convert_table_to_dct(recal_table_user)
recal_embed_assistant = utl.convert_table_to_dct(recal_table_assistant)

## Get the context from recal table that is the most similar to user input

## Get SRC context
if len(recal_embed_source) == 0:
    recal_source = 'No context found'
else:
    recal_source_id = utl.order_document_sections_by_query_similarity(question, recal_embed_source)[0][1]
    recal_source = recal_table.loc[recal_source_id]['text']
## GET QRY context
if len(recal_embed_user) == 0:
    recal_user = 'No context found'
else:
    recal_user_id = utl.order_document_sections_by_query_similarity(question, recal_embed_user)[0][1]
    recal_user = recal_table.loc[recal_user_id]['text']
## GET RPL context
if len(recal_embed_assistant) == 0:
    recal_assistant = 'No context found'
else:
    recal_assistant_id = utl.order_document_sections_by_query_similarity(question, recal_embed_assistant)[0][1]
    recal_assistant = recal_table.loc[recal_assistant_id]['text']



## Grab chapter name if it exists, otherwise use session name
## It will become handy when user wants to know from which chapter the context was taken
source_chapter_val = recal_table.loc[recal_source_id]['chapter']


###############################################################################
# Set-up system prompts. This is done once for the whole session and the setup depends on the type of assistant chosen by the user.
# I comnment this out as this needs to be saved in the interaction table and I will implement this after user-assistant interactions are implemented.
#
# sys_message = [
#     {'role': 'system', 'content': f"You are a helpful assistant. You provide only factual information. If you do not know the answer, you say it."}
#     ]
# api_response = openai.ChatCompletion.create(
#     model="gpt-3.5-turbo",
#     messages=sys_message
# )
###############################################################################

## Form user message based on recaled context and user's input
usr_message = [
    {
        'role': 'system', 
        'content': "You are a helpful assistant. You provide only factual information. If you do not know the answer, you say it. I provide my input after INP tag. I will pass the context you will use in your answer. I encode it with following tags: SRC - sources we are talking about; QRY - one of previous inputs I passed to you in the conversation; RPL - one of your previous replies to my questions from the conversation."
        },
    {   
        "role": "user", 
        "content": f"SRC: {recal_source}. QRY: {recal_user}. RPL: {recal_assistant}. INP: {question}"
        }
    ]
# Grab call user content from messages alias
usr_message_content = usr_message[0]['content']


## Make API call with formed user message
api_response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=usr_message
)


# Open DB so the assistant can remember the conversation
conn = utl.create_connection(prm.DB_PTH)
# Insert user message into DB so we can use it for another user's input
utl.insert_interaction(
    conn, 
    session_name, 
    'user', 
    question
    )
# Insert model's response into DB so we can use it for another user's input
utl.insert_interaction(
    conn,
    session_name,
    'assistant',
    api_response['choices'][0]['message']['content']
    )
conn.close()




## Print CALL and RESPONSE
pprint.pprint('USER:')
pprint.pprint(question)
pprint.pprint('---------------------------------')
pprint.pprint('ASSISTANT:')
pprint.pprint(api_response['choices'][0]['message']['content'])
## Print additional context data:
pprint.pprint('---------------------------------')
pprint.pprint('CONTEXT:')
pprint.pprint(f"SOURCE: {recal_source}")
pprint.pprint(f"USER: {recal_user}")
pprint.pprint(f"ASSISTANT: {recal_assistant}")


Interaction type: "user" inserted into the database for session: "good_incontext_examples_for_gpt3"
Interaction type: "assistant" inserted into the database for session: "good_incontext_examples_for_gpt3"
'USER:'
'what is your opinion on that?'
'---------------------------------'
'ASSISTANT:'
('As an AI language model, I do not have personal opinions. However, based on '
 'the information provided in the source, it appears that the approach of '
 'retrieving semantically-similar examples to formulate a prompt has shown to '
 'be effective in selecting in-context examples for sentiment analysis. It '
 'also seems that fine-tuning on a similar task can benefit the performance of '
 'the model. However, it is not clear from the given content about the '
 'relevance or feasibility of using conversation history to build context.')
'---------------------------------'
'CONTEXT:'
('SOURCE: CHAPTER: 4.1 sentiment analysis CONTENT:  we ﬁrst evaluate kate on '
 'the sentiment anal- ysis task. the