## ALEPH
a PoC for conversational research environment with ADA and GPT3.5

Sources:
- How_to_format_inputs_to_ChatGPT_models: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb
- Question_answering_using_embeddings: https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb

In [72]:
# TODO: come up with complexity control of the API call (count the number of tokens, manage contexts and so on)
# TODO: implement chat memory (sqlite3)

In [73]:
import pandas as pd
import numpy as np
import openai, pickle, tiktoken, ast

from pathlib import Path
from tqdm.notebook import tqdm

import re, sqlite3
from pdfminer.high_level import extract_text
from collections import defaultdict

from datetime import date, datetime

import pprint

import utils as utl
import params as prm

In [74]:
#Use your own key
# read secret key from secret file 
with open('secret.txt', 'r') as f:
    openai.api_key = f.read()

In [75]:
# Returns the current local date
today = date.today()
# get current time in seconds (UNIX timestamp)
timestmp = datetime.now().timestamp()


print("Today date is: ", today)
print("Current time as UNIX timestamp is: ", timestmp)

Today date is:  2023-03-13
Current time as UNIX timestamp is:  1678741036.720114


In [76]:
session_name = 'biol_cognition'
session_date = today

In [77]:
# Connect to user's database
conn = utl.create_connection(prm.DB_PTH) # Create connection to the database

utl.create_table(conn, prm.SESSION_TABLE_SQL) # Create table if it does not exist
# Create interaction table for the session
utl.create_table(conn, f"CREATE TABLE IF NOT EXISTS interaction_{session_name} (session_name, interaction_type, text, embedding, num_tokens_oai)")

tables = utl.parse_tables(conn) # Grab tables to check if the table was created
conn.close()

In [78]:
tables

[('session',), ('interaction_biol_cognition',), ('context_biol_cognition',)]

In [79]:
# DB INTERACTION
# Write session data to the database
conn = utl.create_connection(prm.DB_PTH) # Create connection to the database
utl.insert_session(conn, session_name, session_date) # Insert session data  
conn.close()

Session: "biol_cognition" already in the database


## 1. Prepare document for processing

In [80]:
# Logic for chapter capture
digit_word_rgx = r'^\d+(\.\d+)*\s+[A-Z].*$'

In [81]:
# Grab chapter names
pdf_text = extract_text(prm.IN_DOC_PTH)

chapter_pattern = re.compile(digit_word_rgx)

chapters = []
for line in pdf_text.split('\n'):
    if chapter_pattern.match(line):
        chapters.append(line.strip())

chapters

['477 Williamstown Road, Port Melbourne, VIC 3207, Australia',
 '103 Penang Road, #05-06/07, Visioncrest Commercial, Singapore 238467',
 '1 Introduction',
 '2 Seeing Things That Matter',
 '3 Learning What Matters',
 '4 Social Living',
 '5 Complex Trade-Oﬀs and Higher Cognition',
 '1 Introduction',
 '1.1 Preserving Viability',
 '1.2 Embodiment as a Core Principle of Biological Cognition',
 '1.3 Pluralism about Processes',
 '1.4 Integration in Heterarchical Networks',
 '1.5 Is Cognition Governed by a Hierarchical Model?',
 '2 Seeing Things That Matter',
 '2.1 Early Gestures toward a Constructive Theory of Vision',
 '2.2 A More Biological Approach to Vision',
 '2.3 Early Vision',
 '2.4 Attentional, Aﬀective, and Visceral Information',
 '2.5 Cortical Processing',
 '2.6 Perceptual Integration',
 '3 Learning What Matters',
 '3.1 Sodium Regulation',
 '3.2 Physiological Regulation and Cognition',
 '3.3 A Uniﬁed Story about Learning?',
 '3.4 Learning about Threats and Dangers',
 '4 Social Livin

In [82]:
# Choose what chapters you want to grab
# TODO: make it automated somehow (infer the structure)
chapters = [
    '1 Introduction',
    '2 Seeing Things That Matter',
    '3 Learning What Matters',
    '4 Social Living',
    '5 Complex Trade-Oﬀs and Higher Cognition',
    '1.1 Preserving Viability',
    '1.2 Embodiment as a Core Principle of Biological Cognition',
    '1.3 Pluralism about Processes',
    '1.4 Integration in Heterarchical Networks',
    '1.5 Is Cognition Governed by a Hierarchical Model?',
    '2.1 Early Gestures toward a Constructive Theory of Vision',
    '2.2 A More Biological Approach to Vision',
    '2.3 Early Vision',
    '2.4 Attentional, Aﬀective, and Visceral Information',
    '2.5 Cortical Processing',
    '2.6 Perceptual Integration',
    '3.1 Sodium Regulation',
    '3.2 Physiological Regulation and Cognition',
    '3.3 A Uniﬁed Story about Learning?',
    '3.4 Learning about Threats and Dangers',
    '4.1 Social and Evolutionary Constraints',
    '4.2 Chemical Signals and Sociality'
]
chapters = [x.lower().strip() for x in chapters]
chapters

['1 introduction',
 '2 seeing things that matter',
 '3 learning what matters',
 '4 social living',
 '5 complex trade-oﬀs and higher cognition',
 '1.1 preserving viability',
 '1.2 embodiment as a core principle of biological cognition',
 '1.3 pluralism about processes',
 '1.4 integration in heterarchical networks',
 '1.5 is cognition governed by a hierarchical model?',
 '2.1 early gestures toward a constructive theory of vision',
 '2.2 a more biological approach to vision',
 '2.3 early vision',
 '2.4 attentional, aﬀective, and visceral information',
 '2.5 cortical processing',
 '2.6 perceptual integration',
 '3.1 sodium regulation',
 '3.2 physiological regulation and cognition',
 '3.3 a uniﬁed story about learning?',
 '3.4 learning about threats and dangers',
 '4.1 social and evolutionary constraints',
 '4.2 chemical signals and sociality']

In [83]:
# define the text to be parsed
text = [] 
for line in pdf_text.split('\n'):
    if len(line)<=1:
        continue
    line = line.replace('\t', ' ')
    line = line.replace('  ', ' ')
    line = line.strip().lower()
    text.append(line)

text = ' '.join(text)

In [84]:
# Set a specific starting point of the document
start = '1 Introduction Cognitive scientists'

start = start.lower().strip()
pattern = rf'{start}.*$'
pattern = re.compile(pattern)

# search for the pattern in the text
match = pattern.search(text)
if match:
    text = match.group(0)
else:
    print(f'Couldnt match: {pattern}')

In [85]:
# escape any special characters in the end strings
# end_strings = [re.escape(s) for s in chapters]
end_strings = chapters

# join the end strings with | to form a regex pattern
end_pattern = "|".join(end_strings)

# define the regex pattern to match text between chapter and any end string
chapters_contents = {}
for string in end_strings:

    pattern = rf"{string}(.*?)(" + end_pattern + "|$)"

    # compile the pattern
    pattern = re.compile(pattern)

    # search for the pattern in the text
    match = pattern.search(text)

    # if there is a match, extract the text between string and any end-string
    if match:
        # get the first group of the match object, which is the text between given chapter and any end string
        result = match.group(1)
        # print or save or do whatever you want with the result
        chapters_contents[string] = result

In [86]:
#TODO: come up with test that checks if I grabbed all the chapters
[x for x in chapters_contents.keys()]

['1 introduction',
 '2 seeing things that matter',
 '3 learning what matters',
 '4 social living',
 '5 complex trade-oﬀs and higher cognition',
 '1.1 preserving viability',
 '1.2 embodiment as a core principle of biological cognition',
 '1.3 pluralism about processes',
 '1.4 integration in heterarchical networks',
 '1.5 is cognition governed by a hierarchical model?',
 '2.1 early gestures toward a constructive theory of vision',
 '2.2 a more biological approach to vision',
 '2.3 early vision',
 '2.4 attentional, aﬀective, and visceral information',
 '2.5 cortical processing',
 '2.6 perceptual integration',
 '3.1 sodium regulation',
 '3.2 physiological regulation and cognition',
 '3.3 a uniﬁed story about learning?',
 '3.4 learning about threats and dangers',
 '4.1 social and evolutionary constraints',
 '4.2 chemical signals and sociality']

In [87]:
# Manually inspect some chapters
chapters_contents['5 complex trade-oﬀs and higher cognition'][-100:]

'gnition bryce huebner and jay schulkin a full series listing is available at: www.cambridge.org/epmi'

In [88]:
# Grab contents into a dataframe
chapter_contents_df = pd.DataFrame(chapters_contents, index=['contents'])
chapter_contents_df = chapter_contents_df.T
chapter_contents_df

Unnamed: 0,contents
1 introduction,cognitive scientists often note that neural n...
2 seeing things that matter,"animals must track threats and dangers, respo..."
3 learning what matters,william james (1890) famously asks readers to...
4 social living,"humans are more helpful, generous, and social..."
5 complex trade-oﬀs and higher cognition,research in computational neuroscience is beg...
1.1 preserving viability,elephants are large animals who cover long di...
1.2 embodiment as a core principle of biological cognition,an inﬂuential understanding of this form of e...
1.3 pluralism about processes,animals must track potential dangers while mo...
1.4 integration in heterarchical networks,a mixing console receives numerous kinds of s...
1.5 is cognition governed by a hierarchical model?,? suppose you expect a cup to be ﬁlled with a ...


In [89]:
# TODO: Bespoke cleanup rgxs
# To get rid of unwanted patterns
# rgx1_start = r's s e r P y t i s r e v i n U e g d i r b m a C y b e n'
# rgx1_end1 = r'Philosophy of Mind'
# rgx2_end2 = r'Biological Cognition'

In [90]:
# Create a token count column
chapter_contents_df['num_tokens_oai'] = chapter_contents_df['contents'].apply(
    lambda x: utl.num_tokens_from_messages([{'message': x}])
)

In [91]:
# For instances with token count > token_thres, split them so they fit model threshold so we could get their embeddings
# TODO: make it split actually by tokens, not by characters

token_thres = 1600

chapter_contents_df['split_factor'] = 1
chapter_contents_df.loc[chapter_contents_df['num_tokens_oai'] > token_thres, 'split_factor'] = round(chapter_contents_df['num_tokens_oai']/token_thres,0)


chapter_contents_df['contents_split'] = chapter_contents_df.apply(
    lambda x: utl.split_contents(x), axis=1)


chapter_contents_long_df = chapter_contents_df.explode(
    column='contents_split'
)[['contents_split']]

In [92]:
# Create a token count column
chapter_contents_long_df['num_tokens_oai'] = chapter_contents_long_df['contents_split'].apply(
    lambda x: utl.num_tokens_from_messages([{'message': x}])
)

chapter_contents_long_df

Unnamed: 0,contents_split,num_tokens_oai
1 introduction,cognitive scientists often note that neural n...,1118
2 seeing things that matter,"animals must track threats and dangers, respo...",460
3 learning what matters,william james (1890) famously asks readers to...,1309
4 social living,"humans are more helpful, generous, and social...",1027
5 complex trade-oﬀs and higher cognition,research in computational neuroscience is beg...,1151
5 complex trade-oﬀs and higher cognition,ant to explaining thought and behavior change ...,1147
5 complex trade-oﬀs and higher cognition,ren who biological cognition 65 acquire a sign...,1127
5 complex trade-oﬀs and higher cognition,of a more general fact about structure learnin...,1690
5 complex trade-oﬀs and higher cognition,"cs, 3, 363–388. brezina, v. (2010). beyond the...",1881
5 complex trade-oﬀs and higher cognition,"iour in anurans: perception, learning. compara...",1826


## 2. Get embeddings for each content piece

In [93]:
# Calculate the cost of running the model to get embeddings
(chapter_contents_long_df['num_tokens_oai'].sum() / 1000) * 0.0004

0.019188399999999998

In [94]:
chapter_contents_long_df['combined'] = "CHAPTER: " +  chapter_contents_long_df.index + " CONTENT: " + chapter_contents_long_df['contents_split']

In [95]:
contents_for_embed_df = chapter_contents_long_df[['combined']]

In [96]:
# Commented out because I already ran this
# If you want to get embeddings on a different file, 
# uncomment this and run it

## Embed each chapter
# rng = tqdm(range(0,len(contents_for_embed_df)))

# contents_embedded = {}

# for i in rng:
#     txt_chapter = contents_for_embed_df.index[i]
#     txt_list = contents_for_embed_df.iloc[i].to_list()

#     txt_embed = utl.get_embedding(txt_list)

#     contents_embedded[txt_chapter] = txt_embed

In [97]:
# embeded_s = pd.Series(contents_embedded, index=contents_embedded.keys())

# # Merge embeddings with chapter contents
# chapter_contents_long_df['embedding'] = embeded_s
# chapter_contents_long_df.head()

In [98]:
# # Save embeddings
#chapter_contents_long_df.to_csv('./biol_cog_embeded.csv')

## 3. Find most similar document embeddings to the question embedding

In [99]:
# Read embeddings
chapter_contents_long_df = pd.read_csv(prm.D_PTH  / 'biol_cog_embeded.csv', index_col=0)
# Drop contents_split column
chapter_contents_long_df = chapter_contents_long_df.drop(columns=['contents_split'])
# Reset index so chapter names are stored in columns
chapter_contents_long_df = chapter_contents_long_df.reset_index()
# Rename index column to chapter
chapter_contents_long_df = chapter_contents_long_df.rename(columns={'index': 'chapter', 'combined': 'text'})
# Add session_name column
chapter_contents_long_df['session_name'] = session_name
## Add interaction type column
chapter_contents_long_df['interaction_type'] = 'source'
## Drop rows where num_tokens_oai is less than 100
chapter_contents_long_df = chapter_contents_long_df[chapter_contents_long_df['num_tokens_oai'] > 100].copy()

In [100]:
# DB INTERACTION
# Insert context into DB
conn = utl.create_connection(prm.DB_PTH)
utl.insert_context(conn, session_name, chapter_contents_long_df)
conn.close()

Context table for session: "biol_cognition" created


In [101]:
chapter_contents_long_df

Unnamed: 0,chapter,num_tokens_oai,text,embedding,session_name,interaction_type
0,1 introduction,1118,CHAPTER: 1 introduction CONTENT: cognitive sc...,"[-0.016917899250984192, 0.023071041330695152, ...",biol_cognition,source
1,2 seeing things that matter,460,CHAPTER: 2 seeing things that matter CONTENT: ...,"[-0.007168305106461048, 0.013002508319914341, ...",biol_cognition,source
2,3 learning what matters,1309,CHAPTER: 3 learning what matters CONTENT: wil...,"[-0.007003807462751865, 0.017418865114450455, ...",biol_cognition,source
3,4 social living,1027,CHAPTER: 4 social living CONTENT: humans are ...,"[-0.014815716072916985, 0.0013720998540520668,...",biol_cognition,source
4,5 complex trade-oﬀs and higher cognition,1151,CHAPTER: 5 complex trade-oﬀs and higher cognit...,"[-0.0069809565320611, -0.007067354395985603, 0...",biol_cognition,source
5,5 complex trade-oﬀs and higher cognition,1147,CHAPTER: 5 complex trade-oﬀs and higher cognit...,"[-0.0069809565320611, -0.007067354395985603, 0...",biol_cognition,source
6,5 complex trade-oﬀs and higher cognition,1127,CHAPTER: 5 complex trade-oﬀs and higher cognit...,"[-0.0069809565320611, -0.007067354395985603, 0...",biol_cognition,source
7,5 complex trade-oﬀs and higher cognition,1690,CHAPTER: 5 complex trade-oﬀs and higher cognit...,"[-0.0069809565320611, -0.007067354395985603, 0...",biol_cognition,source
8,5 complex trade-oﬀs and higher cognition,1881,CHAPTER: 5 complex trade-oﬀs and higher cognit...,"[-0.0069809565320611, -0.007067354395985603, 0...",biol_cognition,source
9,5 complex trade-oﬀs and higher cognition,1826,CHAPTER: 5 complex trade-oﬀs and higher cognit...,"[-0.0069809565320611, -0.007067354395985603, 0...",biol_cognition,source


In [102]:
# Turn chapter_contents_long_df into a dictionary of chapter names and embeddings

contents_embed_dct = chapter_contents_long_df[['embedding']].apply(list).to_dict()['embedding']

In [104]:
# POC - conversational interface

## User input
question = input("> ")



## Fetch recal table so we can compare user input to embeddings saved in it and fetch the right context.
recal_table = utl.fetch_recall_table(session_name)

## Chop recall table to only include contexts for sources, user, or assistant
recal_table_source = recal_table[recal_table['interaction_type'] == 'source']
recal_table_user = recal_table[recal_table['interaction_type'] == 'user']
recal_table_assistant = recal_table[recal_table['interaction_type'] == 'assistant']

recal_embed_source = utl.convert_table_to_dct(recal_table_source)
recal_embed_user = utl.convert_table_to_dct(recal_table_user)
recal_embed_assistant = utl.convert_table_to_dct(recal_table_assistant)

## Get the context from recal table that is the most similar to user input

## Get SRC context
if len(recal_embed_source) == 0:
    recal_source = 'No context found'
else:
    recal_source_id = utl.order_document_sections_by_query_similarity(question, recal_embed_source)[0][1]
    recal_source = recal_table.loc[recal_source_id]['text']
## GET QRY context
if len(recal_embed_user) == 0:
    recal_user = 'No context found'
else:
    recal_user_id = utl.order_document_sections_by_query_similarity(question, recal_embed_user)[0][1]
    recal_user = recal_table.loc[recal_user_id]['text']
## GET RPL context
if len(recal_embed_assistant) == 0:
    recal_assistant = 'No context found'
else:
    recal_assistant_id = utl.order_document_sections_by_query_similarity(question, recal_embed_assistant)[0][1]
    recal_assistant = recal_table.loc[recal_assistant_id]['text']



## Grab chapter name if it exists, otherwise use session name
## It will become handy when user wants to know from which chapter the context was taken
source_chapter_val = recal_table.loc[recal_source_id]['chapter']


###############################################################################
# Set-up system prompts. This is done once for the whole session and the setup depends on the type of assistant chosen by the user.
# I comnment this out as this needs to be saved in the interaction table and I will implement this after user-assistant interactions are implemented.
#
# sys_message = [
#     {'role': 'system', 'content': f"You are a helpful assistant. You provide only factual information. If you do not know the answer, you say it."}
#     ]
# api_response = openai.ChatCompletion.create(
#     model="gpt-3.5-turbo",
#     messages=sys_message
# )
###############################################################################

## Form user message based on recaled context and user's input
usr_message = [
    {
        'role': 'system', 
        'content': "You are a helpful assistant. You provide only factual information. If you do not know the answer, you say it. I provide my input after INP tag. I will pass the context you will use in your answer. I encode it with following tags: SRC - sources we are talking about, they have the priority above other contexts; QRY - one of previous inputs I passed to you in the conversation; RPL - one of your previous replies to my questions from the conversation."
        },
    {   
        "role": "user", 
        "content": f"SRC: {recal_source}. QRY: {recal_user}. RPL: {recal_assistant}. INP: {question}"
        }
    ]
# Grab call user content from messages alias
usr_message_content = usr_message[0]['content']


## Make API call with formed user message
api_response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=usr_message
)


# Open DB so the assistant can remember the conversation
conn = utl.create_connection(prm.DB_PTH)
# Insert user message into DB so we can use it for another user's input
utl.insert_interaction(
    conn, 
    session_name, 
    'user', 
    question
    )
# Insert model's response into DB so we can use it for another user's input
utl.insert_interaction(
    conn,
    session_name,
    'assistant',
    api_response['choices'][0]['message']['content']
    )
conn.close()




## Print CALL and RESPONSE
pprint.pprint('USER:')
pprint.pprint(question)
pprint.pprint('---------------------------------')
pprint.pprint('ASSISTANT:')
pprint.pprint(api_response['choices'][0]['message']['content'])
## Print additional context data:
pprint.pprint('---------------------------------')
pprint.pprint('CONTEXT:')
pprint.pprint(f"SOURCE: {recal_source}")
pprint.pprint(f"USER: {recal_user}")
pprint.pprint(f"ASSISTANT: {recal_assistant}")


Interaction type: "user" inserted into the database for session: "biol_cognition"
Interaction type: "assistant" inserted into the database for session: "biol_cognition"
'USER:'
'What is the role of testosterone in cognition?'
'---------------------------------'
'ASSISTANT:'
('There is some discussion in the passage about changes in the chemical '
 'signaling systems involved in social and self-regulation, including '
 'oxytocin, corticotropin-releasing hormone, serotonin, and testosterone. '
 'However, the passage does not go into detail about the specific role of '
 'testosterone in cognition.')
'---------------------------------'
'CONTEXT:'
('SOURCE: CHAPTER: 4.1 social and evolutionary constraints CONTENT:  in '
 'building our case, it will help to begin by considering a common explanation '
 'of the social and cognitive capacities of modern dogs. social tendencies are '
 'observed among all canines, but the fact that dogs are more sensitive to '
 'human communication reﬂects a long