## ALEPH
a PoC for conversational research environment with ADA and GPT3.5

Sources:
- How_to_format_inputs_to_ChatGPT_models: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb
- Question_answering_using_embeddings: https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb

In [296]:
# TODO: come up with complexity control of the API call (count the number of tokens, manage contexts and so on)
# TODO: implement chat memory (sqlite3)

In [297]:
import pandas as pd
import numpy as np
import openai, pickle, tiktoken, ast

from pathlib import Path
from tqdm.notebook import tqdm

import re, sqlite3

from pdfminer.high_level import extract_text

from collections import defaultdict

from datetime import date, datetime

import pprint

import utils as utl
import params as prm

In [298]:
#Use your own key
# read secret key from secret file 
with open('secret.txt', 'r') as f:
    openai.api_key = f.read()

In [299]:
# Returns the current local date
today = date.today()

print("Today date is: ", today)

Today date is:  2023-03-14


In [300]:
session_name = 'computational_phenomenology'
session_date = today

In [301]:
# Connect to user's database
conn = utl.create_connection(prm.DB_PTH) # Create connection to the database

utl.create_table(conn, prm.SESSION_TABLE_SQL) # Create table if it does not exist
# Create interaction table for the session
utl.create_table(conn, f"CREATE TABLE IF NOT EXISTS interaction_{session_name} (session_name, interaction_type, text, embedding, num_tokens_oai)")

tables = utl.parse_tables(conn) # Grab tables to check if the table was created
conn.close()

In [302]:
tables

[('session',),
 ('interaction_computational_phenomenology',),
 ('context_computational_phenomenology',)]

In [303]:
# DB INTERACTION
# Write session data to the database
conn = utl.create_connection(prm.DB_PTH) # Create connection to the database
utl.insert_session(conn, session_name, session_date) # Insert session data  
conn.close()

Session: "computational_phenomenology" already in the database


## 1. Prepare document for processing

In [304]:
# Logic for chapter capture
# digit_word_rgx = r'^\d+(\.\d+)*\s+[A-Z].*$|Abstract'
digit_word_rgx = r'^\d+(\.\d+)*\s[a-zA-Z]+.*$|Abstract'

In [305]:
# Load context document and grab chapter names
doc_name = 'Rejecting Cognitivism - Computational.pdf'

pdf_text = extract_text(prm.IN_PTH / doc_name)

# TODO: user should decide which split logic to use: random split (text be be cut into equal parts containing more-less the same number of tokens) or split by chapters (chapter inference is not always correct as pdfs are not always formated nicely). 
# TODO: Implement interface for split choice + equal chunking function
chapters = utl.grab_chapters(pdf_text, matching_logic=2)

In [306]:
# See what chapters got captured from the pdf
chapters

['Abstract',
 '2 Cognitivism and neuro-representationalism in deep',
 '3 The phenomenological critique of cognitivism: im-',
 '4 Computational phenomenology and deep learning',
 '270 degrees) of rotated images, also relies on the orientation of objects and',
 '4.2 Sartrean imagination: conscious re-presentation of sedi-',
 '4.3 Language: concepts as conscious re-presentations',
 '5 A new toolkit for deep learning and a novel math-']

In [307]:
# TODO: interface for selecting chapters to be used for text fragmentation. 
chapters = [
    'Abstract',
    '1 Introduction',
    '2 Cognitivism and neuro-representationalism in deep',
    '3 The phenomenological critique of cognitivism: im-',
    '4 Computational phenomenology and deep learning',
    '4.1 Merleau-pontian perception: a new setting in which to consider learning',
    '4.2 Sartrean imagination: conscious re-presentation of sedi-',
    '4.3 Language: concepts as conscious re-presentations',
    '5 A new toolkit for deep learning and a novel math-'
]
chapters = [x.lower().strip() for x in chapters]
print(f"There are {len(chapters)} chapters in the document")


# Set a specific starting point of the document
# start = 'abstract'
start = chapters[0]

start = start.lower().strip()

There are 9 chapters in the document


In [308]:
# TODO: make this text pre-processing function
# define the text to be parsed
text = [] 
for line in pdf_text.split('\n'):
    # if len(line)<=1: #get rid of junk lines
    #     continue
    line = line.replace('\t', ' ')
    line = line.strip().lower()
    text.append(line)

text = ' '.join(text)
# replace newline and tab characters with space
text = text.replace('\n', '')
text = text.replace('\t', '')
text = re.sub(r'\s+', ' ', text)

In [309]:
# TODO: turn it into a function that will get triggered whe user decides to chop the document by chapter names

# Fragment the text according to the logic the user defined (currently - by chapters)

# join the end strings with | to form chapter end regex pattern
end_pattern = "|".join(chapters)

# match text between chapter and any end string
chapters_contents = {}
for string in chapters:

    pattern = rf"{string}(.*?)(" + end_pattern + "|$)"
    pattern = re.compile(pattern)



    # search for the pattern in the text
    match = pattern.search(text)

    # if there is a match, extract the text between string and any end-string
    if match:
        # get the first group of the match object, which is the text between given chapter and any end string
        result = match.group(1)

        # print or save or do whatever you want with the result
        chapters_contents[string] = result

In [310]:
#TODO: come up with test that checks if I grabbed all the chapters
fetched_chapters = [x for x in chapters_contents.keys()] 
# compare element wise fetched_chapters with chapters
[x for x in chapters if x not in fetched_chapters]

[]

In [311]:
# Manually inspect some chapters
chapters_contents['5 a new toolkit for deep learning and a novel math-'][-100:]

'ics in computational intelligence (vol. 5, issue 5). https://doi.org/10.1109/ tetci.2021.3100641 30 '

In [312]:
# Grab contents into a dataframe
chapter_contents_df = pd.DataFrame(chapters_contents, index=['contents'])
chapter_contents_df = chapter_contents_df.T
chapter_contents_df

Unnamed: 0,contents
abstract,we propose a non-representationalist framewor...
1 introduction,"in the past years, deep learning (dl) has ach..."
2 cognitivism and neuro-representationalism in deep,learning dl is concerned with the design and ...
3 the phenomenological critique of cognitivism: im-,plications for deep learning as seen in the p...
4 computational phenomenology and deep learning,"in the previous section, we have highlighted ..."
4.1 merleau-pontian perception: a new setting in which to consider learning,merleau-ponty’s phenomenology gives a central...
4.2 sartrean imagination: conscious re-presentation of sedi-,mented experiences if anns are thought to mod...
4.3 language: concepts as conscious re-presentations,computational phenomenology also has to addre...
5 a new toolkit for deep learning and a novel math-,ematization of cognition computational phenom...


In [313]:
# Create a token count column
chapter_contents_df['num_tokens_oai'] = chapter_contents_df['contents'].apply(
    lambda x: utl.num_tokens_from_messages([{'message': x}])
)

In [314]:
# For instances with token count > token_thres, split them so they fit model threshold so we could get their embeddings
# TODO: make it split actually by tokens, not by characters

token_thres = 1000

chapter_contents_df['split_factor'] = 1
chapter_contents_df.loc[chapter_contents_df['num_tokens_oai'] > token_thres, 'split_factor'] = round(chapter_contents_df['num_tokens_oai']/token_thres,0)


chapter_contents_df['contents_split'] = chapter_contents_df.apply(
    lambda x: utl.split_contents(x), axis=1)


chapter_contents_long_df = chapter_contents_df.explode(
    column='contents_split'
)[['contents_split']]

In [315]:
# Create a token count column
chapter_contents_long_df['num_tokens_oai'] = chapter_contents_long_df['contents_split'].apply(
    lambda x: utl.num_tokens_from_messages([{'message': x}])
)

chapter_contents_long_df

Unnamed: 0,contents_split,num_tokens_oai
abstract,we propose a non-representationalist framewor...,241
1 introduction,"in the past years, deep learning (dl) has ach...",1130
1 introduction,"cited paper, “deep learning” for example, anns...",986
1 introduction,,7
2 cognitivism and neuro-representationalism in deep,learning dl is concerned with the design and ...,1010
2 cognitivism and neuro-representationalism in deep,’s missing element and can be thought of as so...,1073
2 cognitivism and neuro-representationalism in deep,often thought to be the key element to human-...,986
2 cognitivism and neuro-representationalism in deep,.,8
3 the phenomenological critique of cognitivism: im-,plications for deep learning as seen in the p...,1079
3 the phenomenological critique of cognitivism: im-,discovers real objective features of the worl...,1027


In [316]:
chapter_contents_long_df['text'] = "CHAPTER: " +  chapter_contents_long_df.index + " CONTENT: " + chapter_contents_long_df['contents_split']

In [317]:
# Drop contents_split column
chapter_contents_long_df = chapter_contents_long_df.drop(columns=['contents_split'])
# Reset index so chapter names are stored in columns
chapter_contents_long_df = chapter_contents_long_df.reset_index()
# Rename index column to chapter
chapter_contents_long_df = chapter_contents_long_df.rename(columns={'index': 'chapter'})
# Add session_name column
chapter_contents_long_df['session_name'] = session_name
## Add interaction type column
chapter_contents_long_df['interaction_type'] = 'source'
## Drop rows where num_tokens_oai is less than 100
chapter_contents_long_df = chapter_contents_long_df[chapter_contents_long_df['num_tokens_oai'] > 50].copy()

In [318]:
chapter_contents_long_df

Unnamed: 0,chapter,num_tokens_oai,text,session_name,interaction_type
0,abstract,241,CHAPTER: abstract CONTENT: we propose a non-r...,computational_phenomenology,source
1,1 introduction,1130,CHAPTER: 1 introduction CONTENT: in the past ...,computational_phenomenology,source
2,1 introduction,986,"CHAPTER: 1 introduction CONTENT: cited paper, ...",computational_phenomenology,source
4,2 cognitivism and neuro-representationalism in...,1010,CHAPTER: 2 cognitivism and neuro-representatio...,computational_phenomenology,source
5,2 cognitivism and neuro-representationalism in...,1073,CHAPTER: 2 cognitivism and neuro-representatio...,computational_phenomenology,source
6,2 cognitivism and neuro-representationalism in...,986,CHAPTER: 2 cognitivism and neuro-representatio...,computational_phenomenology,source
8,3 the phenomenological critique of cognitivism...,1079,CHAPTER: 3 the phenomenological critique of co...,computational_phenomenology,source
9,3 the phenomenological critique of cognitivism...,1027,CHAPTER: 3 the phenomenological critique of co...,computational_phenomenology,source
11,4 computational phenomenology and deep learning,827,CHAPTER: 4 computational phenomenology and dee...,computational_phenomenology,source
12,4 computational phenomenology and deep learning,830,CHAPTER: 4 computational phenomenology and dee...,computational_phenomenology,source


## 2. Get embeddings for each content piece

In [319]:
contents_for_embed_df = chapter_contents_long_df[['text']]

In [320]:
# Calculate the cost of running the model to get embeddings
(chapter_contents_long_df['num_tokens_oai'].sum() / 1000) * 0.0004

0.008430400000000001

In [321]:
# Commented out because I already ran this
# If you want to get embeddings on a different file, 
# uncomment this and run it

# Embed each chapter


# rng = tqdm(range(0,len(contents_for_embed_df)))

# contents_embedded = {}

# for i in rng:
#     txt_chapter = contents_for_embed_df.index[i]
#     txt_list = contents_for_embed_df.iloc[i].to_list()

#     txt_embed = utl.get_embedding(txt_list)


# # Join embeddings with context table
# contents_embedded[txt_chapter] = txt_embed
# embeded_s = pd.Series(contents_embedded, index=contents_embedded.keys())

# # Merge embeddings with chapter contents
# chapter_contents_long_df['embedding'] = embeded_s
# chapter_contents_long_df.head()

# # Save embeddings
# chapter_contents_long_df.to_csv(f'./data/{session_name}_embeded.csv')

## 3. Find most similar document embeddings to the question embedding

In [322]:
# Read embeddings
chapter_contents_long_df = pd.read_csv(prm.D_PTH  / f'{session_name}_embeded.csv', index_col=0)

In [323]:
# DB INTERACTION
# Insert context into DB
conn = utl.create_connection(prm.DB_PTH)
utl.insert_context(conn, session_name, chapter_contents_long_df)
conn.close()

Context table for session: "computational_phenomenology" created


In [325]:
# POC - conversational interface

## User input
question = input("> ")



## Fetch recal table so we can compare user input to embeddings saved in it and fetch the right context.
recal_table = utl.fetch_recall_table(session_name)

## Chop recall table to only include contexts for sources, user, or assistant
recal_table_source = recal_table[recal_table['interaction_type'] == 'source']
recal_table_user = recal_table[recal_table['interaction_type'] == 'user']
recal_table_assistant = recal_table[recal_table['interaction_type'] == 'assistant']

recal_embed_source = utl.convert_table_to_dct(recal_table_source)
recal_embed_user = utl.convert_table_to_dct(recal_table_user)
recal_embed_assistant = utl.convert_table_to_dct(recal_table_assistant)

## Get the context from recal table that is the most similar to user input

## Get SRC context
if len(recal_embed_source) == 0:
    recal_source = 'No context found'
else:
    recal_source_id = utl.order_document_sections_by_query_similarity(question, recal_embed_source)[0][1]
    recal_source = recal_table.loc[recal_source_id]['text']
## GET QRY context
if len(recal_embed_user) == 0:
    recal_user = 'No context found'
else:
    recal_user_id = utl.order_document_sections_by_query_similarity(question, recal_embed_user)[0][1]
    recal_user = recal_table.loc[recal_user_id]['text']
## GET RPL context
if len(recal_embed_assistant) == 0:
    recal_assistant = 'No context found'
else:
    recal_assistant_id = utl.order_document_sections_by_query_similarity(question, recal_embed_assistant)[0][1]
    recal_assistant = recal_table.loc[recal_assistant_id]['text']



## Grab chapter name if it exists, otherwise use session name
## It will become handy when user wants to know from which chapter the context was taken
source_chapter_val = recal_table.loc[recal_source_id]['chapter']


###############################################################################
# Set-up system prompts. This is done once for the whole session and the setup depends on the type of assistant chosen by the user.
# I comnment this out as this needs to be saved in the interaction table and I will implement this after user-assistant interactions are implemented.
#
# sys_message = [
#     {'role': 'system', 'content': f"You are a helpful assistant. You provide only factual information. If you do not know the answer, you say it."}
#     ]
# api_response = openai.ChatCompletion.create(
#     model="gpt-3.5-turbo",
#     messages=sys_message
# )
###############################################################################

## Form user message based on recaled context and user's input
usr_message = [
    {
        'role': 'system', 
        'content': "You are a helpful assistant. You provide only factual information. If you do not know the answer, you say it. I provide my input after INP tag. I will pass the context you will use in your answer. I encode it with following tags: SRC - sources we are talking about, they have the priority above other contexts; QRY - one of previous inputs I passed to you in the conversation; RPL - one of your previous replies to my questions from the conversation."
        },
    {   
        "role": "user", 
        "content": f"SRC: {recal_source}. QRY: {recal_user}. RPL: {recal_assistant}. INP: {question}"
        }
    ]
# Grab call user content from messages alias
usr_message_content = usr_message[0]['content']


## Make API call with formed user message
api_response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=usr_message
)


# Open DB so the assistant can remember the conversation
conn = utl.create_connection(prm.DB_PTH)
# Insert user message into DB so we can use it for another user's input
utl.insert_interaction(
    conn, 
    session_name, 
    'user', 
    question
    )
# Insert model's response into DB so we can use it for another user's input
utl.insert_interaction(
    conn,
    session_name,
    'assistant',
    api_response['choices'][0]['message']['content']
    )
conn.close()




## Print CALL and RESPONSE
pprint.pprint('USER:')
pprint.pprint(question)
pprint.pprint('---------------------------------')
pprint.pprint('ASSISTANT:')
pprint.pprint(api_response['choices'][0]['message']['content'])
## Print additional context data:
pprint.pprint('---------------------------------')
pprint.pprint('CONTEXT:')
pprint.pprint(f"SOURCE: {recal_source}")
pprint.pprint(f"USER: {recal_user}")
pprint.pprint(f"ASSISTANT: {recal_assistant}")


Interaction type: "user" inserted into the database for session: "computational_phenomenology"
Interaction type: "assistant" inserted into the database for session: "computational_phenomenology"
'USER:'
'What do representations of external entities mean?'
'---------------------------------'
'ASSISTANT:'
('Representations of external entities refer to encoding or symbolization of '
 'external properties or entities in the brain or artificial neural networks. '
 'According to neuro-representationalism, cognitive processes involve mental '
 'representations that encode these external properties or entities. However, '
 'the computational phenomenology approach rejects this idea and proposes a '
 'different perspective on how artificial neural networks work.')
'---------------------------------'
'CONTEXT:'
('SOURCE: CHAPTER: 2 cognitivism and neuro-representationalism in deep '
 'CONTENT: ’s missing element and can be thought of as some sort of elaborate '
 'turing machine that self-adapts