# GPT-3.5-Turbo Model
Creating a question answering chatbot using GPT-3.5. Adapted from: https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb

In [1]:
# Preamble
import PyPDF2 # For parsing PDF documents
import ast  # covert embeddings saved as strings back to arrays
import openai  # OpenAI API
import pandas as pd  # for storing text and embeddings data
import numpy as np # for df manipulations
import tiktoken  # for counting tokens
from scipy import spatial  # for calculating vector similarities for search
import wikipedia # For sourcing Wikipedia article text
import re  # for cutting <ref> links out of Wikipedia articles
import mwparserfromhell  # for splitting Wikipedia articles into sections
from copy import deepcopy # for copying dataframes

In [2]:
# Config
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"
SAVE_PATH = "assets/computer_vision_wiki_embeddings.csv"
ENCODING = tiktoken.encoding_for_model(GPT_MODEL)
BATCH_SIZE = 1000  # you can submit up to 2048 embedding inputs per request
ANSWER_NOT_FOUND_MSG = "I could not find an answer in the text I\'ve been provided, sorry! Please try again."

In [113]:
# creating a pdf reader instance
reader = PyPDF2.PdfReader('assets/online_notes.pdf')

# print the number of pages in pdf file
print(len(reader.pages))

# print the text of the first page
print(reader.pages[5].extract_text())

115
2.Veryfewvisual taskscanbesuccessfully performed inapurely data-driv en
way(\bottom-up" image analysis). Consider thenextimage example:
aged bytheirtextured backgrounds; thefoxes
occlude eachother; theyappearinseveraldieren tposesandperspective
angles; etc.Howcanthere possibly existmathematical operators forsuch
animage thatcan:
perform thegure-ground segmen tation ofthescene (intoitsobjects
andbackground)
inferthe3Darrangemen tsofobjectsfromtheirmutual occlusions
infersurface properties (texture, colour) fromthe2Dimage statistics
infervolumetric objectproperties fromtheir2Dimage projections
anddoallofthisin\real time?" (This matters quite alotinthe
natural world\redintoothandclaw,"sincesurviv aldependsonit.)
5


In [115]:
SECTIONS_TO_IGNORE = [
    "See also",
    "References",
    "External links",
    "Further reading",
    "Footnotes",
    "Bibliography",
    "Sources",
    "Citations",
    "Literature",
    "Footnotes",
    "Notes and references",
    "Photo gallery",
    "Works cited",
    "Photos",
    "Gallery",
    "Notes",
    "References and sources",
    "References and notes",
]

WIKI_PAGE = "Computer vision"

def all_subsections_from_section(
    section: mwparserfromhell.wikicode.Wikicode,
    parent_titles: list[str],
    sections_to_ignore: set[str],
) -> list[tuple[list[str], str]]:
    """
    From a Wikipedia section, return a flattened list of all nested subsections.
    Each subsection is a tuple, where:
        - the first element is a list of parent subtitles, starting with the page title
        - the second element is the text of the subsection (but not any children)
    """
    headings = [str(h) for h in section.filter_headings()]
    title = headings[0]
    if title.strip("=" + " ") in sections_to_ignore:
        # ^wiki headings are wrapped like "== Heading =="
        return []
    titles = parent_titles + [title]
    full_text = str(section)
    section_text = full_text.split(title)[1]
    if len(headings) == 1:
        return [(titles, section_text)]
    else:
        first_subtitle = headings[1]
        section_text = section_text.split(first_subtitle)[0]
        results = [(titles, section_text)]
        for subsection in section.get_sections(levels=[len(titles) + 1]):
            results.extend(all_subsections_from_section(subsection, titles, sections_to_ignore))
        return results

def all_subsections_from_title(
    title: str = WIKI_PAGE,
    sections_to_ignore: set[str] = SECTIONS_TO_IGNORE,
) -> list[tuple[list[str], str]]:
    """From a Wikipedia page title, return a flattened list of all nested subsections.
    Each subsection is a tuple, where:
        - the first element is a list of parent subtitles, starting with the page title
        - the second element is the text of the subsection (but not any children)
    """
    site = wikipedia.page(title, auto_suggest=False)
    text = site.content
    parsed_text = mwparserfromhell.parse(text)
    headings = [str(h) for h in parsed_text.filter_headings()]
    if headings:
        summary_text = str(parsed_text).split(headings[0])[0]
    else:
        summary_text = str(parsed_text)
    results = [([title], summary_text)]
    for subsection in parsed_text.get_sections(levels=[2]):
        results.extend(all_subsections_from_section(subsection, [title], sections_to_ignore))
    return results

# split pages into sections
# may take ~1 minute per 100 articles
wikipedia_sections = []
wikipedia_sections.extend(all_subsections_from_title(WIKI_PAGE))
print(f"Found {len(wikipedia_sections)} sections.")
print(wikipedia_sections)

Found 24 sections.


In [116]:
# clean text
def clean_section(section: tuple[list[str], str]) -> tuple[list[str], str]:
    """
    Return a cleaned up section with:
        - <ref>xyz</ref> patterns removed
        - leading/trailing whitespace removed
    """
    titles, text = section
    text = re.sub(r"<ref.*?</ref>", "", text)
    text = text.strip()
    return (titles, text)


wikipedia_sections = [clean_section(ws) for ws in wikipedia_sections]

# filter out short/blank sections
def keep_section(section: tuple[list[str], str]) -> bool:
    """Return True if the section should be kept, False otherwise."""
    titles, text = section
    if len(text) < 16:
        return False
    else:
        return True


original_num_sections = len(wikipedia_sections)
wikipedia_sections = [ws for ws in wikipedia_sections if keep_section(ws)]
print(f"Filtered out {original_num_sections-len(wikipedia_sections)} sections, leaving {len(wikipedia_sections)} sections.")

Filtered out 1 sections, leaving 23 sections.


In [117]:
# print example data
for ws in wikipedia_sections[:5]:
    print(ws[0])
    display(ws[1][:77] + "...")
    print()

['Computer vision']


'Computer vision tasks include methods for acquiring, processing, analyzing an...'


['Computer vision', '== Definition ==']


'Computer vision is an interdisciplinary field that deals with how computers c...'


['Computer vision', '== History ==']


'In the late 1960s, computer vision began at universities that were pioneering...'


['Computer vision', '== Related fields ==', '=== Solid-state physics ===']


'Solid-state physics is another field that is closely related to computer visi...'


['Computer vision', '== Related fields ==', '=== Neurobiology ===']


'Neurobiology has greatly influenced the development of computer vision algori...'




In [4]:
# Used throughout
def num_tokens(
        text: str,
        encoding: tiktoken.encoding_for_model = tiktoken.encoding_for_model(GPT_MODEL)
) -> int:
    """Returns the number of tokens in a string."""
    return len(encoding.encode(text))

In [118]:
# recursively split long sections into smaller sections.
def halved_by_delimiter(string: str, delimiter: str = "\n") -> list[str, str]:
    """Split a string in two, on a delimiter, trying to balance tokens on each side."""
    chunks = string.split(delimiter)
    if len(chunks) == 1:
        return [string, ""]  # no delimiter found
    elif len(chunks) == 2:
        return chunks  # no need to search for halfway point
    else:
        total_tokens = num_tokens(string)
        halfway = total_tokens // 2
        best_diff = halfway
        for i, chunk in enumerate(chunks):
            left = delimiter.join(chunks[: i + 1])
            left_tokens = num_tokens(left)
            diff = abs(halfway - left_tokens)
            if diff >= best_diff:
                break
            else:
                best_diff = diff
        left = delimiter.join(chunks[:i])
        right = delimiter.join(chunks[i:])
        return [left, right]


def truncated_string(
    string: str,
    model: str,
    max_tokens: int,
    print_warning: bool = True,
) -> str:
    """Truncate a string to a maximum number of tokens."""
    encoding = tiktoken.encoding_for_model(model)
    encoded_string = encoding.encode(string)
    truncated_string = encoding.decode(encoded_string[:max_tokens])
    if print_warning and len(encoded_string) > max_tokens:
        print(f"Warning: Truncated string from {len(encoded_string)} tokens to {max_tokens} tokens.")
    return truncated_string


def split_strings_from_subsection(
    subsection: tuple[list[str], str],
    max_tokens: int = 1000,
    model: str = GPT_MODEL,
    max_recursion: int = 5,
) -> list[str]:
    """
    Split a subsection into a list of subsections, each with no more than max_tokens.
    Each subsection is a tuple of parent titles [H1, H2, ...] and text (str).
    """
    titles, text = subsection
    string = "\n\n".join(titles + [text])
    num_tokens_in_string = num_tokens(string)
    # if length is fine, return string
    if num_tokens_in_string <= max_tokens:
        return [string]
    # if recursion hasn't found a split after X iterations, just truncate
    elif max_recursion == 0:
        return [truncated_string(string, model=model, max_tokens=max_tokens)]
    # otherwise, split in half and recurse
    else:
        titles, text = subsection
        for delimiter in ["\n\n", "\n", ". "]:
            left, right = halved_by_delimiter(text, delimiter=delimiter)
            if left == "" or right == "":
                # if either half is empty, retry with a more fine-grained delimiter
                continue
            else:
                # recurse on each half
                results = []
                for half in [left, right]:
                    half_subsection = (titles, half)
                    half_strings = split_strings_from_subsection(
                        half_subsection,
                        max_tokens=max_tokens,
                        model=model,
                        max_recursion=max_recursion - 1,
                    )
                    results.extend(half_strings)
                return results
    # otherwise no split was found, so just truncate (should be very rare)
    return [truncated_string(string, model=model, max_tokens=max_tokens)]

# split sections into chunks
MAX_TOKENS = 1600
wikipedia_strings = []
for section in wikipedia_sections:
    wikipedia_strings.extend(split_strings_from_subsection(section, max_tokens=MAX_TOKENS))

print(f"{len(wikipedia_sections)} Wikipedia sections split into {len(wikipedia_strings)} strings.")

23 Wikipedia sections split into 23 strings.


In [119]:
# print example data
print(wikipedia_strings[1])

Computer vision

== Definition ==

Computer vision is an interdisciplinary field that deals with how computers can be made to gain high-level understanding from digital images or videos. From the perspective of engineering, it seeks to automate tasks that the human visual system can do. "Computer vision is concerned with the automatic extraction, analysis and understanding of useful information from a single image or a sequence of images. It involves the development of a theoretical and algorithmic basis to achieve automatic visual understanding." As a scientific discipline, computer vision is concerned with the theory behind artificial systems that extract information from images. The image data can take many forms, such as video sequences, views from multiple cameras, or multi-dimensional data from a medical scanner. As a technological discipline, computer vision seeks to apply its theories and models for the construction of computer vision systems.


In [120]:
# Now that we've split our library into shorter self-contained strings, we can compute embeddings for each.
# calculate embeddings

embeddings = []
for batch_start in range(0, len(wikipedia_strings), BATCH_SIZE):
    batch_end = batch_start + BATCH_SIZE
    batch = wikipedia_strings[batch_start:batch_end]
    print(f"Batch {batch_start} to {batch_end-1}")
    response = openai.Embedding.create(model=EMBEDDING_MODEL, input=batch)
    for i, be in enumerate(response["data"]):
        assert i == be["index"]  # double check embeddings are in same order as input
    batch_embeddings = [e["embedding"] for e in response["data"]]
    embeddings.extend(batch_embeddings)

df = pd.DataFrame({"text": wikipedia_strings, "embedding": embeddings})

Batch 0 to 999


In [147]:
df

Unnamed: 0,text,embedding,tokens,source
0,Article section:\n\nComputer vision\n\nCompute...,"[-0.015160824172198772, 0.003632682841271162, ...",291,Wikipedia
1,Article section:\n\nComputer vision\n\n== Defi...,"[-0.006095150485634804, 0.0008000890375114977,...",167,Wikipedia
2,Article section:\n\nComputer vision\n\n== Hist...,"[-0.0071233357302844524, -0.003958832006901503...",516,Wikipedia
3,Article section:\n\nComputer vision\n\n== Rela...,"[0.0027573341503739357, -0.0004955596523359418...",136,Wikipedia
4,Article section:\n\nComputer vision\n\n== Rela...,"[-0.0025333885569125414, -0.005423863884061575...",308,Wikipedia
5,Article section:\n\nComputer vision\n\n== Rela...,"[-0.024513425305485725, 0.0005832292954437435,...",118,Wikipedia
6,Article section:\n\nComputer vision\n\n== Rela...,"[0.0037523966748267412, -0.018352391198277473,...",80,Wikipedia
7,Article section:\n\nComputer vision\n\n== Rela...,"[0.003763288725167513, -0.004463814198970795, ...",134,Wikipedia
8,Article section:\n\nComputer vision\n\n== Rela...,"[-0.01610185205936432, -0.0026065742131322622,...",654,Wikipedia
9,Article section:\n\nComputer vision\n\n== Appl...,"[-0.011650911532342434, 0.0027732541784644127,...",281,Wikipedia


In [122]:
# save document chunks and embeddings
df.to_csv(SAVE_PATH, index=False)

# Search
Now we'll define a search function that:

Takes a user query and a dataframe with text & embedding columns
Embeds the user query with the OpenAI API
Uses distance between query embedding and text embeddings to rank the texts
Returns two lists:
The top N texts, ranked by relevance
Their corresponding relevance scores

In [10]:
class ChatBot:
    def __init__(self, chatbot_topic:str, knowledge_path: str):
        self.knowledge = None
        self.load_data(knowledge_path)
        self.chatbot_topic = chatbot_topic

    def load_data(self, path: str):
        """Loads the knowledge df, appends a prefix, and calculates the number of tokens per section of knowledge"""

        # load data from csv
        self.knowledge = pd.read_csv(path)
        # convert embeddings from CSV str type back to list type
        self.knowledge['embedding'] = self.knowledge['embedding'].apply(ast.literal_eval) # is this needed?

        # Format the knowledge df by adding section prefix and token sizes
        self.knowledge['text'] = 'Article section:\n\n' + self.knowledge['text']
        self.knowledge['tokens'] = self.knowledge["text"].apply(lambda x: num_tokens(x))
        self.knowledge['source'] = 'Wikipedia'

In [11]:
class Query:
    def __init__(self, query_text: str, chatbot_instance: ChatBot):
        self.content: str = query_text
        self.model: str = GPT_MODEL
        self.knowledge: pd.DataFrame = chatbot_instance.knowledge
        self.token_limit: int = 4096 - 500 # Allows 500 for the response
        self.gpt_message = None
        self.knowledge_used = None

    # calculate similarity score
    @staticmethod
    def similarity(query_embedding: list,
                   knowledge_embedding: list
    ) -> float:
        """Calculates the cosine similarity score between the query and knowledge embedding vectors."""

        return 1- spatial.distance.cosine(query_embedding, knowledge_embedding)

    # find the most similar sections of knowledge to the query
    def knowledge_ranked_by_similarity(self,
        max_num_sections: int = 5
    ): # -> pd.DataFrame
        """Take the raw knowledge dataframe, calculates similarity scores between the query and the sections, and returns a dataframe ordered from highest to lowest in terms of similarity."""

        query_embedding_response = openai.Embedding.create(
            model=EMBEDDING_MODEL,
            input=self.content,
        )
        query_embedding = query_embedding_response["data"][0]["embedding"]
        knowledge_with_similarities = deepcopy(self.knowledge) # To prevent adapting the original dataframe
        knowledge_with_similarities["similarity"] = knowledge_with_similarities["embedding"].apply(lambda x: self.similarity(query_embedding, x))
        knowledge_with_similarities.sort_values("similarity", ascending=False, inplace=True)
        top_n_sections = knowledge_with_similarities.head(max_num_sections)
        self.knowledge_used = top_n_sections

    def get_gpt_message(
            self,
            chatbot_topic: str
    ): # -> tuple[str, pd.DataFrame]
        """Uses the most relevant texts from the knowledge dataframe to construct a message that can then be fed into GPT."""

        self.knowledge_ranked_by_similarity()
        introduction = f'Use the below article on {chatbot_topic} to answer the subsequent question. If the answer cannot be found in the articles, write "{ANSWER_NOT_FOUND_MSG}". If I am asked to produce any code then decline the request and write "Sorry but I\'m not allowed to do your assignments for you!"' # The longer this is, the more tokens it uses!
        question = f"\n\nQuestion: {self.content}"

        # Ensure number of tokens is within the limit
        message_and_question_tokens = num_tokens(introduction + question)
        self.knowledge_used['cumulative_tokens'] = self.knowledge_used['tokens'].cumsum()
        self.knowledge_used['cumulative_tokens'] += message_and_question_tokens # add the inital number of tokens
        self.knowledge_used= self.knowledge_used.loc[self.knowledge_used['cumulative_tokens']<self.token_limit]

        # Construct output
        combined_knowledge_string = ''.join(list(self.knowledge_used['text']))
        combined_knowledge_string = '\n\n' + combined_knowledge_string
        self.gpt_message = introduction + combined_knowledge_string + question

    @classmethod
    def ask(
            cls,
            query_text: str,
            chatbot_instance: ChatBot,
            show_source: bool = True,
    ) -> str:
        """Uses GPT and a dataframe of relevant texts/embeddings (its knowledge) to answer a query."""

        query = cls(query_text, chatbot_instance)
        query.get_gpt_message(chatbot_instance.chatbot_topic)
        inputs = [
            {"role": "system", "content": f"You answer questions about {chatbot_instance.chatbot_topic}."},
            {"role": "user", "content": query.gpt_message},
        ]
        response = openai.ChatCompletion.create(
            model=query.model,
            messages=inputs,
            temperature=0 # We don't want any creativity in the answers
        )
        response_message = response["choices"][0]["message"]["content"]
        total_tokens_used = response['usage']['total_tokens']
        if show_source and response_message!=ANSWER_NOT_FOUND_MSG: # Display the sources used:
            query.knowledge_used['index'] = np.arange(len(query.knowledge_used))+1
            query.knowledge_used['output'] = '\n\n' + query.knowledge_used['index'].astype(str) + '. ' + query.knowledge_used['source'] + ' (Specifically: ' + query.knowledge_used['text'].str[len('Article section: \n'):100] + '...)'
            sources_string = ''.join(list(query.knowledge_used['output']))
            response_message += f'\n\nTo construct this answer, I used the following documents: {sources_string}'
        response_message += f"\n\nTotal tokens used: {total_tokens_used}"
        return response_message

CompVisionGPT = ChatBot("Computer Vision", SAVE_PATH)
print(Query.ask('When did universities begin teaching Computer Vision?', CompVisionGPT, show_source=True))
# I need to make it more efficient on the number of tokens.

# Todo:
# Substantially tweak the chunking code
# Adapt it for more sources (e.g. PDF)

Computer vision began at universities that were pioneering artificial intelligence in the late 1960s.

To construct this answer, I used the following documents: 

1. Wikipedia (Specifically: Computer vision

== History ==

In the late 1960s, computer vision began at univer...)

2. Wikipedia (Specifically: Computer vision

== Related fields ==

=== Neurobiology ===

Neurobiology has grea...)

3. Wikipedia (Specifically: Computer vision

== Related fields ==

=== Solid-state physics ===

Solid-state ph...)

4. Wikipedia (Specifically: Computer vision

== Related fields ==

=== Other fields ===

Besides the above-men...)

5. Wikipedia (Specifically: Computer vision

== Hardware ==

There are many kinds of computer vision systems; ...)

Total tokens used: 1612


In [9]:
CompVisionGPT.knowledge

Unnamed: 0,text,embedding,tokens,source
0,Article section:\n\nComputer vision\n\nCompute...,"[-0.015160824172198772, 0.003632682841271162, ...",291,Wikipedia
1,Article section:\n\nComputer vision\n\n== Defi...,"[-0.006095150485634804, 0.0008000890375114977,...",167,Wikipedia
2,Article section:\n\nComputer vision\n\n== Hist...,"[-0.0071233357302844524, -0.003958832006901503...",516,Wikipedia
3,Article section:\n\nComputer vision\n\n== Rela...,"[0.0027573341503739357, -0.0004955596523359418...",136,Wikipedia
4,Article section:\n\nComputer vision\n\n== Rela...,"[-0.0025333885569125414, -0.005423863884061575...",308,Wikipedia
5,Article section:\n\nComputer vision\n\n== Rela...,"[-0.024513425305485725, 0.0005832292954437435,...",118,Wikipedia
6,Article section:\n\nComputer vision\n\n== Rela...,"[0.0037523966748267412, -0.018352391198277473,...",80,Wikipedia
7,Article section:\n\nComputer vision\n\n== Rela...,"[0.003763288725167513, -0.004463814198970795, ...",134,Wikipedia
8,Article section:\n\nComputer vision\n\n== Rela...,"[-0.01610185205936432, -0.0026065742131322622,...",654,Wikipedia
9,Article section:\n\nComputer vision\n\n== Appl...,"[-0.011650911532342434, 0.0027732541784644127,...",281,Wikipedia


In [7]:
print(Query.ask('Who is Boris Johnson', CompVisionGPT, show_source=True))

I could not find an answer in the text I've been provided, sorry! Please try again.

Total tokens used: 1460
