# GPT-3.5-Turbo Model
Creating a question answering chatbot using GPT-3.5. Adapted from: https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb

Text here

In [6]:
# Preamble
# pip install PyPDF2 openai wikipedia mwparserfromhell transformers torch pandas scipy # tiktoken
import PyPDF2 # For parsing PDF documents!
import ast  # covert embeddings saved as strings back to arrays
import openai  # OpenAI API
import pandas as pd  # for storing text and embeddings data
import numpy as np # for df manipulations
import tiktoken  # for counting tokens
from scipy import spatial  # for calculating vector similarities for search
import wikipedia # For sourcing Wikipedia article text
import re  # for cutting <ref> links out of Wikipedia articles
import mwparserfromhell  # for splitting Wikipedia articles into sections
from copy import deepcopy # for copying dataframes
import torch # for BERT's argmax and tensors
from transformers import BertForQuestionAnswering, BertTokenizer # For BERT's tokeniser and model
from transformers import BartTokenizer, BartForConditionalGeneration # For BART's tokeniser and model
import torch # For creating neural networks with GPUs
import logging # For showing messages in the console

In [9]:
# Logging and GPU setup
logging.basicConfig(filename='main.log', level=logging.DEBUG) # , encoding='utf-8'
def log_and_print_message(msg):
        print(msg)
        logging.warning(msg)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
        log_and_print_message(f'Using GPU - details are as follows:')
        log_and_print_message(f'__CUDNN VERSION: {torch.backends.cudnn.version()}')
        log_and_print_message(f'__Number CUDA Devices: {torch.cuda.device_count()}')
        log_and_print_message(f'__CUDA Device Name: {torch.cuda.get_device_name(0)}')
        log_and_print_message(f'__CUDA Device Total Memory: {torch.cuda.get_device_properties(0).total_memory/1e9}')
        # model = NeuralNet(input_size, hidden_size, output_size).to(device)
else:
        log_and_print_message(f'No GPU - available, using a CPU')

No GPU - available, using a CPU


In [8]:
# Config
GPT_EMBEDDING_MODEL = "text-embedding-ada-002"
BERT_EMBEDDING_MODEL = 'bert-base-nli-mean-tokens'
GPT_MODEL = "gpt-3.5-turbo"
BERT_MODEL = "deepset/bert-base-cased-squad2"
BART_MODEL = 'vblagoje/bart_lfqa'
GPT_KNOWLEDGE_FILENAME = "CompVisionGPT.csv"
BERT_KNOWLEDGE_FILENAME = "CompVisionBERT.csv"
BERT_ENCODING = BertTokenizer.from_pretrained(BERT_MODEL)
GPT_ENCODING = tiktoken.encoding_for_model(GPT_MODEL)
BART_ENCODING = BartTokenizer.from_pretrained(BART_MODEL)
GPT_MAX_SECTION_TOKENS = 1600 # max number of tokens per section
GPT_QUERY_TOKEN_LIMIT = 4096 - 500 # Allows 500 for the response
BERT_MAX_SECTION_TOKENS = 460 # max tokens per section
# Need to include a check to ensure that the section length is less than the query length (plus the preamble for GPT)
MIN_LENGTH = 50 # min CHARACTER length for each section
ANSWER_NOT_FOUND_MSG = "I could not find an answer in the text I\'ve been provided, sorry! Please try again."
WIKI_PAGES = [
    'Computer vision',
    'Databases and indexing related concepts', 
    'Generic computer vision methods',
    'Geometric and other image features and methods',
    'Geometry and mathematics',
    'Image physics related concepts',
    'Image Processing Architectures & Control Structures',
    'Image transformations and filters',
    'Introductory visual neurophysiology',
    'Introductory visual psychophysics/psychology',
    'Motion and time sequence analysis related concepts',
    'Non-sequential realization methods',
    'Object, world and scene representations',
    'Recognition and registration methods',
    'Scene understanding/image analysis methods',
    'Sensor fusion, registration and planning methods',
    'Sensors and properties',
    'System models, calibration and parameter estimation methods',
    'Visual learning related methods and concepts'
    ]
WIKI_PAGE = "Computer vision"
SECTIONS_TO_IGNORE = [
    "See also",
    "References",
    "External links",
    "Further reading",
    "Footnotes",
    "Bibliography",
    "Sources",
    "Citations",
    "Literature",
    "Footnotes",
    "Notes and references",
    "Photo gallery",
    "Works cited",
    "Photos",
    "Gallery",
    "Notes",
    "References and sources",
    "References and notes",
    ]

OSError: Could not find a suitable TLS CA certificate bundle, invalid path: c:\Anaconda3\lib\site-packages\certifi\cacert.pem

In [4]:
# creating a pdf reader instance
output_file_folder = 'assets/'
uploaded_file = 'assets/online_notes.pdf'
# reader = PyPDF2.PdfReader(uploaded_file)
#
# # print the number of pages in pdf file
# print(len(reader.pages))
#
# # print the text of the first page
# print(reader.pages[5].extract_text())

# from pypdf import PdfReader
#
# reader = pypdf.PdfReader(uploaded_file)
# text = ""
# for page in reader.pages:
#     text += page.extract_text() + "\n"
# text

# import fitz

# text = ""
#
# doc = fitz.open(uploaded_file)
# from unidecode import unidecode
# output = []
# for page in doc:
#     output += page.get_text("blocks")
# previous_block_id = 0 # Set a variable to mark the block id
# for block in output:
#     if block[6] == 0: # We only take the text
#         if previous_block_id != block[5]: # Compare the block number
#            print("\n")
#            plain_text = unidecode(block[4])
#            print(plain_text)

import fitz
import pandas as pd
import re
spans = pd.DataFrame(columns=['xmin', 'ymin', 'xmax', 'ymax', 'text', 'tag'])
rows = []

# create an empty DataFrame
df = pd.DataFrame(columns=['Page', 'Section', 'Block text', 'Span text', 'Font size', 'Rectangle'])
block_dict = {}

# FIX THIS STUFF!!

# open the PDF file
with fitz.open(uploaded_file) as doc:
    page_num = 1

    # iterate through the pages
    for i, page in enumerate(doc):

        # get the page blocks
        output = page.get_text('dict')
        block = output['blocks']
        block_dict[page_num] = block
        page_num += 1

        # iterate through the blocks
        for b in blocks:

            # check if the block is a section header
            if b['size'] > 14 and b['height'] > 25:

                # add the section header to the DataFrame
                df = df.append({'Page': i+1, 'Section': b['text'], 'Block text': '', 'Span text': '',
                                'Font size': '', 'Rectangle': b['rect']}, ignore_index=True)

            # get the block text and rectangle coordinates
            block_text = b['text']
            block_rect = b['rect']

            # iterate through the block's spans
            for s in b['spans']:

                # get the span text and font size
                span_text = s['text']
                span_size = s['size']

                # add the information to the DataFrame
                df = df.append({'Page': i+1, 'Section': '', 'Block text': block_text, 'Span text': span_text,
                                'Font size': span_size, 'Rectangle': s['rect']}, ignore_index=True)

# print the DataFrame
print(df)

NameError: name 'blocks' is not defined

In [57]:
def is_float(value):
    """
    Check if a value is a float.
    """
    try:
        float(value)
        return True
    except ValueError:
        return False

from unidecode import unidecode # parsing symbols
doc = fitz.open(uploaded_file)
page_num = 0
page_numbers = []
content = []
output = []
for page in doc:
    page_num += 1
    blocks = page.get_text("blocks")
    for block in blocks:
        if block[6]==0:
            block_content = unidecode(block[4])
            stripped_block_content = block_content.replace('\n', '')
            if not stripped_block_content.isdigit() and not is_float(stripped_block_content):
                content.append(block_content)
                page_numbers.append(page_num)
            else:
                pass
    # previous_block_id = 0 # Set a variable to mark the block id
# for block in output:
#      if block[6] == 0: # We only take the text
#           if previous_block_id != block[5]: # Compare the block number
#               # print("\n")
#             pass
#           plain_text = unidecode(block[4])
#           page_numbers.append(page_num)
#           content.append(plain_text)
content_df = pd.DataFrame(
    {'Page Numbers': page_numbers,
     'Content': content,
    })
content_df['no_newlines'] = content_df['Content'].str.replace(r'\n', '', regex=True)
content_df

Unnamed: 0,Page Numbers,Content,no_newlines
0,1,Computer Vision\n,Computer Vision
1,1,Computer Science Tripos: 16 Lectures by J G Da...,Computer Science Tripos: 16 Lectures by J G Da...
2,1,1. Overview. Goals of computer vision; why the...,1. Overview. Goals of computer vision; why the...
3,1,"2. Image sensing, pixel arrays, CCD cameras. I...","2. Image sensing, pixel arrays, CCD cameras. I..."
4,1,"3. Biological visual mechanisms, from retina t...","3. Biological visual mechanisms, from retina t..."
...,...,...,...
1016,115,(i)\nThey are both linear integral expressions...,"(i)They are both linear integral expressions, ..."
1017,115,"(ii) In each case, the projection coefficient ...","(ii) In each case, the projection coefficient ..."
1018,115,(iii) The orthogonal basis for eigenface compu...,(iii) The orthogonal basis for eigenface compu...
1019,115,(iv) The eigenface representation does not use...,(iv) The eigenface representation does not use...


In [18]:
block_dict = {}
page_num = 1
for page in doc: # Iterate all pages in the document
      file_dict = page.get_text('dict') # Get the page dictionary
      block = file_dict['blocks'] # Get the block information
      block_dict[page_num] = block # Store in block dictionary
      page_num += 1 # Increase the page value by 1

In [33]:
import re
# spans = pd.DataFrame(columns=['xmin', 'ymin', 'xmax', 'ymax', 'text', 'tag'])
rows = []
for page_num, blocks in block_dict.items():
    for block in blocks:
        if block['type'] == 0:
            for line in block['lines']:
                for span in line['spans']:
                    xmin, ymin, xmax, ymax = list(span['bbox'])
                    font_size = span['size']
                    text = unidecode(span['text'])
                    span_font = span['font']
                    is_upper = False

                    is_bold = False
                    if "bold" in span_font.lower():
                        is_bold = True
                    if re.sub("[\(\[].*?[\)\]]", "", text).isupper():
                        is_upper = True
                    if text.replace(" ","") !=  "":
                        rows.append((xmin, ymin, xmax, ymax, text, is_upper, is_bold, span_font, font_size))
span_df = pd.DataFrame(rows, columns=['xmin','ymin','xmax','ymax', 'text', 'is_upper','is_bold','span_font', 'font_size'])


KeyboardInterrupt



In [21]:
span_df

Unnamed: 0,xmin,ymin,xmax,ymax,text,is_upper,is_bold,span_font,font_size
0,207.863998,50.356079,382.440033,71.039337,Computer Vision,False,False,CMBX12,20.6626
1,57.599998,99.768372,532.429199,117.001083,Computer Science Tripos: 16 Lectures by J G Da...,False,False,CMBX12~c,17.2155
2,70.253998,154.588379,527.770264,168.948914,1. Overview. Goals of computer vision; why the...,False,False,CMBX12~f,14.3462
3,70.254364,181.489380,497.032715,195.849915,"2. Image sensing, pixel arrays, CCD cameras. I...",False,False,CMBX12~f,14.3462
4,70.254730,208.381409,517.157776,222.741943,"3. Biological visual mechanisms, from retina t...",False,False,CMBX12~f,14.3462
...,...,...,...,...,...,...,...,...,...
6853,82.503143,592.908875,517.999084,604.875916,representations. But the added difficulty of t...,False,False,CMR12~330,11.9551
6854,82.503540,606.858887,518.052673,618.825928,"optics:"" building a 3-dimensional model from a...",False,False,CMR12~330,11.9551
6855,82.504028,620.808838,518.081665,632.775879,them). Besides the enormous computational and ...,False,False,CMR12~330,11.9551
6856,82.504242,634.758850,432.174744,646.725891,"for each such full 3D model), this is inherent...",False,False,CMR12~330,11.9551


In [22]:
span_scores = []
span_num_occur = {}
special = '[(_:/,#%\=@)]'
for index, span_row in span_df.iterrows():
    score = round(span_row.font_size)
    text = span_row.text
    if not re.search(special, text):
        if span_row.is_bold:
            score +=1
        if span_row.is_upper:
            score +=1
    span_scores.append(score)

In [23]:
# import numpy as np
values, counts = np.unique(span_scores, return_counts=True)
style_dict = {}

for value, count in zip(values, counts):
    style_dict[value] = count
sorted(style_dict.items(), key=lambda x: x[1])

[(21, 1),
 (9, 4),
 (16, 5),
 (17, 9),
 (7, 48),
 (6, 50),
 (13, 86),
 (11, 176),
 (15, 215),
 (8, 268),
 (10, 382),
 (12, 1937),
 (14, 3677)]

In [24]:
p_size = max(style_dict, key=style_dict.get)
idx = 0
tag = {}

for size in sorted(values, reverse = True):
    idx += 1
    if size == p_size:
        idx = 0
        tag[size] = 'p'
    if size > p_size:
        tag[size] = 'h{0}'.format(idx)
    if size < p_size:
        tag[size] = 's{0}'.format(idx)

In [25]:
tag

{21: 'h1',
 17: 'h2',
 16: 'h3',
 15: 'h4',
 14: 'p',
 13: 's1',
 12: 's2',
 11: 's3',
 10: 's4',
 9: 's5',
 8: 's6',
 7: 's7',
 6: 's8'}

In [27]:
span_tags = [tag[score] for score in span_scores]

span_df['tag'] = span_tags
span_df = span_df.loc[span_df['tag'].str.startswith('s')]

Unnamed: 0,xmin,ymin,xmax,ymax,text,is_upper,is_bold,span_font,font_size,tag
0,207.863998,50.356079,382.440033,71.039337,Computer Vision,False,False,CMBX12,20.6626,h1
1,57.599998,99.768372,532.429199,117.001083,Computer Science Tripos: 16 Lectures by J G Da...,False,False,CMBX12~c,17.2155,h2
2,70.253998,154.588379,527.770264,168.948914,1. Overview. Goals of computer vision; why the...,False,False,CMBX12~f,14.3462,p
3,70.254364,181.489380,497.032715,195.849915,"2. Image sensing, pixel arrays, CCD cameras. I...",False,False,CMBX12~f,14.3462,p
4,70.254730,208.381409,517.157776,222.741943,"3. Biological visual mechanisms, from retina t...",False,False,CMBX12~f,14.3462,p
...,...,...,...,...,...,...,...,...,...,...
6853,82.503143,592.908875,517.999084,604.875916,representations. But the added difficulty of t...,False,False,CMR12~330,11.9551,s2
6854,82.503540,606.858887,518.052673,618.825928,"optics:"" building a 3-dimensional model from a...",False,False,CMR12~330,11.9551,s2
6855,82.504028,620.808838,518.081665,632.775879,them). Besides the enormous computational and ...,False,False,CMR12~330,11.9551,s2
6856,82.504242,634.758850,432.174744,646.725891,"for each such full 3D model), this is inherent...",False,False,CMR12~330,11.9551,s2


In [32]:
headings_list = []
text_list = []
tmp = []
heading = ''
for index, span_row in span_df.iterrows():
    text = span_row.text
    tag = span_row.tag
    if 'h' in tag:
        headings_list.append(text)
        text_list.append('\n'.join(tmp))
        tmp = []
        heading = text
    else:
        tmp.append(text)
text_list.append('\n'.join(tmp))
text_list = text_list[1:]
text_df = pd.DataFrame(zip(headings_list, text_list),columns=['heading', 'content'] )
text_df

Exception ignored in: <function Document.__del__ at 0x000001D6088198B0>
Traceback (most recent call last):
  File "C:\Users\point\anaconda3\lib\site-packages\fitz\fitz.py", line 5761, in __del__
    if not type(self) is Document:
KeyboardInterrupt: 


KeyboardInterrupt: 

In [30]:
# Used throughout
def num_tokens(
        text: str,
        token_model = GPT_ENCODING
) -> int:
    """Returns the number of tokens in a string."""
    if token_model == GPT_ENCODING:
        return len(token_model.encode(text))
    elif token_model == BERT_ENCODING:
        return len(token_model.tokenize(text))

from sentence_transformers import SentenceTransformer
def get_embedding(content: list or str, embedding_model: str = GPT_EMBEDDING_MODEL):
    if embedding_model == GPT_EMBEDDING_MODEL:
        return openai.Embedding.create(input=content, model=embedding_model)
    else:
        similarity_model = SentenceTransformer(embedding_model)
        return similarity_model.encode(content)

In [19]:
class Knowledge:
    def __init__(self, topic, model):
        self.topic: str = topic
        self.model: str = model
        self.token_model = self.get_token_model()
        self.embedding_model: str = self.get_embedding_model()
        self.df: pd.DataFrame = self.get_blank_knowledge_df() # need to add code to remove small sections (<16 chars?)
        self.max_tokens: int = self.get_max_tokens() # max number of tokens per section
        self.min_section_length = MIN_LENGTH # min character length for each section

    def get_token_model(self):
        return GPT_ENCODING if self.model=='GPT' else BERT_ENCODING

    def get_max_tokens(self):
        return GPT_MAX_SECTION_TOKENS if self.model=='GPT' else BERT_MAX_SECTION_TOKENS

    def get_embedding_model(self):
        return GPT_EMBEDDING_MODEL if self.model=='GPT' else BERT_EMBEDDING_MODEL

    def get_blank_knowledge_df(self) -> pd.DataFrame:
        return pd.DataFrame(columns=['Source', 'Heading', 'Subheading', 'Content'])

    def extract_wiki_sections(self,
                              page_name: str,
                              content: mwparserfromhell.wikicode.Wikicode,
                              sections_to_ignore: list = SECTIONS_TO_IGNORE
                              ) -> pd.DataFrame:
        """Creates a df of sections by extracting section content from a Wikicode"""

        knowledge = self.get_blank_knowledge_df()
        for section in content.get_sections(levels=[2]):
            section_headings = section.filter_headings()
            section_header = str(section_headings[0])
            if len(section_headings)==1:# therefore a section title, not a subsection
                section = section.strip(section_header)
                if section_header.strip("=" + " ") not in sections_to_ignore: # append to df
                    new_row = {'Source': f'Wikipedia ({page_name})', 'Heading': section_header.strip("=" + " "), 'Content': section}
                    knowledge = pd.concat([knowledge, pd.DataFrame.from_records([new_row])])
            elif len(section_headings)>1 and section_header.strip("=" + " ") not in sections_to_ignore: # therefore subsections
                # Append the text before the first subsection
                initial_text = section.split(str(section_headings[1]))[0]
                initial_text = initial_text.strip(section_header)
                new_row = {'Source': f'Wikipedia ({page_name})', 'Heading': section_header.strip("=" + " "), 'Content': initial_text}
                knowledge = pd.concat([knowledge, pd.DataFrame.from_records([new_row])])
                for subsection in section.get_sections(levels=[3]):
                    subsection_sections = subsection.get_sections(levels=[3])[0]
                    subsection_headings = subsection_sections.filter_headings()
                    subsection_header = str(subsection_headings[0])
                    subsection = subsection.strip(subsection_header)
                    if subsection_header.strip("=" + " ") not in sections_to_ignore: # append to df
                        new_row = {'Source': f'Wikipedia ({page_name})', 'Heading': section_header.strip("=" + " "), 'Subheading': subsection_header.strip("=" + " "), 'Content': subsection}
                        knowledge = pd.concat([knowledge, pd.DataFrame.from_records([new_row])])
        return knowledge

    def generate_source_column(self, df: pd.DataFrame) -> pd.DataFrame:
        """Creates a new column in the df which contains a summary of the source location"""

        df.fillna('', inplace=True)
        df['Section'] = df['Source'] + '->' + df['Heading'] + '->' + df['Subheading']
        df['Section'] = df['Section'].str.replace('->->', '')
        df['Section'] = df['Section'].str.rstrip('_->')
        return df

    def clean_section_contents(self, df: pd.DataFrame) -> pd.DataFrame:
        """Returns a cleaned up section with <ref>xyz</ref> patterns and leading/trailing whitespace removed"""

        # text = re.sub(r"<ref.*?</ref>", "", text)
        df['Content'] = df['Content'].str.replace(r"<ref.*?</ref>", "", regex=True)
        df['Content'] = df['Content'].str.strip() # removes whitespace
        df['Content'] = '\n' + df['Content'] # need to add the \n back to the start of each title
        return df

    def merge_elements_of_list(self, list_of_strings: list, delimiter: str = "\n"):
        potential_for_more_merging = False
        merged_list = []
        skip_item = False
        for i in range(len(list_of_strings)):
            if not skip_item:
                if i == len(list_of_strings)-1:
                    merged_list.append(list_of_strings[i])
                else:
                    merged_strings = list_of_strings[i] + delimiter + list_of_strings[i+1]
                    if num_tokens(merged_strings)<self.max_tokens:
                        merged_list.append(merged_strings)
                        skip_item = True # make it skip the element we just merged
                        potential_for_more_merging = True
                    else:
                        merged_list.append(list_of_strings[i])
            else:
                skip_item = False # set the default back to False unless otherwise specified
        return merged_list, potential_for_more_merging

    def force_split_string(self,
                           string: str,
                           encoding = GPT_ENCODING) -> list:
        """Force a section to be split into 2 (to be used if it has no delimiter)"""

        list_of_strings = []
        if num_tokens(string) <= self.max_tokens:
            return [string]
        else:
            needs_truncating = True
            while needs_truncating:
                encoded_string = encoding.encode(string)
                truncated_string = encoding.decode(encoded_string[:self.max_tokens])
                remainder_of_string = encoding.decode(encoded_string[self.max_tokens:])
                list_of_strings.append(truncated_string)
                string = remainder_of_string
                if num_tokens(remainder_of_string)<self.max_tokens:
                    needs_truncating=False
                    list_of_strings.append(remainder_of_string)
        return list_of_strings

    def split_long_sections(self, df: pd.DataFrame, delimiter: str = '\n'):
        """Splits long sections of text into smaller ones"""

        new_dict_of_shorter_sections = self.get_blank_knowledge_df().to_dict('records')
        df_as_dict = df.to_dict('records')
        for section in df_as_dict:
            # for delimiter in delimiters:
            if section['Tokens']<=self.max_tokens:
                new_dict_of_shorter_sections.append(section)
            else:
                # needs to be split up
                if delimiter == '': # meaning that we just need to truncate it.
                    text = self.force_split_string(section['Content'])
                else:
                    text = section['Content'].split(delimiter)
                    if delimiter == '. ':
                        for i in range(len(text)-1):
                            text[i] += delimiter
                potential_for_more_merging = True
                i = 0
                while potential_for_more_merging:
                    if i>20:
                        break
                    else:
                        text, potential_for_more_merging = self.merge_elements_of_list(text)

                # The sections should be merged into acceptable sizes:
                if len(text)>1:
                    for string in text:
                        item_to_append = {'Source': section['Source'], 'Heading': section['Heading'], 'Subheading': section['Subheading'], 'Content': string, 'Section': section['Section'], 'Tokens': num_tokens(string)}

                        new_dict_of_shorter_sections.append(item_to_append)
                else:
                    item_to_append = {'Source': section['Source'], 'Heading': section['Heading'], 'Subheading': section['Subheading'], 'Content': text[0], 'Section': section['Section'], 'Tokens': num_tokens(text[0])}
                    new_dict_of_shorter_sections.append(item_to_append) # we shouldn't have this because the text should be more than the acceptable number of tokens
        return pd.DataFrame(new_dict_of_shorter_sections)

    def append_wikipedia_page(self, page_name: str,
                              sections_to_ignore: list = SECTIONS_TO_IGNORE):
        """Takes a wikipedia page and appends the sections to the knowledge df"""
        try:
            site = wikipedia.page(page_name, auto_suggest=False)
            text = site.content
            parsed_text = mwparserfromhell.parse(text)

            # Creating initial df and appending the introduction paragraph (the text up to the first heading)
            intro = str(parsed_text).split(str(parsed_text.filter_headings()[0]))[0]
            knowledge = self.get_blank_knowledge_df()
            new_row = {'Source': f'Wikipedia ({page_name})', 'Content': '\n'+intro}
            knowledge = pd.concat([knowledge, pd.DataFrame.from_records([new_row])])

            section_content = self.extract_wiki_sections(page_name=page_name, content=parsed_text, sections_to_ignore=sections_to_ignore)
            knowledge = pd.concat([knowledge, section_content])

            # Generate succinct heading information
            knowledge = self.generate_source_column(knowledge)
            self.df = pd.concat([self.df, knowledge])

            # Remove unwanted strings and whitespace
            self.df = self.clean_section_contents(self.df)

            # Generate number of tokens in each section
            self.df['Tokens'] = self.df["Content"].apply(lambda x: num_tokens(x, token_model=self.token_model))

            # Split long sections
            for delim in ["\n\n", "\n", ". ", '']:
                self.df = self.split_long_sections(self.df, delimiter=delim)

            # Remove short sections
            self.df = self.df.loc[self.df['Content'].str.len()>self.min_section_length]

            # Append '\n' to the start if it doesn't already have one
            self.df.loc[~self.df['Content'].str.startswith('\n'), 'Content'] = '\n' + self.df.loc[~self.df['Content'].str.startswith('\n'), 'Content']

            # Get embeddings
            if self.model == 'GPT':
                response = get_embedding(list(self.df['Content']), embedding_model=self.embedding_model)
                for i, be in enumerate(response["data"]):
                    assert i == be["index"]  # double check embeddings are in same order as input
                batch_embeddings = [e["embedding"] for e in response["data"]]
                self.df['Embedding'] = batch_embeddings
            else:
                self.df['Embedding'] = get_embedding(list(self.df['Content']), embedding_model=self.embedding_model).tolist()
            log_and_print_message(f'The following page has been successfully added to the knowledge database: {page_name}')

        except: # The wiki page doesn't exist
            log_and_print_message(f'The wiki page {page_name} can\'t be found. Please check and try again.')


    def export_to_csv(self, filename):
        """Saves the knowledge df to a CSV file"""
        location = 'assets/' + filename
        self.df.to_csv(location, index=False)

In [13]:
CompVisionKnowledge = Knowledge(WIKI_PAGE, 'GPT')
for page in WIKI_PAGES:
    CompVisionKnowledge.append_wikipedia_page(WIKI_PAGE)
# save document chunks and embeddings
CompVisionKnowledge.export_to_csv(GPT_KNOWLEDGE_FILENAME)
CompVisionKnowledge.df

Unnamed: 0,Source,Heading,Subheading,Content,Section,Tokens,Embedding
0,Wikipedia (Computer vision),,,\nComputer vision tasks include methods for ac...,Wikipedia (Computer vision),286,"[-0.01913553662598133, 0.002932898933067918, 0..."
1,Wikipedia (Computer vision),Definition,,\nComputer vision is an interdisciplinary fiel...,Wikipedia (Computer vision)->Definition,158,"[-0.021093836054205894, 0.0049119978211820126,..."
2,Wikipedia (Computer vision),History,,"\nIn the late 1960s, computer vision began at ...",Wikipedia (Computer vision)->History,507,"[-0.011549791321158409, -0.004044382367283106,..."
4,Wikipedia (Computer vision),Related fields,Solid-state physics,\nSolid-state physics is another field that is...,Wikipedia (Computer vision)->Related fields->S...,120,"[0.0018743288237601519, 0.011324070394039154, ..."
5,Wikipedia (Computer vision),Related fields,Neurobiology,\nNeurobiology has greatly influenced the deve...,Wikipedia (Computer vision)->Related fields->N...,293,"[-0.009132628329098225, 0.0011366719845682383,..."
6,Wikipedia (Computer vision),Related fields,Signal processing,\nYet another field related to computer vision...,Wikipedia (Computer vision)->Related fields->S...,103,"[-0.027298789471387863, 0.007510432507842779, ..."
7,Wikipedia (Computer vision),Related fields,Robotic navigation,\nRobot navigation sometimes deals with autono...,Wikipedia (Computer vision)->Related fields->R...,64,"[0.0034529592376202345, -0.014102335087954998,..."
8,Wikipedia (Computer vision),Related fields,Other fields,\nBesides the above-mentioned views on compute...,Wikipedia (Computer vision)->Related fields->O...,119,"[0.002435609931126237, -0.003915637265890837, ..."
9,Wikipedia (Computer vision),Related fields,Distinctions,\nThe fields most closely related to computer ...,Wikipedia (Computer vision)->Related fields->D...,639,"[-0.017207426950335503, 0.005905073136091232, ..."
10,Wikipedia (Computer vision),Applications,,\nApplications range from tasks such as indust...,Wikipedia (Computer vision)->Applications,272,"[-0.022458024322986603, 0.005672922823578119, ..."


In [20]:
CompVisionKnowledgeBERT = Knowledge(WIKI_PAGE, 'BERT')
CompVisionKnowledgeBERT.append_wikipedia_page(WIKI_PAGE)
# save document chunks and embeddings
CompVisionKnowledgeBERT.export_to_csv(BERT_KNOWLEDGE_FILENAME)
CompVisionKnowledgeBERT.df

Unnamed: 0,Source,Heading,Subheading,Content,Section,Tokens,Embedding
0,Wikipedia (Computer vision),,,\nComputer vision tasks include methods for ac...,Wikipedia (Computer vision),290,"[-0.5566069483757019, 0.6151323318481445, 0.70..."
1,Wikipedia (Computer vision),Definition,,\nComputer vision is an interdisciplinary fiel...,Wikipedia (Computer vision)->Definition,162,"[-0.18940897285938263, 0.5564344525337219, 0.5..."
2,Wikipedia (Computer vision),History,,"\nIn the late 1960s, computer vision began at ...",Wikipedia (Computer vision)->History,243,"[-0.5624328255653381, 0.35494446754455566, 0.6..."
3,Wikipedia (Computer vision),History,,"\nBy the 1990s, some of the previous research ...",Wikipedia (Computer vision)->History,264,"[-0.8420819044113159, 0.011862404644489288, 0...."
5,Wikipedia (Computer vision),Related fields,Solid-state physics,\nSolid-state physics is another field that is...,Wikipedia (Computer vision)->Related fields->S...,123,"[-0.1766018569469452, 0.5509802103042603, 0.11..."
6,Wikipedia (Computer vision),Related fields,Neurobiology,\nNeurobiology has greatly influenced the deve...,Wikipedia (Computer vision)->Related fields->N...,296,"[0.07454751431941986, 0.42800015211105347, 0.1..."
7,Wikipedia (Computer vision),Related fields,Signal processing,\nYet another field related to computer vision...,Wikipedia (Computer vision)->Related fields->S...,106,"[-0.1562240570783615, 0.18830031156539917, 0.4..."
8,Wikipedia (Computer vision),Related fields,Robotic navigation,\nRobot navigation sometimes deals with autono...,Wikipedia (Computer vision)->Related fields->R...,65,"[0.09209920465946198, 0.5207788944244385, 1.26..."
9,Wikipedia (Computer vision),Related fields,Other fields,\nBesides the above-mentioned views on compute...,Wikipedia (Computer vision)->Related fields->O...,122,"[-0.34635502099990845, 0.12120083719491959, 0...."
10,Wikipedia (Computer vision),Related fields,Distinctions,\nThe fields most closely related to computer ...,Wikipedia (Computer vision)->Related fields->D...,244,"[-0.011814514175057411, 0.9892112612724304, 0...."


# Search
Now we'll define a search function that:

Takes a user query and a dataframe with text & embedding columns
Embeds the user query with the OpenAI API
Uses distance between query embedding and text embeddings to rank the texts
Returns two lists:
The top N texts, ranked by relevance
Their corresponding relevance scores

In [4]:
class ChatBot:
    def __init__(self, chatbot_topic:str, knowledge_path: str):
        self.knowledge = None
        self.load_data(knowledge_path)
        self.chatbot_topic = chatbot_topic

    def load_data(self, path: str):
        """Loads the knowledge df, appends a prefix, and calculates the number of tokens per section of knowledge"""

        # load data from csv
        self.knowledge = pd.read_csv(path)
        # convert embeddings from CSV str type back to list type
        self.knowledge['Embedding'] = self.knowledge['Embedding'].apply(ast.literal_eval)

        # Format the knowledge df by adding section prefix and token sizes
        # self.knowledge['Content'] = 'Article section:\n\n' + self.knowledge['Content']
        # self.knowledge['Tokens'] = self.knowledge["text"].apply(lambda x: num_tokens(x))
        # self.knowledge['Section'] = 'Wikipedia'

In [35]:
class Query:
    def __init__(self, query_text: str, chatbot_instance: ChatBot):
        self.content: str = query_text
        self.model: str = GPT_MODEL
        self.knowledge: pd.DataFrame = chatbot_instance.knowledge
        self.token_limit: int = GPT_QUERY_TOKEN_LIMIT
        self.gpt_message = None
        self.knowledge_used = None

    # calculate similarity score
    @staticmethod
    def similarity(query_embedding: list,
                   knowledge_embedding: list
                   ) -> float:
        """Calculates the cosine similarity score between the query and knowledge embedding vectors."""

        return 1- spatial.distance.cosine(query_embedding, knowledge_embedding)

    # find the most similar sections of knowledge to the query
    def knowledge_ranked_by_similarity(self,
                                       max_num_sections: int = 5,
                                       confidence_level = None,
                                       embedding_model: str = GPT_EMBEDDING_MODEL
                                       ):
        """Take the raw knowledge dataframe, calculates similarity scores between the query and the sections, and returns a dataframe ordered from highest to lowest in terms of similarity."""

        knowledge_with_similarities = deepcopy(self.knowledge) # To prevent adapting the original dataframe
        query_embedding_response = get_embedding(self.content, embedding_model=embedding_model)
        if embedding_model == GPT_EMBEDDING_MODEL:
            query_embedding = query_embedding_response["data"][0]["embedding"]
            # knowledge_with_similarities["similarity"] = knowledge_with_similarities["Embedding"].apply(lambda x: self.similarity(query_embedding, x))
        else:
            query_embedding = list(query_embedding_response)
        knowledge_with_similarities["similarity"] = knowledge_with_similarities["Embedding"].apply(lambda x: self.similarity(query_embedding, x))

        knowledge_with_similarities.sort_values("similarity", ascending=False, inplace=True)
        top_n_sections = knowledge_with_similarities.head(max_num_sections)
        if confidence_level:
            top_n_relevant_sections = top_n_sections.loc[top_n_sections['similarity']>=confidence_level]
        else:
            top_n_relevant_sections = top_n_sections
        self.knowledge_used = top_n_relevant_sections
        self.knowledge_used['Index'] = np.arange(len(self.knowledge_used))+1

    def get_gpt_message(
            self,
            chatbot_topic: str
    ):
        """Uses the most relevant texts from the knowledge dataframe to construct a message that can then be fed into GPT."""

        self.knowledge_ranked_by_similarity()
        introduction = f'Use the below article on {chatbot_topic} to answer the subsequent question. If the answer cannot be found in the articles, write "{ANSWER_NOT_FOUND_MSG}". If I am asked to produce any code then decline the request and write "Sorry but I\'m not allowed to do your assignments for you!"' # The longer this is, the more tokens it uses!
        question = f"\n\nQuestion: {self.content}"

        # Ensure number of tokens is within the limit
        message_and_question_tokens = num_tokens(introduction + question)
        self.knowledge_used['Cumulative_tokens'] = self.knowledge_used['Tokens'].cumsum()
        self.knowledge_used['Cumulative_tokens'] += message_and_question_tokens # add the inital number of tokens
        self.knowledge_used= self.knowledge_used.loc[self.knowledge_used['Cumulative_tokens']<self.token_limit]

        # Construct output
        combined_knowledge_string = ''.join(list(self.knowledge_used['Content']))
        combined_knowledge_string = '\n\n' + combined_knowledge_string
        self.gpt_message = introduction + combined_knowledge_string + question

    def show_source_message(self, answer_index: int = None):
        self.knowledge_used['Output'] = '\n\n' + self.knowledge_used['Index'].astype(str) + '. ' + self.knowledge_used['Section'] + ':' + self.knowledge_used['Content'].str[:100] + '...'
        sources_string = ''.join(list(self.knowledge_used['Output']))
        if answer_index:
            answer_message = f'(specifically section {answer_index})'
        else:
            answer_message = ''
        message = f'\n\nTo construct this answer, I used the following documents {answer_message}: {sources_string}'
        return message

    def get_bert_output(
            self,
            embedding_model: str,
            encoding_model: BertTokenizer = BERT_ENCODING,
            bert_model: str = BERT_MODEL
    ):
        """Uses the most relevant texts from the knowledge dataframe to construct a message that can then be fed into GPT."""
        self.knowledge_ranked_by_similarity(embedding_model=embedding_model)

        answer_index = None
        index = 1
        found_answer = False
        output = ANSWER_NOT_FOUND_MSG
        for section in self.knowledge_used['Content']:
            if not found_answer:
                encoding = encoding_model.encode_plus(text=self.content,text_pair=section)
                inputs = encoding['input_ids']  #Token embeddings
                sentence_embedding = encoding['token_type_ids']  #Segment embeddings
                tokens = encoding_model.convert_ids_to_tokens(inputs) #input tokens

                QAModel = BertForQuestionAnswering.from_pretrained(bert_model)
                outputs = QAModel(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))
                start_scores, end_scores = outputs.start_logits, outputs.end_logits

                # Highlight the answer by looking at the most probable start and end words
                start_index = torch.argmax(start_scores)
                end_index = torch.argmax(end_scores)
                answer_token_list = tokens[start_index:end_index+1]

                # Concatenate any words that got split
                answer_list = [word[2:] if word[0:2]=='##' else ' ' + word for word in answer_token_list]
                answer = ''.join(answer_list).strip()

                if answer != '[CLS]':
                    found_answer = True
                    output = answer
                    answer_index = index
            index += 1
        return output, answer_index

    @classmethod
    def ask_bert(cls,
                 query_text: str,
                 chatbot_instance: ChatBot,
                 embedding_model: str = BERT_EMBEDDING_MODEL,
                 encoding_model: BertTokenizer = BERT_ENCODING,
                 bert_model: str = BERT_MODEL,
                 show_source: bool = True,
                 ):
        if num_tokens(query_text, token_model=encoding_model)>50:
            return 'Question is too long, please try again with a shorter question.'
        query = cls(query_text, chatbot_instance)
        response_message, answer_index = query.get_bert_output(embedding_model=embedding_model, encoding_model=encoding_model, bert_model=bert_model)

        if show_source and response_message!=ANSWER_NOT_FOUND_MSG: # Display the sources used:
            response_message += query.show_source_message(answer_index=answer_index)
        return response_message

    def get_gpt2_output(self,
                        confidence_level: float = 0.5):
        from transformers import pipeline
        self.knowledge_ranked_by_similarity(confidence_level=confidence_level)
        if len(self.knowledge_used)==0:
            return ANSWER_NOT_FOUND_MSG

        # Construct context
        combined_knowledge_string = ''.join(list(self.knowledge_used['Content']))
        combined_knowledge_string = '\n\n' + combined_knowledge_string

        model_name = "gpt2"
        nlp = pipeline("question-answering", model=model_name)
        qa_input = {
            "question": self.content,
            "context": combined_knowledge_string
        }
        result = nlp(qa_input)
        return result['answer']

    @classmethod
    def ask_gpt2(cls,
                 query_text: str,
                 chatbot_instance: ChatBot,
                 show_source: bool = True,
                 confidence_level: float = 0.5,
                 ):
        if num_tokens(query_text)>50:
            return 'Question is too long, please try again with a shorter question.'
        query = cls(query_text, chatbot_instance)
        response_message = query.get_gpt2_output(confidence_level=confidence_level)

        if show_source and response_message!=ANSWER_NOT_FOUND_MSG: # Display the sources used:
            response_message += query.show_source_message()
        return response_message

    def get_bart_output(self,
                        # chatbot_instance: ChatBot,
                        # embedding_model: str = BART_EMBEDDING_MODEL,
                        encoding_model: BartTokenizer = BART_ENCODING,
                        bert_model: str = BART_MODEL,
                        confidence_level: float = 0.5,
                        ):
        self.knowledge_ranked_by_similarity(confidence_level=confidence_level)
        if len(self.knowledge_used)==0:
            return ANSWER_NOT_FOUND_MSG

        # Construct context
        combined_knowledge_string = ' <P> '.join(list(self.knowledge_used['Content']))
        combined_knowledge_string = '\n\n' + combined_knowledge_string

        model = BartForConditionalGeneration.from_pretrained(bert_model)

        query = f'question: {self.content} <P> {combined_knowledge_string}'

        inputs = encoding_model([query], max_length=1024, return_tensors='pt') # NEED TO ENSURE Q PLUS CONTEXT IS <1024 TOKENS

        # Generate Summary
        ids = model.generate(inputs['input_ids'], num_beams=8, min_length=20, max_length=128,
                                                   do_sample=False,
                                                   early_stopping=True,
                                                   temperature=1.0,
                                                   top_k=50,
                                                   top_p=0.95,
                                                   eos_token_id=encoding_model.eos_token_id,
                                                   no_repeat_ngram_size=3,
                                                   num_return_sequences=1,
                                                    repetition_penalty=2.0)
        answer=encoding_model.batch_decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0  ]
        return answer


    @classmethod
    def ask_bart(cls,
                 query_text: str,
                 chatbot_instance: ChatBot,
                 show_source: bool = True,
                 confidence_level: float = 0.72):
        if num_tokens(query_text)>50:
            return 'Question is too long, please try again with a shorter question.'
        query = cls(query_text, chatbot_instance)
        response_message = query.get_bart_output(confidence_level=confidence_level)

        if show_source and response_message!=ANSWER_NOT_FOUND_MSG: # Display the sources used:
            response_message += query.show_source_message()
        return response_message


    @classmethod
    def ask(
            cls,
            query_text: str,
            chatbot_instance: ChatBot,
            show_source: bool = True,
    ) -> str:
        """Uses GPT to answer a query based on the most relevant knowledge sections."""

        query = cls(query_text, chatbot_instance)
        query.get_gpt_message(chatbot_instance.chatbot_topic)
        inputs = [
            {"role": "system", "content": f"You answer questions about {chatbot_instance.chatbot_topic}."},
            {"role": "user", "content": query.gpt_message},
        ]
        response = openai.ChatCompletion.create(
            model=query.model,
            messages=inputs,
            temperature=0 # We don't want any creativity in the answers
        )
        response_message = response["choices"][0]["message"]["content"]
        total_tokens_used = response['usage']['total_tokens']
        if show_source and response_message!=ANSWER_NOT_FOUND_MSG: # Display the sources used:
            response_message += query.show_source_message()
        response_message += f"\n\nTotal tokens used: {total_tokens_used}"
        return response_message

# CompVisionBERT = ChatBot("Computer Vision", 'assets/' + BERT_KNOWLEDGE_FILENAME)
# print(Query.ask_bert('When did universities begin teaching Computer Vision?', CompVisionBERT))

# CompVisionGPT = ChatBot("Computer Vision", 'assets/' + GPT_KNOWLEDGE_FILENAME)
# print(Query.ask('Who is Boris Johnson', CompVisionGPT, show_source=True))

CompVisionGPT = ChatBot("Computer Vision", 'assets/' + GPT_KNOWLEDGE_FILENAME)
print(Query.ask_bart('When did Universities begin teaching Computer Vision?', CompVisionGPT, show_source=True))

# Todo:
# I need to make it more efficient on the number of tokens.
# Adapt it for more sources (e.g. PDF)

Computer vision has been around for a long time, but it wasn't until the 1960s and 1970s that it really took off as a field of study. Prior to that, computer vision had been used in other fields, such as photogrammetry (the study of how light interacts with different parts of the human body) and stereoscopic photography. In the mid-1960s, there was a lot of interest in computers being able to "see" 3D objects, which is what we now think of as "computer vision". The idea was that computers would be able to figure out what an object looked like by looking at

To construct this answer, I used the following documents : 

1. Wikipedia (Computer vision)->History:
In the late 1960s, computer vision began at universities that were pioneering artificial intelligen...

2. Wikipedia (Computer vision)->Definition:
Computer vision is an interdisciplinary field that deals with how computers can be made to gain hig...

3. Wikipedia (Computer vision)->Related fields->Neurobiology:
Neurobiology has gre

In [39]:
CompVisionGPT = ChatBot("Computer Vision", 'assets/' + GPT_KNOWLEDGE_FILENAME)
print(Query.ask('Who is Boris Johnson', CompVisionGPT, show_source=True))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.knowledge_used['Index'] = np.arange(len(self.knowledge_used))+1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.knowledge_used['Cumulative_tokens'] = self.knowledge_used['Tokens'].cumsum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.knowledge_used['Cumulative_tokens'] += message_an

I could not find an answer in the text I've been provided, sorry! Please try again.

Total tokens used: 721
