# GPT user support

> Semantic search enabled via GPT and context-specific responses

In [None]:
import numpy as np
import openai
from openai import OpenAI
import os
import pandas as pd
import pickle

# Model identifiers used by the cells below.
# NOTE: "text-davinci-003" has been retired by OpenAI and requests naming it
# are rejected; "gpt-3.5-turbo-instruct" is the documented replacement for the
# legacy completions endpoint.
COMPLETIONS_MODEL = "gpt-3.5-turbo-instruct"
EMBEDDING_MODEL = "text-embedding-ada-002"

# Authenticate with OpenAI API
# The key is read from a local file. Surrounding whitespace (typically the
# trailing newline an editor appends) is stripped so it cannot end up inside
# the key that is sent with every request.
with open('apiKeys.txt', 'r') as temp:
    apiKey = temp.read().strip()
client = OpenAI(api_key=apiKey)

In [None]:
def gpt4(question, tokens=500):
    """Send ``question`` to the ``gpt-4`` chat model and print the reply.

    The reply is split on Markdown code fences: every prose line is printed
    first, then a dashed separator, then every line that was inside a fenced
    code block. Nothing is returned.

    Args:
        question: Prompt text, sent as a single user message.
        tokens: Value passed as ``max_tokens`` for the reply.
    """
    messages = [{"role": "user", "content": question}]

    response = client.chat.completions.create(
        model="gpt-4",
        max_tokens=tokens,
        temperature=0,
        messages=messages,
    )

    # The message content can be None (no text returned); treat it as an
    # empty reply instead of failing on ``None.split``.
    content = response.choices[0].message.content or ""

    text_parts, code_parts = _split_text_and_code(content)

    # Print the text parts
    for line in text_parts:
        print(line)

    # Print a separator
    print("\n" + "-" * 50 + "\n")

    # Print the code parts
    for line in code_parts:
        print(line)


def _split_text_and_code(content):
    """Partition the lines of ``content`` into (prose lines, fenced-code lines)."""
    text_parts = []
    code_parts = []
    in_code_block = False

    for line in content.split("\n"):
        # Fences may be indented (for example inside a list item), so leading
        # whitespace is ignored when looking for the ``` marker. The fence
        # line itself is dropped from both outputs.
        if line.lstrip().startswith("```"):
            in_code_block = not in_code_block
            continue
        if in_code_block:
            code_parts.append(line)
        else:
            text_parts.append(line)

    return text_parts, code_parts

## GPT Hallucination (lying)

In [None]:
# Ask the bare question, with no instructions or context, and cap the reply
# at 300 tokens.
prompt = "How to generate a token using Tapipy"

gpt4(prompt,300)

There is no website called "Tapipy" where you can create an account, so every step in the answer above is wrong!

## Forcing GPT to not lie!

In [None]:
# Same question, but the prompt now instructs the model to say
# "Sorry, I don't know" when it is unsure instead of making up an answer.
prompt = """Answer the question as truthfully as possible, and if you're unsure of the answer, say "Sorry, I don't know".

Q: How to generate a token using Tapipy?
A:
"""

gpt4(prompt,300)

Well....that was very helpful!

## Providing Context to GPT

> What if we could provide GPT with some context so it can provide useful help!

In [None]:
# Same question again, this time with a documentation excerpt pasted into the
# prompt under "Context:" so the model can answer from it. No token limit is
# passed, so gpt4 uses its default.
prompt = """Answer the question as truthfully as possible, and if you're unsure of the answer, say "Sorry, I don't know".

Context: 
Create an Tapis Client Object

The first step in using the Tapis Python SDK, tapipy, is to create a Tapis Client object. First, import the Tapis class and create python object called t that points to the Tapis server using your TACC username and password. Do so by typing the following in a Python shell:

# Import the Tapis object
from tapipy.tapis import Tapis

# Log into you the Tapis service by providing user/pass and url.
t = Tapis(base_url='https://tacc.tapis.io',
          username='your username',
          password='your password')

Generate a Token

With the t object instantiated, we can exchange our credentials for an access token. In Tapis, you never send your username and password directly to the services; instead, you pass an access token which is cryptographically signed by the OAuth server and includes information about your identity. The Tapis services use this token to determine who you are and what you can do.

    # Get tokens that will be used for authenticated function calls
    t.get_tokens()
    print(t.access_token.access_token)

    Out[1]: eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9...

Note that the tapipy t object will store and pass your access token for you, so you don’t have to manually provide the token when using the tapipy operations. You are now ready to check your access to the Tapis APIs. It will expire though, after 4 hours, at which time you will need to generate a new token. If you are interested, you can create an OAuth client (a one-time setup step, like creating a TACC account) that can be used to generate access and refresh tokens. For simplicity, we are skipping that but if you are interested, check out the Tenancy and Authentication section.
Q: How to generate a token using Tapipy?
A:
"""

gpt4(prompt)

### (1) Create a word embedding as vector

In [None]:
import markdown2
from bs4 import BeautifulSoup
from transformers import GPT2TokenizerFast

import numpy as np
from nltk.tokenize import sent_tokenize

# Tokenizer used only to measure how long each section is, in tokens.
# NOTE(review): these counts come from the GPT-2 vocabulary; presumably they
# only approximate how the OpenAI models tokenize the same text -- confirm if
# exact token budgets matter.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Base name (without the ".md" extension) of the markdown document to index.
context = 'actor'

# Open the markdown file. The encoding is pinned so decoding does not depend
# on the platform's default locale.
with open(os.path.join(context + '.md'), "r", encoding="utf-8") as file:
    content = file.read()

# Use markdown2 to convert the markdown file to html
html = markdown2.markdown(content)

# Use BeautifulSoup to parse the html
soup = BeautifulSoup(html, "html.parser")

# Heading texts and paragraph texts of the section currently being collected
headings = []
paragraphs = []

# One [heading text, paragraph text, token count] row per completed section
data = []

MAX_WORDS = 500  # not referenced in this cell; kept in case other cells use it


def count_tokens(text: str) -> int:
    """Return the number of tokens ``tokenizer`` produces for ``text``."""
    return len(tokenizer.encode(text))


# Iterate through the tags in the soup
for tag in soup.descendants:
    # Check if the tag is a heading
    if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
        # A new heading closes the section collected so far. Consecutive
        # headings with no paragraph in between are kept together, so a
        # heading and its sub-heading end up in the same title.
        if headings and paragraphs:
            hdgs = " ".join(headings)
            para = " ".join(paragraphs)
            data.append([hdgs, para, count_tokens(para)])
            headings = []
            paragraphs = []
        # Add to heading
        headings.append(tag.text)
    # Check if the tag is a paragraph
    elif tag.name == "p":
        paragraphs.append(tag.text)

# Flush the last section. Sections are otherwise stored only when the *next*
# heading is seen, so without this step the final section of the document
# would be silently dropped.
if headings and paragraphs:
    hdgs = " ".join(headings)
    para = " ".join(paragraphs)
    data.append([hdgs, para, count_tokens(para)])
    headings = []
    paragraphs = []

We create a dataset and keep only the sections with more than 40 tokens; shorter sections are unlikely to contain enough context to answer a question well.

In [None]:
# Tabulate the parsed sections: one row per section.
df = pd.DataFrame(data, columns=["heading", "content", "tokens"])

# Keep only the sections longer than 40 tokens, then renumber the rows from 0
# so the index is contiguous again.
has_enough_tokens = df["tokens"] > 40
df = df.loc[has_enough_tokens]
df = df.reset_index().drop(columns="index")
df.head()

In [None]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL):
    """Request an embedding for ``text`` from the OpenAI Embeddings API.

    Args:
        text: The string to embed.
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    response = client.embeddings.create(model=model, input=text)
    first_item = response.data[0]
    return first_item.embedding


def compute_doc_embeddings(df: pd.DataFrame):
    """
    Embed the ``content`` of every row of ``df`` with ``get_embedding``.

    One ``get_embedding`` call is made per row.

    Return a dictionary keyed by the row's index label, whose values are the
    embeddings computed for that row's ``content``.
    """
    embeddings_by_row = {}
    for row_label, row in df.iterrows():
        embeddings_by_row[row_label] = get_embedding(row.content)
    return embeddings_by_row

### Word embedding as vectors

In [None]:
# Embed the content of every section; this issues one embedding request per
# row of df. The result is a dict keyed by df's row index.
vector_embedding = compute_doc_embeddings(df)

In [None]:
# Attach the embeddings as a new column. The dict keys are df's row labels,
# so the Series built from it lines up with df by index.
df['vector_embedding'] = pd.Series(vector_embedding)
df.head()

Stash the dataframe so that in the future we can just load it without having to revectorize it. This will save some time.

In [None]:
# Create the output directory on first use; to_csv does not create missing
# parent directories and would fail otherwise.
os.makedirs('vectorizedDataFrames', exist_ok=True)

# NOTE: CSV is a text format, so each embedding list is written as its string
# form and has to be parsed back into a list of floats after reloading.
df.to_csv(os.path.join('vectorizedDataFrames', context))

### (2) Find the most similar document embeddings to the question embedding

We embed the query string and use it to find the most similar document sections. Since this is a small example, we store and search the embeddings locally. 

In [None]:
from scipy.spatial.distance import cosine

def order_documents_query_similarity(data:pd.DataFrame, query_str:str, nres=3):
    """Rank the rows of ``data`` by cosine similarity to ``query_str``.

    The query is embedded with ``get_embedding`` and compared against each
    row's ``vector_embedding``.

    Side effect: a ``similarities`` column is written onto ``data`` itself.

    Args:
        data: Frame with a ``vector_embedding`` column.
        query_str: Text to compare the rows against.
        nres: Number of rows to return.

    Returns:
        The ``nres`` rows with the highest similarity, most similar first.
    """
    query_vector = get_embedding(query_str, model=EMBEDDING_MODEL)

    def similarity_to_query(doc_vector):
        # scipy's ``cosine`` is a distance; one minus it is the similarity.
        return 1-cosine(doc_vector, query_vector)

    data['similarities'] = data.vector_embedding.apply(similarity_to_query)

    ranked = data.sort_values('similarities', ascending=False)
    return ranked.head(nres)

We can see that the document sections most relevant to the token question are listed at the top.

In [None]:
# Rank the sections against the token question; res holds the top matches
# (three by default), most similar first.
res = order_documents_query_similarity(df, "How to generate a token using Tapipy")
res.head()

### (3) Add the most relevant document sections to the query prompt

In [64]:
# The running example question.
question =  "How to generate a token using Tapipy"

In [65]:
def construct_prompt(question: str, df: pd.DataFrame, ncontents = 3) -> str:
    """
    Build a prompt that asks ``question`` with the most relevant sections of
    ``df`` supplied as context.

    The sections are ranked with ``order_documents_query_similarity`` and
    added in order of decreasing similarity until the running total of their
    ``tokens`` values would exceed 500; the section that crosses the limit
    and everything after it are left out.

    Args:
        question: The user's question.
        df: Frame with ``content``, ``tokens`` and ``vector_embedding``
            columns.
        ncontents: Maximum number of sections to consider as context.

    Returns:
        The instruction header, the chosen context, and the question,
        ending with an open ``A:`` for the model to complete.
    """
    MAX_SECTION_LEN = 500

    # Rank once: every ranking call embeds the question through the API, so
    # repeating it only costs an extra request. ``ncontents`` is forwarded so
    # the parameter actually limits how many sections are considered.
    ranked_sections = order_documents_query_similarity(df, question, nres=ncontents)

    chosen_sections = []
    chosen_section_len = 0

    for _, ctx in ranked_sections.iterrows():
        chosen_section_len += ctx.tokens
        if chosen_section_len > MAX_SECTION_LEN:
            break

        # Newlines are flattened so each section is a single line of context.
        chosen_sections.append(" " + ctx.content.replace("\n", " "))

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""

    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"



In [None]:
# Show the full prompt (header, retrieved context, question) that would be
# sent for the token question.
construct_prompt(question="How to generate a token using Tapipy", df=df)

### (4) Answer the user's question based on the context.



In [67]:
# Keyword arguments forwarded to the completions request made by
# answer_query_with_context.
COMPLETIONS_API_PARAMS = dict(
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    temperature=0.0,
    max_tokens=300,
    model=COMPLETIONS_MODEL,
)

def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    show_prompt: bool = False) -> str:
    """Answer ``query`` using context retrieved from ``df``.

    The prompt is built by ``construct_prompt`` and sent to the completions
    endpoint with ``COMPLETIONS_API_PARAMS``.

    Args:
        query: The user's question.
        df: Frame of document sections passed on to ``construct_prompt``.
        show_prompt: When true, print the assembled prompt before sending it.

    Returns:
        The text of the first completion choice, with leading and trailing
        spaces and newlines removed.
    """
    prompt = construct_prompt(query, df)

    if show_prompt:
        print(prompt)

    completion = client.completions.create(prompt=prompt, **COMPLETIONS_API_PARAMS)
    answer_text = completion.choices[0].text
    return answer_text.strip(" \n")

### Original GPT without context - telling lies as it invents a new Tapipy website and App to generate a token

In [None]:
prompt = "How to generate a token using Tapipy"
# Baseline: send the bare question to the completions model with no context,
# and show the first choice's text with surrounding spaces/newlines removed.
client.completions.create(prompt=prompt, temperature=0, max_tokens=300, model=COMPLETIONS_MODEL).choices[0].text.strip(" \n")

### When you ask a question for which it can find a context! - It answers correctly!

In [None]:
# Same question, now answered from a prompt that includes context retrieved from df.
answer_query_with_context("How to generate a token using Tapipy", df)

### When it doesn't know...at least it is honest!

In [None]:
# A different question, answered from whatever context is retrieved from the same df.
answer_query_with_context("How to access files using Tapipy", df)