In [2]:
import os
import pickle

import numpy as np
import openai
import pandas as pd
import tiktoken

# Read the API key from the environment instead of committing it to the notebook.
# Raises KeyError right away if OPENAI_API_KEY is not set.
# NOTE: the key that used to be hardcoded here must be treated as leaked and rotated.
openai.api_key = os.environ["OPENAI_API_KEY"]
COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDING_MODEL = "text-embedding-ada-002"

# Load the pre-split sections from the local CSV, one row per section.
# The (title, heading) pair becomes the row index and is later used as the key
# of each section's embedding.
df = pd.read_csv('./data/my_sections_text2.csv').set_index(["title", "heading"])
print(f"{len(df)} rows in the data.")


def get_embedding(text: str, model: str = EMBEDDING_MODEL):
    """
    Request an embedding for `text` from the OpenAI Embeddings API.

    A single input is sent using `model`; the "embedding" field of the first
    item in the response's "data" list is returned.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]


def compute_doc_embeddings(df: pd.DataFrame) -> dict[tuple[str, str], list[float]]:
    """
    Embed the `content` of every row in `df` by calling get_embedding once per row.

    Return a dictionary keyed by each row's index value, mapping to that row's embedding vector.
    """
    embeddings_by_index = {}
    for row_index, row in df.iterrows():
        embeddings_by_index[row_index] = get_embedding(row.content)
    return embeddings_by_index


def load_embeddings(fname: str) -> dict[tuple[str, str], list[float]]:
    """
    Load previously saved document embeddings from a CSV file.

    The CSV at `fname` must have a header row with the columns
        "title", "heading", "0", "1", ... one numbered column per embedding dimension.

    Return a dictionary mapping (title, heading) to the list of values read from
    columns "0" through the highest numbered column, in numeric order.
    """
    table = pd.read_csv(fname, header=0)

    # Every column other than the two key columns is named after a dimension number.
    key_columns = ("title", "heading")
    dimension_numbers = [int(col) for col in table.columns if col not in key_columns]
    vector_columns = [str(i) for i in range(max(dimension_numbers) + 1)]

    embeddings = {}
    for _, row in table.iterrows():
        embeddings[(row.title, row.heading)] = [row[col] for col in vector_columns]
    return embeddings


# Alternative: load embeddings saved earlier to a CSV instead of recomputing them.
# document_embeddings = load_embeddings("./data/olympics_sections_document_embeddings.csv")

# Recalculate the embeddings from scratch (one get_embedding call per row of df).
document_embeddings = compute_doc_embeddings(df)


# Show one example entry: its key, the first five components, and the vector length.
example_entry = list(document_embeddings.items())[0]
example_key, example_vector = example_entry
print(f"{example_key} : {example_vector[:5]}... ({len(example_vector)} entries)")


def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Score how similar two vectors are using their dot product.

    OpenAI embeddings are normalized to length 1, so for them the dot product equals the cosine similarity.
    """
    left, right = np.array(x), np.array(y)
    return left.dot(right)


def order_document_sections_by_query_similarity(
    query: str,
    contexts: dict[tuple[str, str], list[float]],
) -> list[tuple[float, tuple[str, str]]]:
    """
    Rank the pre-computed document embeddings by their similarity to `query`.

    The query is embedded with get_embedding and compared against every vector in
    `contexts` using vector_similarity.

    Args:
        query: the question text to embed.
        contexts: mapping from a section key to that section's embedding vector.

    Returns:
        A list of (similarity, section key) tuples sorted in descending order.
        Sorting is done on the whole tuple, so equal similarities are ordered by
        section key (also descending).
    """
    query_embedding = get_embedding(query)

    scored_sections = [
        (vector_similarity(query_embedding, doc_embedding), doc_index)
        for doc_index, doc_embedding in contexts.items()
    ]
    return sorted(scored_sections, reverse=True)


# Smoke test: rank every section against a sample question and keep the five best matches.
ranked_sections = order_document_sections_by_query_similarity(
    query="Who won the men's high jump?",
    contexts=document_embeddings,
)
x = ranked_sections[:5]

print(x)


# Token budget for the context sections: construct_prompt stops adding sections
# once their cumulative token count (plus separators) exceeds this value.
MAX_SECTION_LEN = 500
SEPARATOR = "\n* "
ENCODING = "gpt2"  # encoding for text-davinci-003
# NOTE(review): tiktoken's model table may map text-davinci-003 to a different
# encoding than "gpt2" — confirm with tiktoken.encoding_for_model.

encoding = tiktoken.get_encoding(ENCODING)
separator_len = len(encoding.encode(SEPARATOR))

# A bare f-string in the middle of a cell is evaluated and thrown away;
# print it so the separator length is actually shown.
print(f"Context separator contains {separator_len} tokens")



# prompt = construct_prompt(
#     "Who won the 2020 Summer Olympics men's high jump?",
#     document_embeddings,
#     df
# )

# print("===\n", prompt)


# Keyword arguments passed to every completion request.
# Temperature 0.0 is used because it gives the most predictable, factual answer.
COMPLETIONS_API_PARAMS = dict(
    temperature=0.0,
    max_tokens=300,
    model=COMPLETIONS_MODEL,
)


def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[tuple[str, str], list[float]],
    show_prompt: bool = False
) -> str:
    """
    Answer `query` with the completions model, using retrieved document sections as context.

    The prompt is built with construct_prompt(query, document_embeddings, df) and sent to
    openai.Completion.create together with COMPLETIONS_API_PARAMS.

    Args:
        query: the user's question.
        df: the sections dataframe handed to construct_prompt.
        document_embeddings: mapping from section key to embedding vector.
        show_prompt: when true, print the full prompt before sending the request.

    Returns:
        The text of the first returned choice, with leading and trailing spaces
        and newlines stripped.
    """
    prompt = construct_prompt(query, document_embeddings, df)

    if show_prompt:
        print(prompt)

    response = openai.Completion.create(prompt=prompt, **COMPLETIONS_API_PARAMS)
    first_choice = response["choices"][0]
    return first_choice["text"].strip(" \n")



80 rows in the data.
('it', 'อัพเกรด-ปากกายี้ห้อ?') : [-0.027002012357115746, 0.00606458680704236, 0.024204017594456673, -0.03425506874918938, -0.008258161135017872]... (1536 entries)
[(0.7411812717714277, ('it', 'อยู่ต่างจังหวัดมีส่งเครื่องให้ไหม/การเตรียมตัวรับเครื่อง?')), (0.7047203860382953, ('student_activity', 'สามารถติดตามข้อมูล ข่าวสาร สหกิจศึกษา ม.หอการค้าไทย ได้ที่ไหน')), (0.7015678454843883, ('student_activity', 'มีชมรมกีฬาอะไรบ้าง')), (0.7009822683697996, ('student_activity', 'ทุนกีฬาเปิดรับสมัครช่วงไหน')), (0.7007752094786921, ('student_activity', 'ช่วงไหนมีกิจกรรมกีฬาอะไรบ้าง และช่วงไหน'))]


In [11]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Build the completion prompt for `question` from the most relevant document sections.

    Sections are ranked with order_document_sections_by_query_similarity and taken in that
    order. Each section adds its `tokens` value plus separator_len to a running total;
    the first section that pushes the total above MAX_SECTION_LEN ends the selection and
    is not included.

    Prints how many sections were selected and their index keys, then returns the
    instruction header, the selected sections, and the question in "Q: ... A:" form.
    """
    ranked_sections = order_document_sections_by_query_similarity(
        question, context_embeddings)

    picked_texts = []
    picked_keys = []
    tokens_used = 0

    for _, section_key in ranked_sections:
        section_row = df.loc[section_key]

        # The total is updated before the check, so the section that overflows
        # the budget is dropped along with everything ranked after it.
        tokens_used += section_row.tokens + separator_len
        if tokens_used > MAX_SECTION_LEN:
            break

        flattened_content = section_row.content.replace("\n", " ")
        picked_texts.append(SEPARATOR + flattened_content)
        picked_keys.append(str(section_key))

    # Useful diagnostic information
    print(f"Selected {len(picked_texts)} document sections:")
    print("\n".join(picked_keys))

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know." You must answer in very short and concise way. \n\nContext:\n"""

    context_block = "".join(picked_texts)
    return header + context_block + "\n\n Q: " + question + "\n A:"

In [3]:
# Example question in Thai (roughly: "I want to set up wifi").
b = answer_query_with_context(
    query="อยากติดตั้ง wifi อ่ะ",
    df=df,
    document_embeddings=document_embeddings,
)
print(b)

Selected 4 document sections:
('it', 'ต้องการใช้ VPN')
('it', 'เข้าใช้งาน wifi ไม่ได้')
('ipad', 'อัพเกรด อัพสเปค ไอแพดได้ไหม')
('it', 'อัพเกรด-ปากกายี้ห้อ?')
คุณสามารถติดตั้ง wifi ได้ด้วยขั้นตอนที่กล่าวไว้ โดยเลือกใช้งาน UTCC-Mobile และกรอก Username และ Password ที่ได้รับจากมหาวิทยาลัย


In [25]:
# Example question in Thai (roughly: "lost password").
b = answer_query_with_context(
    query="password หาย",
    df=df,
    document_embeddings=document_embeddings,
)
print(b)

Selected 4 document sections:
('it', 'เปลี่ยนรหัสแล้วเข้า mail ไม่ได้')
('it', 'Sign in ด้วย UTCC Email เข้า Webex Meetingไม่ได้\n\xa0\xa0กรณีที่ 1 ที่ยังไม่เคยลงทะเบียนสมัครเป็นสมาชิก\xa0\n\xa0\xa0กรณีที่ 2 ลืมรหัสผ่าน')
('it', 'ไม่สามารถ Login เข้าใช้งาน MS Teams')
('registrar', 'โปรโมชั่นสมัครเรียน')
ติดต่อสำนักบริการคอมพิวเตอร์


In [23]:
# Persist the computed embeddings so they can be reloaded without recomputing them.
with open('document_embeddings.pickle', mode='wb') as pickle_file:
    pickle.dump(document_embeddings, pickle_file)

In [24]:
# Read the embeddings dictionary back from the pickle file written by this notebook.
# SECURITY: pickle.load can execute arbitrary code — only load files you created yourself.
with open('document_embeddings.pickle', mode='rb') as pickle_file:
    document_embeddings_loaded = pickle.load(pickle_file)