In [6]:
%pip install -q sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [1]:
# Motivating example: a keyword filter such as
#   select * from table where text like '%internet%'
# versus the natural-language question
#   "What's the internet usage policy?"
# which this notebook uses as its sample query.

# Step 1: Define Sample Documents
# Document Corpus: every record carries the same keys (doc_id, section,
# content) so code that reads any of them works for all records.
documents = [
    {"doc_id": "1", "section": "Pay Policies", "content": "Employees are paid bi-weekly via direct deposit."},
    {"doc_id": "2", "section": "Leave of Absence", "content": "Employees must submit a leave request for approval."},
    # Records about internet use (the topic of the sample query).
    {"doc_id": "3", "section": "Internet Use", "content": "Company internet must be used for work-related tasks only."},
    {"doc_id": "4", "section": "Internet Use", "content": "Company internet is a broadband internet."},
    {"doc_id": "5", "section": "Break at Work", "content": "Employees can take an hour break."},
    {"doc_id": "6", "section": "Harassment", "content": "Interact with each employee with Respect"},
]

# Step 2: Get Content Texts
# Same order as `documents`, so position i here corresponds to documents[i].
content_corpus = [doc["content"] for doc in documents]

content_corpus

['Employees are paid bi-weekly via direct deposit.',
 'Employees must submit a leave request for approval.',
 'Company internet must be used for work-related tasks only.',
 'Company internet is a broadband internet.',
 'Employees can take an hour break.',
 'Interact with each employee with Respect']

In [2]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model identified by "all-MiniLM-L6-v2".
model = SentenceTransformer("all-MiniLM-L6-v2")
# Embed every document text in one call; the recorded output below shows one
# row per document (shape (6, 384), dtype float32).
doc_vectors = model.encode(content_corpus)

# Bare expression so the notebook displays the embedding matrix.
doc_vectors

  from .autonotebook import tqdm as notebook_tqdm


array([[ 0.02472511, -0.00908149,  0.03887128, ...,  0.01965649,
         0.0426001 , -0.02707138],
       [ 0.03315501,  0.04853382,  0.0473627 , ...,  0.10182007,
         0.09159277,  0.00358374],
       [-0.07135911, -0.03066472,  0.03183769, ..., -0.04109801,
         0.06524776, -0.00688535],
       [-0.00383738, -0.02336751,  0.02958678, ..., -0.04415286,
         0.12559086, -0.0313985 ],
       [-0.01790442,  0.01495847,  0.08163834, ..., -0.03217229,
        -0.00513649,  0.05279535],
       [-0.00240888,  0.03361149, -0.06162646, ...,  0.04830882,
         0.03707638, -0.01683049]], shape=(6, 384), dtype=float32)

In [3]:
# Step 3: User Query and Semantic Matching

# The question to answer; it is embedded with the same model as the documents.
query = "What's the internet usage policy?"
# encode() is given a one-element list, so [0] picks out the single query embedding.
query_vec = model.encode([query])[0]
# Bare expression so the notebook displays the query embedding.
query_vec

array([ 1.05987890e-02, -4.19098400e-02, -2.66205557e-02, -4.54706289e-02,
       -3.23202912e-05,  1.56041514e-02,  1.17841855e-01, -3.23812664e-02,
        9.29475995e-04,  5.16253430e-03,  2.37195306e-02,  8.78969803e-02,
       -3.00894137e-02, -2.03393288e-02,  2.84307934e-02,  1.44434860e-02,
        1.61064249e-02, -7.58647621e-02, -2.63159163e-02, -3.53143699e-02,
        8.43807235e-02, -3.55804041e-02,  1.60689130e-02, -2.03598916e-04,
       -9.33342334e-03,  3.89167368e-02, -2.84239706e-02, -1.37804495e-03,
       -3.74972373e-02,  3.70447077e-02,  3.77253094e-03, -4.27399650e-02,
        3.53124412e-03, -4.11898866e-02, -7.60837942e-02, -1.03469610e-01,
       -9.11574438e-02, -1.55567715e-03, -2.76062135e-02,  3.15038748e-02,
        3.04190014e-02, -7.71862715e-02, -2.46530026e-03,  9.84697193e-02,
        5.89628331e-02,  2.19075438e-02,  3.95881338e-03,  1.52551150e-02,
       -1.16527511e-03,  3.27795781e-02,  1.06050506e-01,  3.64240259e-02,
        3.72969955e-02,  

In [4]:
import numpy as np

# Score the query embedding against every document embedding and flatten the
# result into a 1-D NumPy array (one score per document, in corpus order).
similarities = np.squeeze(np.asarray(model.similarity(query_vec, doc_vectors)))
similarities

array([0.12369897, 0.09525196, 0.4491283 , 0.47815692, 0.11286639,
       0.04245535], dtype=float32)

In [5]:
# Rank document positions from most to least similar and keep the best three.
top_3_indices = np.flip(np.argsort(similarities))[:3]
print(top_3_indices)
# Index the scores with the ranked positions so they come out in the same order.
top_scores = similarities[top_3_indices]
top_scores

[3 2 0]


array([0.47815692, 0.4491283 , 0.12369897], dtype=float32)

In [6]:
# Look up the text of each top-ranked document, best match first.
top_docs = [documents[idx]["content"] for idx in top_3_indices]

print(top_docs)
# Concatenate the retrieved passages into the single context string that the
# LLM prompt interpolates.
context = ", ".join(top_docs)
context

['Company internet is a broadband internet.', 'Company internet must be used for work-related tasks only.', 'Employees are paid bi-weekly via direct deposit.']


'Company internet is a broadband internet., Company internet must be used for work-related tasks only., Employees are paid bi-weekly via direct deposit.'

In [7]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables from ../.env.local; override=True lets values in
# that file replace variables already set in the process environment.
load_dotenv(override=True, dotenv_path="../.env.local")
# Read the API key from the environment instead of hard-coding it
# (os.getenv returns None when the variable is unset).
my_api_key = os.getenv("OPENAI_API_KEY")

# Shared OpenAI client used by the LLM calls later in the notebook.
my_client = OpenAI(api_key=my_api_key)

def ask_question_open_ai(prompt, retrieved_context=None):
    """Answer a user question with the chat model, grounded in retrieved context.

    Args:
        prompt: The user's question; sent verbatim as the "User Question"
            part of the user message.
        retrieved_context: Text the model should base its answer on. When
            omitted (None), the notebook-level ``context`` variable is used,
            so existing one-argument calls keep working unchanged.

    Returns:
        The text content of the first choice in the chat completion response.
    """
    if retrieved_context is None:
        # Resolved at call time, so the most recent retrieval result is used.
        retrieved_context = context

    llm_response = my_client.chat.completions.create(
        model="gpt-5-nano",
        messages=[
            {"role": "system", "content": '''
             You are an assistant who answers only based on the given context.
             '''},
            # Use the function argument rather than the notebook-level `query`,
            # so the function answers the question it is actually given.
            {"role": "user", "content": f"Context: {retrieved_context}\n\n User Question: {prompt}"}
        ]
    )
    return llm_response.choices[0].message.content

In [8]:
# Echo the question, then send it to the LLM and keep the answer for the
# reporting and evaluation cells.
print(query)
response = ask_question_open_ai(query)

What's the internet usage policy?


In [9]:
# Show the question, the retrieved context, and the model's answer together;
# each argument is printed on its own line.
print(
    f"User query: {query}",
    f"Context: {context}",
    f"\n\nOpen AI Response: {response}",
    sep="\n",
)

User query: What's the internet usage policy?
Context: Company internet is a broadband internet., Company internet must be used for work-related tasks only., Employees are paid bi-weekly via direct deposit.


Open AI Response: - The company internet is broadband.
- It must be used for work-related tasks only.


In [11]:
# Reference data: the answer we expect, and the answer the model produced.
expected_answer = "The internet usage policy states that company internet must be used for work-related tasks only."
actual_response = response

# LLM as a judge: call the Chat Completions API to compare expected vs actual.
# The system message and the user prompt request the same output format
# (verdict first, then a short explanation) so the model is not given
# conflicting instructions.
comparison_prompt = f"""
You are a strict evaluator. Compare the actual response to the expected answer.
Expected Answer: {expected_answer}
Actual Response: {actual_response}
Do they match in meaning? Start your answer with Yes or No, then give a one-sentence explanation.
"""
comparison_response = my_client.chat.completions.create(
    model="gpt-5-nano",
    messages=[
        {"role": "system", "content": "You are a strict evaluator. Start your answer with Yes or No, then give a one-sentence explanation."},
        {"role": "user", "content": comparison_prompt}
    ]
)

print(f"\n\nEvaluation Result: {comparison_response.choices[0].message.content}")



Evaluation Result: Yes — both convey that company internet usage should be for work-related tasks only; the actual response adds an extra, unrelated line about broadband but does not contradict the policy.
