In [1]:
import os
import json
from datasets import load_dataset
from time import time
from dotenv import load_dotenv
from rich.console import Console
from psycopg import sql


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys
# Add the sibling ``../pgml/`` directory to the module search path before the
# pgml imports below run.
# NOTE(review): presumably this points at a local checkout of the pgml SDK;
# the path is relative to the kernel's working directory — confirm.
sys.path.append('../pgml/')

from pgml import Database
from pgml.dbutils import run_select_statement


In [3]:
# Fallback connection string, used only when PGML_CONNECTION is not set.
local_pgml = "postgres://postgres@127.0.0.1:5433/pgml"

# load_dotenv() runs before the environment lookup so that anything it loads
# is visible to the PGML_CONNECTION check below.
load_dotenv()
conninfo = os.getenv("PGML_CONNECTION", local_pgml)
db = Database(conninfo)

# Rich console used by the later cells for printing results.
console = Console()


In [4]:
# Name of the collection this notebook works with; the same name is passed to
# db.archive_collection() in the last cell.
collection_name = "squad_collection"
# NOTE(review): per the method name this returns the existing collection or
# creates it if missing — confirm against the pgml SDK.
collection = db.create_or_get_collection(collection_name)


In [5]:
# Load the SQuAD training split, then keep one row per distinct context
# passage (many questions share the same passage).
squad_train = load_dataset("squad", split="train")
data = squad_train.to_pandas().drop_duplicates(subset=["context"])

# Shape each remaining row into the {"id", "text", "title"} record that is
# upserted into the collection in the next cell.
documents = []
for row in data.to_dict(orient="records"):
    documents.append(
        {"id": row["id"], "text": row["context"], "title": row["title"]}
    )


In [6]:
# Only the first 200 documents are ingested (presumably to keep the demo
# quick — raise the slice bound to index more).
sample_documents = documents[:200]
collection.upsert_documents(sample_documents)

# Chunk the upserted documents.
collection.generate_chunks()


In [7]:
# Generate embeddings for the collection.
# NOTE(review): expected to embed the chunks produced by generate_chunks() in
# the previous cell — confirm model/defaults against the pgml SDK.
collection.generate_embeddings()


In [8]:
# The question is reused later when building the question-answering call.
query = "Who won more than 20 grammy awards?"

# Time the vector search itself; `start` and `_end` are read by the next cell
# to report the elapsed time.
start = time()
results = collection.vector_search(query, top_k=5)
_end = time()


In [9]:
# Header, raw search results, then the elapsed time measured in the
# previous cell.
search_header = "\nResults for '%s'" % query
console.print(search_header, style="bold")
console.print(results)

elapsed_seconds = _end - start
console.print("Query time = %0.3f" % elapsed_seconds)


In [10]:
# Get the context passage and use pgml.transform to get short answer to the question
conn = db.pool.getconn()
context = " ".join(results[0]["chunk"].strip().split())
context = context.replace('"', '\\"').replace("'", "''")


In [11]:
# `context` arrives already escaped for embedding in a JSON string inside a
# single-quoted SQL literal; the question needs the same treatment, otherwise
# a question containing ', " or \ breaks the statement (or injects SQL).
# Backslashes are escaped first so the backslashes added for the double
# quotes are not doubled again. `query` itself is left untouched because it
# is printed verbatim later.
escaped_query = (
    query.replace("\\", "\\\\")
    .replace('"', '\\"')
    .replace("'", "''")
)

# Ask pgml.transform to run the question-answering task on a single JSON
# input holding the question and the retrieved context passage.
select_statement = """SELECT pgml.transform(
    'question-answering',
    inputs => ARRAY[
        '{
            \"question\": \"%s\",
            \"context\": \"%s\"
        }'
    ]
) AS answer;""" % (
    escaped_query,
    context,
)


In [12]:
# Execute the statement on the borrowed connection. The finally clause
# guarantees the connection goes back to the pool even when the query raises,
# so a failed run does not leak a pooled connection.
try:
    results = run_select_statement(conn, select_statement)
finally:
    db.pool.putconn(conn)


In [13]:
# Show the question followed by whatever the SELECT statement returned.
answer_header = "\nResults for query '%s'" % query
console.print(answer_header)
console.print(results)


In [None]:
# Clean-up step: archive the collection used by this notebook.
# NOTE(review): what "archive" does to the stored data is defined by
# pgml.Database — confirm before running against a shared database.
db.archive_collection(collection_name)
