In [None]:
! pip install pgvector psycopg-binary psycopg fastembed

In [None]:
from pgvector.psycopg import register_vector
import psycopg
import os
import sys
import csv
from fastembed import TextEmbedding
from typing import List
import numpy as np

In [None]:
# Connection settings come from the environment so that no credentials live in the notebook.
# A variable that is not set yields None for that setting.
connection_settings = {
    "dbname": os.environ.get("POSTGRESQL_DB"),
    "host": os.environ.get("POSTGRESQL_HOST"),
    "user": os.environ.get("POSTGRESQL_USER"),
    "password": os.environ.get("POSTGRESQL_USER_PASSWORD"),
}

# The connection is opened in autocommit mode; no explicit commit() calls appear in this notebook.
conn = psycopg.connect(autocommit=True, **connection_settings)

In [None]:
# SQL used to (re)build the schema. The table is dropped first, so re-running this cell
# always starts from an empty `documents` table.
enable_extension_sql = 'CREATE EXTENSION IF NOT EXISTS vector;'
drop_table_sql = 'DROP TABLE IF EXISTS documents;'
create_table_sql = 'CREATE TABLE documents (id bigserial PRIMARY KEY, author text, title text, description text, embedding vector(384));'

conn.execute(enable_extension_sql)
# Register pgvector's type adapters on this connection. This is done after the extension is
# enabled -- presumably because the vector type has to exist in the database first.
register_vector(conn)
conn.execute(drop_table_sql)
conn.execute(create_table_sql)

In [None]:
# Load the whole dataset into memory as a list of dicts keyed by the CSV header row.
# The `with` block closes the file once reading is done, and newline='' lets the csv
# module handle line endings itself, so quoted fields containing newlines parse correctly.
with open('dataset.csv', newline='') as dataset_file:
    books = list(csv.DictReader(dataset_file))

In [None]:
# The text that gets embedded for each book is its description.
descriptions = [book["description"] for book in books]

# Embedding model used for both the documents and, later, the search query.
embedding_model = TextEmbedding(model_name="BAAI/bge-small-en")

# Materialise the result of embed() into a list so individual vectors can be picked out later.
# NOTE(review): the vectors are expected to have 384 dimensions to fit the vector(384) column -- confirm.
embeddings: List[np.ndarray] = [*embedding_model.embed(descriptions)]

In [None]:
# Each book needs exactly one embedding; fail loudly before touching the database otherwise.
assert len(books) == len(embeddings), "books and embeddings are out of sync"

insert_sql = 'INSERT INTO documents (author, title, description, embedding) VALUES (%s, %s, %s, %s)'

# Pair every book with its embedding directly instead of indexing into a parallel list.
rows = [
    (book["author"], book["title"], book["description"], vector)
    for book, vector in zip(books, embeddings)
]

# Send the whole batch through one cursor rather than issuing a separate
# conn.execute() call for every row.
with conn.cursor() as cur:
    cur.executemany(insert_sql, rows)

Query the Postgres database. The cell below runs a search for `drama about people and unhappy love` and displays the results.

This query performs a semantic search against the `documents` table in Postgres, returning at most two results: the rows whose embeddings are closest to the embedding of your query text (ordered by pgvector's `<->` distance operator).
Each result is printed in the following format, followed by a line of dashes:

- Title: Title of the book, Author: Author of the book
- Book description as stored in the document's `description` column.

In [None]:
# Embed the search phrase with the same model that produced the document embeddings.
query_text = "drama about people and unhappy love"
[query_vector] = embedding_model.embed([query_text])

# Order the rows by the `<->` operator between the stored embedding and the query vector,
# keeping the first two.
search_sql = 'SELECT title, author, description FROM documents ORDER BY embedding <-> %s LIMIT 2'
response = conn.execute(search_sql, (query_vector,)).fetchall()

# Print a heading, the description, then a dashed separator for every hit.
for title, author, description in response:
    print("Title: {}, Author: {}".format(title, author), description, "---------", sep="\n")