In [1]:
%%capture
# !uv pip install -r ../requirements.txt # if using uv
!pip install -r ../requirements.txt 

In [2]:
# kill all connections to the database before running this notebook

import psycopg
import os 

# Connect to the 'postgres' database (not 'test_db' itself) with credentials read
# from the environment, then ask the server to terminate every other session
# that is attached to 'test_db'.
with psycopg.connect(
    dbname='postgres',
    user=os.getenv('POSTGRES_USER'),
    password=os.getenv('POSTGRES_PASSWORD'),
    host='localhost',
) as admin_conn:
    # Autocommit: the statement is sent without opening an explicit transaction.
    admin_conn.autocommit = True
    with admin_conn.cursor() as admin_cur:
        # `pid <> pg_backend_pid()` excludes the session issuing this query.
        admin_cur.execute("""
            SELECT pg_terminate_backend(pid)
            FROM pg_stat_activity
            WHERE datname = 'test_db'
              AND pid <> pg_backend_pid();
        """)


In [3]:
import os
# Move the working directory two levels up so the project modules imported
# below (db_utils, embed, retrieval) can be imported.
# NOTE(review): this is a relative move, so re-running the cell climbs two more levels.
os.chdir(os.path.join(os.pardir, os.pardir))

import numpy as np
from sentence_transformers import SentenceTransformer


import pandas as pd

from db_utils import (create_db, create_embeddings_table,
                      create_pgvector_extension, delete_db,
                      insert_data_into_table, pg_connection)
from embed import HFModels
from retrieval import semantic

In [4]:
def generate_encodings(
        sentences: list, 
        model: "SentenceTransformer | str" = HFModels.default.value,
        save_to_file: bool = True, 
        filename: str = 'data/example_embeddings.npy'
        ) -> np.ndarray:
    """Return one embedding per sentence, using `filename` as an on-disk cache.

    If `filename` exists and holds as many rows as there are sentences, the
    cached array is returned without loading any model. Otherwise the
    sentences are encoded and, when `save_to_file` is true, the result is
    written back to `filename` so the next call can reuse it.

    Args:
        sentences: Texts to embed.
        model: Either an already-built SentenceTransformer or a value accepted
            by the SentenceTransformer constructor (the default is
            HFModels.default.value).
        save_to_file: Whether to store freshly computed embeddings in `filename`.
        filename: Path of the .npy cache file used for both loading and saving.

    Returns:
        Array with one row per sentence. The original author's note gives the
        shape as (len(sentences), 384) for the default model.
    """
    try:
        cached = np.load(filename)
    except FileNotFoundError:
        print(f"File '{filename}' not found. Generating embeddings...")
    else:
        # Only trust the cache when it has exactly one row per input sentence;
        # a stale file from a different sentence list would otherwise be
        # returned silently.
        if cached.shape[:1] == (len(sentences),):
            print(f"Loaded embeddings from '{filename}'")
            return cached
        print(f"Embeddings in '{filename}' do not match the {len(sentences)} sentences. Regenerating...")

    # Honour the `model` argument: reuse a ready encoder, otherwise build one from it.
    encoder = model if isinstance(model, SentenceTransformer) else SentenceTransformer(model)
    embeddings: np.ndarray = encoder.encode(sentences=sentences)

    if save_to_file:
        # Save to the same path the cache is read from, creating its folder if needed.
        os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
        np.save(filename, embeddings)

    return embeddings

In [5]:
# Names used by the rest of the notebook. Per the recorded log output,
# create_embeddings_table creates a table called 'pg_embeddings_test'.
db_name = 'test_db'
tb_name = 'pg_embeddings_test'

# Rebuild the database from scratch: drop it, recreate it, enable pgvector,
# then create the embeddings table.
delete_db(db_name) # delete if exists
create_db(db_name=db_name)
create_pgvector_extension(db_name)
create_embeddings_table(db_name)

# Keep one connection open for the inspection queries; it is closed at the end.
CONN = pg_connection(db_name)

# Display the freshly created table.
pd.read_sql_query(sql=f'SELECT * FROM {tb_name}', con=CONN)

INFO:root:Database test_db deleted
INFO:root:Database 'test_db' created successfully
INFO:root:pgvector extension created
INFO:root:Embeddings table 'pg_embeddings_test' created.
  pd.read_sql_query(f'SELECT * FROM {tb_name}', CONN)


Unnamed: 0,id,chunk,embedding


In [6]:
# Two example chunks to store in the embeddings table.
sentences = [
    "I'm a physicist and a Data Scientist",
    "I don't linke the Copenhagen interpretation",
]

# Encode the sentences and turn the array into nested Python lists before inserting.
embeddings = generate_encodings(sentences).tolist()
insert_data_into_table(db_name, sentences, embeddings, tb_name)

# Display the table contents after the insert.
pd.read_sql_query(sql=f'SELECT * FROM {tb_name}', con=CONN)

INFO:root:Data inserted into table 'pg_embeddings_test'. Failed chunks: 0


Loaded embeddings from 'data/example_embeddings.npy'


  pd.read_sql_query(f'SELECT * FROM {tb_name}', CONN)


Unnamed: 0,id,chunk,embedding
0,07b101d8-108c-47e3-92fa-98b2a3a4d6a7,I'm a physicist and a Data Scientist,"[-0.048952606,-0.057101876,0.028381784,0.09913..."
1,15b34629-043c-4a46-805e-bf85d12a179a,I don't linke the Copenhagen interpretation,"[-0.0031696414,0.07755055,0.009189781,0.029925..."


In [7]:
# example of retrieval
# Query the table populated above; db_name is reused instead of repeating the
# 'test_db' literal so the notebook keeps working if the database name changes.
# NOTE(review): judging by the printed results, each item is a (chunk, score)
# pair and lower scores rank first — presumably a distance; confirm in retrieval.semantic.

query = 'copenhagen'
res = semantic(query, db_name, tb_name)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# Print the retrieved items in the order returned, numbered from 1.
for rank, hit in enumerate(res, start=1):
    print(f"result {rank}: {hit}")

result 1: ("I don't linke the Copenhagen interpretation", 0.35559275084625686)
result 2: ("I'm a physicist and a Data Scientist", 0.8650325387716256)


In [9]:
# Close the connection opened with pg_connection(db_name) for the inspection queries.
CONN.close()

In [10]:
## uncomment the line below if you want to delete the database
# delete_db(db_name)