In [1]:
%%capture
%pip install --upgrade jupyter ipywidgets # due to warning: 
#'TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
# See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm'

In [2]:
import os

import numpy as np
from sentence_transformers import SentenceTransformer

# Switch the working directory to the parent folder before the project-local
# imports below (db_utils, embed, retrieval) — presumably the notebook lives in
# a subfolder of the project root; confirm.
# NOTE(review): this is relative to the current directory, so every re-run of
# this cell moves one more level up. Restart the kernel before re-running.
os.chdir('..')
import pandas as pd

from db_utils import (create_db, create_embeddings_table,
                      create_pgvector_extension, delete_db,
                      insert_data_into_table, pg_connection)
from embed import HFModels
from retrieval import retrieve_from_pgvector

In [3]:
def generate_encodings(
        sentences: list, 
        model: "SentenceTransformer | str" = HFModels.default.value,
        save_to_file: bool = True, 
        filename: str = 'example_embeddings.npy'
        ) -> np.ndarray:
    """Return sentence embeddings, using an on-disk ``.npy`` file as a cache.

    If ``filename`` exists and holds one row per input sentence, the stored
    array is returned without loading any model. Otherwise the sentences are
    encoded and, when ``save_to_file`` is true, written back to ``filename``.

    Args:
        sentences: Texts to embed.
        model: Either an already-constructed ``SentenceTransformer`` or a value
            accepted by the ``SentenceTransformer`` constructor (the default is
            ``HFModels.default.value``).
        save_to_file: Whether to persist freshly computed embeddings.
        filename: Path of the cache file, used for both loading and saving.

    Returns:
        The embeddings array, one row per sentence.

    Note:
        The cache is validated by row count only; different sentences of the
        same count will still hit a stale cache. Delete the file to force a
        re-computation.
    """
    try:
        cached = np.load(filename)
    except FileNotFoundError:
        print(f"File '{filename}' not found. Generating embeddings...")
    else:
        # Only trust the cache when it has exactly one row per input sentence.
        if cached.shape[:1] == (len(sentences),):
            return cached
        print(f"File '{filename}' does not match the {len(sentences)} input sentences. "
              "Regenerating embeddings...")

    # Honour the caller's model: reuse an instance as-is, otherwise build one
    # from the given identifier (the original always rebuilt the default model).
    encoder = model if isinstance(model, SentenceTransformer) else SentenceTransformer(model)
    # One row per sentence; the original comment notes 384 columns, which
    # presumably applies to the default model only — confirm for other models.
    embeddings: np.ndarray = encoder.encode(sentences=sentences)
    if save_to_file:
        # Write to the same path that is read above so the cache is actually reused.
        np.save(filename, embeddings)

    return embeddings

In [4]:
# Names shared by the rest of the notebook.
db_name = 'test_db'
tb_name = 'pg_embeddings_test'

# Provision the database, enable pgvector in it, and create the embeddings table.
# NOTE(review): create_embeddings_table is not given tb_name; the log output
# suggests it defaults to 'pg_embeddings_test' — confirm in db_utils.
create_db(db_name=db_name)
create_pgvector_extension(db_name)
create_embeddings_table(db_name)

# A single connection is kept open for the inspection queries in later cells.
CONN = pg_connection(db_name)

# Show the table contents as the cell output.
pd.read_sql_query(f'SELECT * FROM {tb_name}', CONN)

INFO:root:Database 'test_db' created successfully
INFO:root:pgvector extension created
INFO:root:Embeddings table 'pg_embeddings_test' created.
  pd.read_sql_query(f'SELECT * FROM {tb_name}', CONN)


Unnamed: 0,id,chunk,embedding


In [5]:
# Example corpus: two short sentences to embed and store.
sentences = [
    "I'm a physicist and a Data Scientist",
    "I don't linke the Copenhagen interpretation",
]

# Encode (or load cached vectors) and convert to plain Python lists before inserting.
embeddings = generate_encodings(sentences).tolist()
insert_data_into_table(db_name, sentences, embeddings, tb_name)

# Show the table contents after the insert.
pd.read_sql_query(f'SELECT * FROM {tb_name}', CONN)

INFO:root:Data inserted into table 'pg_embeddings_test'. Failed chunks: 0
  pd.read_sql_query(f'SELECT * FROM {tb_name}', CONN)


Unnamed: 0,id,chunk,embedding
0,c298d6b0-fa37-438e-92c2-879ab879a6b7,I'm a physicist and a Data Scientist,"[-0.048952606,-0.057101876,0.028381784,0.09913..."
1,d9f20772-f798-4781-b379-30d0461a37de,I don't linke the Copenhagen interpretation,"[-0.0031696414,0.07755055,0.009189781,0.029925..."


In [6]:
# Example of retrieval: look up the stored chunks for a free-text query.
# The database is referenced through db_name (not a repeated literal) so this
# cell always queries the database that the setup cell created.

query = 'copenhagen'
res = retrieve_from_pgvector(query, db_name, tb_name)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# Print each hit with a 1-based rank. Each hit is shown as a (chunk, score) tuple;
# the score looks like a distance (lower = closer) — confirm in retrieve_from_pgvector.
for rank, hit in enumerate(res, start=1):
    print(f"result {rank}: {hit}")

result 1: ("I don't linke the Copenhagen interpretation", 0.35559275084625686)
result 2: ("I'm a physicist and a Data Scientist", 0.8650325387716256)


In [8]:
# Close the connection opened in the setup cell before the database is deleted.
CONN.close()

In [9]:
# Clean up: remove the database created for this example.
delete_db(db_name)

INFO:root:Database test_db deleted
