In [1]:
import requests
import numpy as np
import pandas as pd
from nomic import embed
from io import StringIO
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# get sentences to embed
res = requests.get('https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/sick2014/SICK_train.txt')
data = pd.read_csv(StringIO(res.text), sep='\t')
sentences = data['sentence_A'].tolist()
s_B = data['sentence_B'].tolist()
sentences.extend(s_B)
urls = [
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.train.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2013/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/images.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2015/images.test.tsv'
]

for url in urls:
    res = requests.get(url)
    # extract to dataframe
    data = pd.read_csv(StringIO(res.text), sep='\t', header=None, on_bad_lines='skip')
    # add to columns 1 and 2 to sentences list
    sentences.extend(data[1].tolist())
    sentences.extend(data[2].tolist())
# remove duplicates and NaN
sentences = [word for word in list(set(sentences)) if type(word) is str]

In [3]:
matryoshka_dim = 512

model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
embeddings = model.encode(sentences, convert_to_tensor=True)
embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
embeddings = embeddings[:, :matryoshka_dim]
embeddings = F.normalize(embeddings, p=2, dim=1)
print(embeddings)

<All keys matched successfully>


tensor([[ 0.0151, -0.0018, -0.1929,  ...,  0.0677, -0.0241, -0.0785],
        [ 0.0769,  0.0335, -0.1990,  ...,  0.0183, -0.0697, -0.0217],
        [ 0.0484,  0.0420, -0.1929,  ..., -0.0279, -0.0099, -0.0255],
        ...,
        [ 0.0265,  0.0476, -0.2320,  ...,  0.0142, -0.0158,  0.0025],
        [-0.0165,  0.0853, -0.2448,  ..., -0.0296,  0.0234, -0.0337],
        [ 0.0349,  0.1102, -0.1354,  ...,  0.0139,  0.0238, -0.0248]],
       device='mps:0')


In [4]:
embeddings.shape

torch.Size([14504, 512])

In [5]:
embeddings[0][:10]

tensor([ 0.0151, -0.0018, -0.1929,  0.0800,  0.0008, -0.0029, -0.0320,  0.0174,
         0.0045,  0.0313], device='mps:0')

In [18]:
from typing import Any
from typing import List
from typing import Optional
from sqlalchemy import text
from sqlalchemy import Index
from sqlalchemy import Column
from sqlalchemy import String
from sqlalchemy import Integer
from sqlalchemy import ForeignKey
from sqlalchemy.orm import Mapped
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
from sqlalchemy.orm import relationship
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import DeclarativeBase
from sqlalchemy_utils import database_exists, create_database
from pgvector.sqlalchemy import Vector
import numpy

# constants
DBAPI = "psycopg2"

class Base(DeclarativeBase):
    idx = mapped_column(Integer, primary_key=True)
    # pass

class Embedding(Base):
    __tablename__ = "embedding"
    vector = mapped_column(Vector(512))

class Sentence(Base):
    __tablename__ = "sentence"
    sentence = mapped_column(String)

pgvector_url = f"postgresql+{DBAPI}://postgres:password@0.0.0.0:5432/"

engine = create_engine(pgvector_url, echo=False)
with engine.connect() as conn:
    conn.execute(text('CREATE EXTENSION IF NOT EXISTS vector'))
    conn.commit()

Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)

In [20]:
with Session(engine) as session:
    
    counter = 0
    for vector in embeddings:

        embedding = Embedding(
            idx=counter,
            vector=vector.cpu().detach().numpy())
        sentence = Sentence(
            idx=counter,
            sentence=sentences[counter])

        session.add(embedding)
        session.add(sentence)
        session.commit()
        counter+=1

In [21]:
index = 10000
with Session(engine) as session:
    embedding = session.execute(text(f"SELECT * from embedding WHERE idx={index}"))
    sentence = session.execute(text(f"SELECT * from sentence WHERE idx={index}"))
print(embedding.fetchall())
print(sentence.fetchall())

[(10000, '[-0.042250562,0.013554859,-0.2248957,-0.072864436,0.024702221,0.07813704,-0.036870662,-0.06482345,-0.06289808,-0.037916675,-0.031647258,0.02446297,-0 ... (6007 characters truncated) ... 0.038954,0.032870747,-0.02065682,-0.0016474384,0.055520337,-0.010037745,-0.04831924,-0.07428066,-0.00042063626,0.06300488,-0.085648544,-0.0082111275]')]
[(10000, 'An older dog and a younger one playing with a toy.')]


In [23]:
# create an index
# by default, pgvector uses exact nearest neighbor search
# which provides perfect recall
from sqlalchemy import Index

# from pgvector-python
# an HNSW is an approximate nearest neighbor search
# so you trade some recall accuracy for in return for speed
INDEX = 'hnsw' # heirarchical navigable small world
HNSW_m = 16 # max number of connections per layer (default 16)
HNSW_efc = 64 # the size of the dynamic candidate list for graph construction
# a higher efc provides better recall in exchange for build time / insert speed

# other option: ivfflat = inverted flat file
# note if using ivfflat, postgresql_with={'lists':n}
# the hnsw has better query performance than an ivfflat
# but has longer build times and higher resource demand

VECTOR_OPS = 'vector_l2_ops' # l2 norm / euclidean distance
# see numpy.linalg.norm

# other options: vector_ip_ops = inner product, vector_cosine_ops = cosine distance

index = Index(
    'index',
    Embedding.vector,
    postgresql_using='hnsw',
    postgresql_with={'m':HNSW_m, 'ef_construction':HNSW_efc},
    postgresql_ops={'vector':VECTOR_OPS}
)
index.create(engine)

In [34]:
with Session(engine) as session:
    print(session.execute(text(f"SELECT COUNT(*) from embedding")).all())
    print(session.execute(text(f"SELECT COUNT(*) from sentence")).all())

[(14504,)]
[(14504,)]
