In [1]:
import requests
import numpy as np
import pandas as pd
from nomic import embed
from io import StringIO
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# get sentences to embed
res = requests.get('https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/sick2014/SICK_train.txt')
data = pd.read_csv(StringIO(res.text), sep='\t')
sentences = data['sentence_A'].tolist()
s_B = data['sentence_B'].tolist()
sentences.extend(s_B)
urls = [
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.train.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2013/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/images.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2015/images.test.tsv'
]

for url in urls:
    res = requests.get(url)
    # extract to dataframe
    data = pd.read_csv(StringIO(res.text), sep='\t', header=None, on_bad_lines='skip')
    # add to columns 1 and 2 to sentences list
    sentences.extend(data[1].tolist())
    sentences.extend(data[2].tolist())
# remove duplicates and NaN
sentences = [word for word in list(set(sentences)) if type(word) is str]

In [3]:
matryoshka_dim = 512

model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
embeddings = model.encode(sentences, convert_to_tensor=True)
embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
embeddings = embeddings[:, :matryoshka_dim]
embeddings = F.normalize(embeddings, p=2, dim=1)
print(embeddings)

<All keys matched successfully>


tensor([[ 0.0151, -0.0018, -0.1929,  ...,  0.0677, -0.0241, -0.0785],
        [ 0.0769,  0.0335, -0.1990,  ...,  0.0183, -0.0697, -0.0217],
        [ 0.0484,  0.0420, -0.1929,  ..., -0.0279, -0.0099, -0.0255],
        ...,
        [ 0.0265,  0.0476, -0.2320,  ...,  0.0142, -0.0158,  0.0025],
        [-0.0165,  0.0853, -0.2448,  ..., -0.0296,  0.0234, -0.0337],
        [ 0.0349,  0.1102, -0.1354,  ...,  0.0139,  0.0238, -0.0248]],
       device='mps:0')


In [4]:
embeddings.shape

torch.Size([14504, 512])

In [5]:
embeddings[0][:10]

tensor([ 0.0151, -0.0018, -0.1929,  0.0800,  0.0008, -0.0029, -0.0320,  0.0174,
         0.0045,  0.0313], device='mps:0')

In [6]:
from typing import Any
from typing import List
from typing import Optional
from sqlalchemy import text
from sqlalchemy import Column
from sqlalchemy import String
from sqlalchemy import Integer
from sqlalchemy import ForeignKey
from sqlalchemy.orm import Mapped
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
from sqlalchemy.orm import relationship
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import DeclarativeBase
from sqlalchemy_utils import database_exists, create_database
from pgvector.sqlalchemy import Vector
import numpy

class Base(DeclarativeBase):
    pass

class Item(Base):
    __tablename__ = "embeddings"
    idx = mapped_column(Integer, primary_key=True)
    vector = mapped_column(Vector(512))
    sentence = mapped_column(String(255))

pgvector_url = "postgresql+psycopg2://postgres:password@0.0.0.0:5432/"

engine = create_engine(pgvector_url, echo=False)
with engine.connect() as conn:
    conn.execute(text('CREATE EXTENSION IF NOT EXISTS vector'))
    conn.commit()
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)

In [7]:
with Session(engine) as session:
    counter = 0
    for vector in embeddings:
        item = Item(
            idx=counter,
            vector=vector.cpu().detach().numpy(),
            sentence=sentences[counter])
        session.add(item)
        session.commit()
        counter+=1
        if counter > 50:
            break

In [8]:
{'idx': 0, 'vector': '[-0.044082753360271454,0.14926517009735107,-0.21274015307426453,0.007219878025352955,0.027714928612113,0.020999938249588013,-0.041637275367975235,0.0 ... (10476 characters truncated) ... 77,-0.010919809341430664,-0.037141285836696625,-0.0684182345867157,0.008810687810182571,0.054586511105298996,0.05105145275592804,0.01631416380405426]', 'sentence': 'provide with a soundtrack or voiceover'}

{'idx': 0,
 'vector': '[-0.044082753360271454,0.14926517009735107,-0.21274015307426453,0.007219878025352955,0.027714928612113,0.020999938249588013,-0.041637275367975235,0.0 ... (10476 characters truncated) ... 77,-0.010919809341430664,-0.037141285836696625,-0.0684182345867157,0.008810687810182571,0.054586511105298996,0.05105145275592804,0.01631416380405426]',
 'sentence': 'provide with a soundtrack or voiceover'}

In [9]:
with Session(engine) as session:
    result = session.execute(text("SELECT * from embeddings WHERE idx=2"))

In [11]:
result.fetchall()

[(2, '[0.048366994,0.041952204,-0.19287825,-0.029902058,0.070920214,0.029000374,0.04912086,0.029282961,0.022841007,0.011761951,-0.012449452,0.039000433,0.0 ... (6023 characters truncated) ... 4,-0.028026879,0.034993317,0.01885938,0.01982503,0.055232923,-0.03998982,0.051346086,-0.07368281,-0.049254645,-0.027886733,-0.009930388,-0.025505267]', 'pass into by penetrating or permeating')]