In [5]:
import requests
import numpy as np
import pandas as pd
from nomic import embed
from io import StringIO
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [6]:
# get sentences to embed
res = requests.get('https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/sick2014/SICK_train.txt')
data = pd.read_csv(StringIO(res.text), sep='\t')
sentences = data['sentence_A'].tolist()
s_B = data['sentence_B'].tolist()
sentences.extend(s_B)
urls = [
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.train.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2013/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/images.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2015/images.test.tsv'
]

for url in urls:
    res = requests.get(url)
    # extract to dataframe
    data = pd.read_csv(StringIO(res.text), sep='\t', header=None, on_bad_lines='skip')
    # add to columns 1 and 2 to sentences list
    sentences.extend(data[1].tolist())
    sentences.extend(data[2].tolist())
# remove duplicates and NaN
sentences = [word for word in list(set(sentences)) if type(word) is str]

In [7]:
matryoshka_dim = 512

model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
embeddings = model.encode(sentences, convert_to_tensor=True)
embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
embeddings = embeddings[:, :matryoshka_dim]
embeddings = F.normalize(embeddings, p=2, dim=1)
print(embeddings)

<All keys matched successfully>


tensor([[-0.0282, -0.0123, -0.1956,  ...,  0.1055, -0.0318,  0.0212],
        [-0.0441,  0.1493, -0.2127,  ...,  0.0546,  0.0511,  0.0163],
        [ 0.0299,  0.0479, -0.2159,  ...,  0.0689, -0.0230,  0.0076],
        ...,
        [ 0.0186,  0.0362, -0.2138,  ...,  0.0231, -0.0703, -0.0233],
        [ 0.0685,  0.0123, -0.1868,  ..., -0.0210, -0.0420,  0.0241],
        [ 0.0411,  0.0771, -0.1670,  ...,  0.0456, -0.0304, -0.0005]],
       device='mps:0')


In [8]:
embeddings.shape

torch.Size([14504, 512])

In [9]:
embeddings[0][:10]

tensor([-0.0282, -0.0123, -0.1956, -0.0263,  0.0146, -0.0223, -0.0361,  0.0322,
        -0.0229,  0.0241], device='mps:0')

In [17]:
from typing import Any
from typing import List
from typing import Optional
from sqlalchemy import text
from sqlalchemy import Column
from sqlalchemy import String
from sqlalchemy import Integer
from sqlalchemy import ForeignKey
from sqlalchemy.orm import Mapped
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
from sqlalchemy.orm import relationship
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import DeclarativeBase
from sqlalchemy_utils import database_exists, create_database
from pgvector.sqlalchemy import Vector
import numpy

class Base(DeclarativeBase):
    pass

class Item(Base):
    __tablename__ = "embeddings"
    idx = mapped_column(Integer, primary_key=True)
    vector = mapped_column(Vector(512))
    sentence = mapped_column(String(255))

pgvector_url = "postgresql+psycopg2://postgres:password@0.0.0.0:5432/"

engine = create_engine(pgvector_url, echo=True)
with engine.connect() as conn:
    conn.execute(text('CREATE EXTENSION IF NOT EXISTS vector'))
    conn.commit()
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)

2024-07-16 01:08:12,524 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2024-07-16 01:08:12,525 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-07-16 01:08:12,526 INFO sqlalchemy.engine.Engine select current_schema()
2024-07-16 01:08:12,527 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-07-16 01:08:12,539 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2024-07-16 01:08:12,540 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-07-16 01:08:12,541 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-07-16 01:08:12,541 INFO sqlalchemy.engine.Engine CREATE EXTENSION IF NOT EXISTS vector
2024-07-16 01:08:12,542 INFO sqlalchemy.engine.Engine [generated in 0.00101s] {}
2024-07-16 01:08:12,543 INFO sqlalchemy.engine.Engine COMMIT
2024-07-16 01:08:12,544 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-07-16 01:08:12,547 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid =

In [None]:
with Session(engine) as session:
    counter = 0
    for vector in embeddings:
        item = Item(
            idx=counter,
            vector=vector.cpu().detach().numpy(),
            sentence=sentences[counter])
        session.add(item)
        session.commit()
        counter+=1

2024-07-16 01:08:16,142 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-07-16 01:08:16,144 INFO sqlalchemy.engine.Engine INSERT INTO embeddings (idx, vector, sentence) VALUES (%(idx)s, %(vector)s, %(sentence)s)
2024-07-16 01:08:16,144 INFO sqlalchemy.engine.Engine [generated in 0.00064s] {'idx': 0, 'vector': '[-0.028163576498627663,-0.012317497283220291,-0.19559697806835175,-0.026342296972870827,0.01462786365300417,-0.0223308727145195,-0.03606909140944481, ... (10491 characters truncated) ... 58,-0.021057704463601112,-0.0325763002038002,0.020463643595576286,-0.04858873039484024,0.10546409338712692,-0.03180122748017311,0.021155741065740585]', 'sentence': 'provide with a soundtrack or voiceover'}
2024-07-16 01:08:16,146 INFO sqlalchemy.engine.Engine COMMIT
2024-07-16 01:08:16,148 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-07-16 01:08:16,149 INFO sqlalchemy.engine.Engine INSERT INTO embeddings (idx, vector, sentence) VALUES (%(idx)s, %(vector)s, %(sentence)s)
2024-07-16 01:08

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



2024-07-16 01:08:19,684 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-07-16 01:08:19,685 INFO sqlalchemy.engine.Engine INSERT INTO embeddings (idx, vector, sentence) VALUES (%(idx)s, %(vector)s, %(sentence)s)
2024-07-16 01:08:19,685 INFO sqlalchemy.engine.Engine [cached since 3.542s ago] {'idx': 1442, 'vector': '[0.017806855961680412,0.058519382029771805,-0.1631082445383072,-0.10798762738704681,0.011559512466192245,0.04476289078593254,-0.028073711320757866,-0 ... (10478 characters truncated) ... ,-0.012780736200511456,-0.09682489186525345,-0.03908756375312805,-0.040364060550928116,0.09337405860424042,-0.0509970560669899,-0.019227247685194016]', 'sentence': 'Black and white photo of a girl on a sofa.'}
2024-07-16 01:08:19,685 INFO sqlalchemy.engine.Engine COMMIT
2024-07-16 01:08:19,686 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-07-16 01:08:19,686 INFO sqlalchemy.engine.Engine INSERT INTO embeddings (idx, vector, sentence) VALUES (%(idx)s, %(vector)s, %(sentence)s)
2024-07

In [12]:
{'idx': 0, 'vector': '[-0.044082753360271454,0.14926517009735107,-0.21274015307426453,0.007219878025352955,0.027714928612113,0.020999938249588013,-0.041637275367975235,0.0 ... (10476 characters truncated) ... 77,-0.010919809341430664,-0.037141285836696625,-0.0684182345867157,0.008810687810182571,0.054586511105298996,0.05105145275592804,0.01631416380405426]', 'sentence': 'provide with a soundtrack or voiceover'}

{'idx': 0,
 'vector': '[-0.044082753360271454,0.14926517009735107,-0.21274015307426453,0.007219878025352955,0.027714928612113,0.020999938249588013,-0.041637275367975235,0.0 ... (10476 characters truncated) ... 77,-0.010919809341430664,-0.037141285836696625,-0.0684182345867157,0.008810687810182571,0.054586511105298996,0.05105145275592804,0.01631416380405426]',
 'sentence': 'provide with a soundtrack or voiceover'}

In [15]:
with Session(engine) as session:
    session.execute(text("SELECT * from embeddings WHERE idx="))

2024-07-16 01:07:13,690 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-07-16 01:07:13,692 INFO sqlalchemy.engine.Engine SELECT * from embeddings
2024-07-16 01:07:13,692 INFO sqlalchemy.engine.Engine [generated in 0.00058s] {}
2024-07-16 01:07:13,694 INFO sqlalchemy.engine.Engine ROLLBACK
