In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import psycopg2
import numpy as np

# Load embedding model
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        output = model(**inputs)
    embedding = output.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embedding

# PostgreSQL connection
conn = psycopg2.connect(
    dbname="cleaned_meta_data_db",
    user="rg5073",
    password="rg5073pass",
    host="129.114.27.112",  # or container IP
    port="5432"
)
cursor = conn.cursor()

# Fetch chunks
cursor.execute("SELECT paper_id, chunk_id, chunk_data FROM arxiv_chunks_backup;")
rows = cursor.fetchall()

for paper_id, chunk_id, text in rows:
    vector = get_embedding(text)
    vector_str = list(map(float, vector))
    cursor.execute(
        "UPDATE arxiv_chunks_backup SET chunk_embedding = %s WHERE paper_id = %s AND chunk_id = %s",
        (vector_str, paper_id, chunk_id)
    )

conn.commit()
cursor.close()
conn.close()


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import psycopg2
import numpy as np
from tqdm.auto import tqdm  

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer   = AutoTokenizer.from_pretrained(model_name)
model       = AutoModel.from_pretrained(model_name)

def get_embedding(text: str) -> np.ndarray:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        output = model(**inputs)
    return output.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

In [30]:
conn = psycopg2.connect(
    dbname="cleaned_meta_data_db",
    user="rg5073",
    password="rg5073pass",
    host="129.114.27.112",
    port="5432"
)
cursor = conn.cursor()


In [4]:
cursor.execute("SELECT COUNT(*) FROM arxiv_chunks_eval_4;")
total_rows = cursor.fetchone()[0]

In [5]:
pip install pgvector psycopg2-binary


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [6]:
from pgvector.psycopg2 import register_vector, Vector 

ImportError: cannot import name 'Vector' from 'pgvector.psycopg2' (/Users/riyagarg/Library/Python/3.9/lib/python/site-packages/pgvector/psycopg2/__init__.py)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import psycopg2
from pgvector.psycopg2 import register_vector
from tqdm.auto import tqdm
import numpy as np

# 1. Model setup
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer  = AutoTokenizer.from_pretrained(model_name)
model      = AutoModel.from_pretrained(model_name)

def get_embedding(text: str) -> np.ndarray:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        emb = model(**inputs).last_hidden_state.mean(dim=1).squeeze()
    return emb.cpu().numpy().astype(np.float32)  # ensure float32

# 2. Connect & register pgvector
conn = psycopg2.connect(
    dbname="cleaned_meta_data_db",
    user="rg5073",
    password="rg5073pass",
    host="129.114.27.112",
    port="5432",
)
register_vector(conn)
cur = conn.cursor()

# 3. Load all rows into memory once
cur.execute("SELECT paper_id, chunk_id, chunk_data FROM arxiv_chunks_eval_4;")
rows = cur.fetchall()         # now rows is a Python list

# 4. Iterate with tqdm
for paper_id, chunk_id, text in tqdm(rows, desc="Embedding", unit="chunk"):
    vec = get_embedding(text)
    cur.execute(
        """
        UPDATE arxiv_chunks_eval_4
           SET chunk_embedding = %s
         WHERE paper_id = %s AND chunk_id = %s
        """,
        (vec, paper_id, chunk_id)
    )

conn.commit()
cur.close()
conn.close()


Embedding:   0%|          | 0/52554 [00:00<?, ?chunk/s]

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import psycopg2
from pgvector.psycopg2 import register_vector
from tqdm.auto import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer  = AutoTokenizer.from_pretrained(model_name)
model      = AutoModel.from_pretrained(model_name)

def get_embedding(text: str) -> np.ndarray:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        emb = model(**inputs).last_hidden_state.mean(dim=1).squeeze()
    return emb.cpu().numpy().astype(np.float32)  # ensure float32

In [3]:
conn = psycopg2.connect(
    dbname="cleaned_meta_data_db",
    user="rg5073",
    password="rg5073pass",
    host="129.114.27.112",
    port="5432",
)
register_vector(conn)
cur = conn.cursor()

In [4]:
cur.execute("SELECT paper_id, chunk_id, chunk_data FROM arxiv_chunks_eval_4;")
rows = cur.fetchall()   

In [5]:
print(rows[0])

('0704.2066v1', '0704.2066v1_1', 'arXiv 0704.2066v1 quant ph 17 Apr 2007 Lower bounds for communication capacities of two qudit unitary operations Dominic W. Berry Centre for Quantum Computer Technology, Macquarie University, Sydney, NSW 2109, Australia and Department of Physics, The University of Queensland, St. Lucia, Queensland 4072, Australia Dated December 8, 2013 We show that entangling capacities based on the Jamio lkowski isomorphism may be used to place lower bounds on the communication capacities of arbitrary bipartite unitaries. Therefore, for these definitions, the relations which have been previously shown for two qubit unitaries also hold for arbitrary dimensions. These results are closely related to the theory of the entanglement assisted capacity of channels. We also present more general methods for producing ensembles for communication from initial states for entanglement creation. PACS numbers 03.67.Mn, 03.67.Hk I. INTRODUCTION In quantum information processing, one o

In [10]:
count = 0

for paper_id, chunk_id, text in tqdm(rows, desc="Embedding", unit="chunk"):
    if count < 3:
        print("text:", text)
    # vec = get_embedding(text)
    vec = get_embedding(text).tolist()
    # print("\n\n vec len",len(vec))
    if count < 5:
        print("vec:", vec, paper_id, chunk_id)
    cur.execute(
        """
        UPDATE arxiv_chunks_eval_4
           SET chunk_embedding = %s
         WHERE paper_id = %s AND chunk_id = %s
        """,
        (vec, paper_id, chunk_id)
    )
    count+=1

conn.commit()
cur.close()
conn.close()

Embedding:   0%|          | 1/52554 [00:00<2:03:09,  7.11chunk/s]

text: arXiv 0704.2066v1 quant ph 17 Apr 2007 Lower bounds for communication capacities of two qudit unitary operations Dominic W. Berry Centre for Quantum Computer Technology, Macquarie University, Sydney, NSW 2109, Australia and Department of Physics, The University of Queensland, St. Lucia, Queensland 4072, Australia Dated December 8, 2013 We show that entangling capacities based on the Jamio lkowski isomorphism may be used to place lower bounds on the communication capacities of arbitrary bipartite unitaries. Therefore, for these definitions, the relations which have been previously shown for two qubit unitaries also hold for arbitrary dimensions. These results are closely related to the theory of the entanglement assisted capacity of channels. We also present more general methods for producing ensembles for communication from initial states for entanglement creation. PACS numbers 03.67.Mn, 03.67.Hk I. INTRODUCTION In quantum information processing, one of the most important tasks i

Embedding:   0%|          | 3/52554 [00:00<1:42:29,  8.55chunk/s]

text: equal to that of Alice s component of AB. The dimensions of HA1 U and HB1 U are the same as HAU and HBU for AB that is, they are the subsystems that U acts nontrivially upon . The operation U0 is taken to swap the states of the subsystems HAU and HBU,3, giving Aanc,B3 U AU ,B1 U ,B2 U . Because U only acts nontrivially on HA1 U HB1 U , the change in entanglement under U is EU o . As we may take o to be arbitrarily small, E , . All communication capacitites considered here are un altered if larger dimensions are permitted for HAU and HBU . Therefore, any relations that can be proven for E , U also hold for EU. The only other capacity which we consider in this work that may depend on the dimen sions of HAU and HBU is E U . However, in this case it does not appear that it simplifies to EU under an expanded dimension. 3 B. Holevo capacities Next we define a range of capacities for increasing the Holevo information. An ensemble is a set of states that are supplied with probabilities p

Embedding:   0%|          | 6/52554 [00:00<1:35:28,  9.17chunk/s]

vec: [-0.0694849044084549, -0.054925303906202316, 0.01672368496656418, 0.028574293479323387, -0.06741464138031006, 0.07770006358623505, 0.11882872879505157, -0.10363711416721344, 0.04807964712381363, -0.011328935623168945, -0.13002493977546692, -0.02242211438715458, 0.10346117615699768, -0.048338472843170166, 0.012813402339816093, 0.05051150172948837, 0.1025036871433258, 0.050209954380989075, -0.2833574116230011, 0.10337954759597778, 0.13074778020381927, -0.11601972579956055, 0.006583652459084988, 0.02164803259074688, 0.03919254243373871, -0.10555936396121979, 0.04154817759990692, -0.01581302471458912, -0.08584164828062057, -0.13929691910743713, -0.007594684138894081, 0.0655355155467987, 0.06775068491697311, -0.0607585683465004, 0.028758229687809944, -0.04669204354286194, 0.0321556031703949, 0.018333297222852707, 0.03257453814148903, 0.06970539689064026, 0.003948478028178215, 0.07702237367630005, -0.10328246653079987, 0.07317383587360382, -0.05932928994297981, 0.07622262090444565, 0.06

Embedding:   0%|          | 97/52554 [00:10<1:37:42,  8.95chunk/s]


KeyboardInterrupt: 

In [28]:
from transformers import AutoTokenizer, AutoModel
import torch
import psycopg2
from pgvector.psycopg2 import register_vector
from tqdm.auto import tqdm
import numpy as np

# — Model setup (unchanged) —
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model     = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def get_embedding(text: str) -> np.ndarray:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        emb = model(**inputs).last_hidden_state.mean(dim=1).squeeze()
    return emb.cpu().numpy().astype(np.float32)

# — DB setup —
conn = psycopg2.connect(
    dbname="cleaned_meta_data_db",
    user="rg5073",
    password="rg5073pass",
    host="129.114.27.112",
    port="5432",
)
print("Connected to:", conn.dsn)
register_vector(conn)
cur = conn.cursor()

# 1) Quick column‐type check
cur.execute("SELECT atttypid::regtype, attname FROM pg_attribute a JOIN pg_class c ON a.attrelid=c.oid WHERE c.relname='arxiv_chunks_eval_4' AND attname='chunk_embedding'")
# print("chunk_embedding type:", cur.fetchone())

# 2) Fetch 3 sample rows so we can test a single update
cur.execute("SELECT paper_id, chunk_id, chunk_data FROM arxiv_chunks_eval_4 LIMIT 50000")
samples = cur.fetchall()
# print("Sample rows:", [(p, c) for p,c,_ in samples])

# 3) Smoke‐test updating the very first row with a dummy vector
paper_id, chunk_id, text0 = samples[0]
dummy = [0.123456]*384
cur.execute("""
    UPDATE arxiv_chunks_eval_4
       SET chunk_embedding = %s
     WHERE paper_id = %s AND chunk_id = %s
    RETURNING chunk_embedding

""", (dummy, paper_id, chunk_id))
res = cur.fetchone()
# print("Returned first 2 dims after dummy‐update:", res)

conn.commit()

# 4) Now do your real loop
rows = samples  # replace with your full list or cursor.fetchall()
for paper_id, chunk_id, text in tqdm(rows, desc="Embedding", unit="chunk"):
    vec_list = get_embedding(text).tolist()
    cur.execute(
        """
        UPDATE arxiv_chunks_eval_4
           SET chunk_embedding = %s
         WHERE paper_id = %s AND chunk_id = %s
         RETURNING 1
        """,
        (vec_list, paper_id, chunk_id)
    )
    if cur.rowcount != 1:
        print("⚠️  No update for", paper_id, chunk_id)

conn.commit()
cur.close()
conn.close()


Connected to: user=rg5073 password=xxx dbname=cleaned_meta_data_db host=129.114.27.112 port=5432


Embedding: 100%|██████████| 50000/50000 [1:32:35<00:00,  9.00chunk/s]  


In [7]:
print("DSN:", conn.dsn)
cur = conn.cursor()
cur.execute("SELECT COUNT(*) FROM arxiv_chunks_eval_4;")
print("Total rows in arxiv_chunks_eval_4:", cur.fetchone()[0])


DSN: user=rg5073 password=xxx dbname=cleaned_meta_data_db host=129.114.27.112 port=5432
Total rows in arxiv_chunks_eval_4: 52554


In [32]:
with open("arxiv_chunks_eval_3.csv", "w", encoding="utf-8") as f:
    cur = conn.cursor()
    cur.copy_expert("""
        COPY arxiv_chunks_eval_3 TO STDOUT WITH CSV HEADER
    """, f)
    cur.close()
