In [None]:
!pip install qdrant-client datasets InstructorEmbedding pandas numpy



In [None]:
from datasets import load_dataset

dataset = load_dataset(
    "Qdrant/arxiv-titles-instructorxl-embeddings", split="train", streaming=True
)

In [11]:
dataset_iterator = iter(dataset)
train_dataset = [next(dataset_iterator) for _ in range(60000)]
test_dataset = [next(dataset_iterator) for _ in range(1000)]

In [None]:
from qdrant_client import QdrantClient, models

client = QdrantClient(
    url="XXXXXXXXXXXXXXXXXXXXXXXXXX", 
    api_key="XXXXXXXXXXXXXXXXXXXXXXXX",
    timeout=120.0,
)

COLL_DENSE = "arxiv-titles-dense"

# Create once; skip if it already exists
if not client.collection_exists(COLL_DENSE):
    client.create_collection(
        collection_name=COLL_DENSE,
        vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE),
    )

In [16]:
from qdrant_client.models import PointStruct
import uuid, itertools

def batched(iterable, n=50):          # simple chunker
    it = iter(iterable)
    while (chunk := list(itertools.islice(it, n))):
        yield chunk

points = (
    PointStruct(
        id=str(uuid.uuid4()),
        vector=item["vector"],            # 768-element list
        payload={"title": item["title"], "paper_id": i}
    )
    for i, item in enumerate(train_dataset)
)

for batch in batched(points, 100):
    client.upsert(collection_name=COLL_DENSE, points=batch)


In [18]:
def search_by_title(title:str, k=10):
    # embed the query title locally (use the same Instructor-XL model)
    from sentence_transformers import SentenceTransformer
    ENCODER_PATH = "models/instructor-xl"
    model = SentenceTransformer(ENCODER_PATH, device="cpu")
    vec = model.encode([["Represent the Science title:", title]])[0]

    hits = client.search(COLL_DENSE, query_vector=vec, limit=k, with_payload=True)
    return [(h.payload["title"], h.score) for h in hits]

print(search_by_title("Quantum error correction in noisy qubits"))


  hits = client.search(COLL_DENSE, query_vector=vec, limit=k, with_payload=True)


[('Quantum Error Correction via Noise Guessing Decoding', 0.8623227), ('Error correction in ensemble registers for quantum repeaters and quantum\n  computers', 0.8551971), ('Quantum data processing and error correction', 0.8479152), ('Introduction to Quantum Error Correction', 0.84624475), ('Introduction to Quantum Error Correction', 0.84624475), ('Quantum Error Correction', 0.84499454), ('Quantum Error Correction', 0.84499454), ('Quantum Error Correction of a Qubit Loss in an Addressable Atomic System', 0.8444803), ('Quantum f-divergences and error correction', 0.8439431), ('Quantum f-divergences and error correction', 0.8439431)]


### 2.1 Generate or import user ratings
If you have real clicks/ratings, load them into a pandas DataFrame with columns user_id paper_id rating.

In [19]:
import numpy as np, pandas as pd
NUM_USERS = 2000
rng = np.random.default_rng(0)

rows = []
for u in range(NUM_USERS):
    n_items = rng.integers(20, 120)        # each user rates 20-120 papers
    items   = rng.choice(len(train_dataset), n_items, replace=False)
    ratings = rng.normal(0, 1, n_items)    # mean-0 std-1 (already normalized)
    rows.extend(zip([u]*n_items, items, ratings))

ratings_df = pd.DataFrame(rows, columns=["user_id","paper_id","rating"])

#### 2.2 Convert each user to a sparse vector

In [20]:
from collections import defaultdict
user_sparse = defaultdict(lambda: {"indices":[], "values":[]})

for r in ratings_df.itertuples():
    user_sparse[r.user_id]["indices"].append(int(r.paper_id))
    user_sparse[r.user_id]["values"].append(float(r.rating))


#### 2.3 Store sparse vectors in Qdrant

In [22]:
from qdrant_client.models import SparseVectorParams, SparseVector, PointStruct

COLL_SPARSE = "user-paper-sparse"
if not client.collection_exists(COLL_SPARSE):
    client.create_collection(
        collection_name=COLL_SPARSE,
        vectors_config={},        # no dense vectors here
        sparse_vectors_config={"ratings": SparseVectorParams()}   # default index in-RAM
    )

def gen_points():
    for uid, vec in user_sparse.items():
        yield PointStruct(
            id=uid,
            vector={"ratings": SparseVector(indices=vec["indices"], values=vec["values"])},
            payload={"user_id": uid, "rated": vec["indices"]}
        )

client.upload_points(COLL_SPARSE, points=gen_points())


#### 2.4 Collaborative-filter query

In [23]:
def recommend_cf(my_ratings:dict, k_users=20):
    idxs, vals = zip(*my_ratings.items())
    query_vec  = SparseVector(indices=list(idxs), values=list(vals))

    users = client.query_points(
        collection_name=COLL_SPARSE,
        query=query_vec,
        using="ratings",
        limit=k_users
    ).points

    # gather scores for unseen items
    from collections import Counter
    scores = Counter()
    for u in users:
        for pid in u.payload["rated"]:
            if pid not in my_ratings:
                scores[pid] += u.score
    return scores.most_common(10)


#### 1. Content-Based Search (Dense Vectors)
This finds papers similar to a given paper or query text using the title embedding.

In [24]:
from InstructorEmbedding import INSTRUCTOR
from sentence_transformers import SentenceTransformer

def search_by_title(title, k=10):
    from sentence_transformers import SentenceTransformer
    ENCODER_PATH = "models/instructor-xl"
    model = SentenceTransformer(ENCODER_PATH, device="cpu")
    vec = model.encode([["Represent the Science title:", title]])[0]  # Get dense embedding
    hits = client.search(
        collection_name=COLL_DENSE,
        query_vector=vec,
        limit=k,
        with_payload=True
    )
    return [(hit.payload["title"], hit.score) for hit in hits]

results = search_by_title("Neural networks for image classification")
print(results)


  hits = client.search(


[('NIST: An Image Classification Network to Image Semantic Retrieval', 0.8323139), ('NIST: An Image Classification Network to Image Semantic Retrieval', 0.8323139), ('Combined convolutional and recurrent neural networks for hierarchical\n  classification of images', 0.8287244), ('Design of Kernels in Convolutional Neural Networks for Image\n  Classification', 0.82021785), ('Design of Kernels in Convolutional Neural Networks for Image\n  Classification', 0.82021785), ('Pollen Grain Microscopic Image Classification Using an Ensemble of\n  Fine-Tuned Deep Convolutional Neural Networks', 0.81991637), ('Pollen Grain Microscopic Image Classification Using an Ensemble of\n  Fine-Tuned Deep Convolutional Neural Networks', 0.81991637), ('RAIN: A Simple Approach for Robust and Accurate Image Classification\n  Networks', 0.8181366), ('RAIN: A Simple Approach for Robust and Accurate Image Classification\n  Networks', 0.8181366), ('Compressive spectral image classification using 3D coded convolutio

#### 2. Collaborative Filtering Search (Sparse Vectors)

In [25]:
from qdrant_client.models import SparseVector

def recommend_cf(my_ratings, k_users=20):
    indices, values = zip(*my_ratings.items())
    query_vec = SparseVector(indices=list(indices), values=list(values))
    users = client.query_points(
        collection_name=COLL_SPARSE,
        query=query_vec,
        using="ratings",
        limit=k_users
    ).points

    from collections import Counter
    scores = Counter()
    for user in users:
        for pid in user.payload["rated"]:
            if pid not in my_ratings:
                scores[pid] += user.score
    return scores.most_common(10)

# Example: rate papers 123 (like) and 456 (dislike)
my_ratings = {123: 1.0, 456: -1.0}
recommended_papers = recommend_cf(my_ratings)
print(recommended_papers)


[(7033, -0.28070956), (24221, -0.28070956), (25612, -0.28070956), (41940, -0.28070956), (4969, -0.28070956), (32550, -0.28070956), (5603, -0.28070956), (1598, -0.28070956), (33134, -0.28070956), (52587, -0.28070956)]


#### Step 1: Generate and Normalize Synthetic Ratings

In [26]:
import numpy as np
import pandas as pd

NUM_USERS = 100
NUM_PAPERS = len(train_dataset)
rng = np.random.default_rng(0)

rows = []
for u in range(NUM_USERS):
    n_items = rng.integers(20, 80)
    items = rng.choice(NUM_PAPERS, n_items, replace=False)
    ratings = rng.normal(0, 1, n_items)  # mean 0, std 1
    rows.extend(zip([u]*n_items, items, ratings))

ratings_df = pd.DataFrame(rows, columns=["user_id", "paper_id", "rating"])
# (No need to further normalize, you already have mean 0, std 1 random ratings!)


#### Step 2: Build User Sparse Vectors

In [27]:
from collections import defaultdict

user_sparse = defaultdict(lambda: {"indices": [], "values": []})
for row in ratings_df.itertuples():
    user_sparse[row.user_id]["indices"].append(int(row.paper_id))
    user_sparse[row.user_id]["values"].append(float(row.rating))


#### Step 3: Create Qdrant Sparse Collection and Upload

In [28]:
from qdrant_client.models import SparseVectorParams, SparseVector, PointStruct

COLL_SPARSE = "user-paper-sparse"
if not client.collection_exists(COLL_SPARSE):
    client.create_collection(
        collection_name=COLL_SPARSE,
        vectors_config={},  # no dense vectors
        sparse_vectors_config={"ratings": SparseVectorParams()}
    )

def gen_points():
    for uid, vec in user_sparse.items():
        yield PointStruct(
            id=uid,
            vector={"ratings": SparseVector(indices=vec["indices"], values=vec["values"])},
            payload={"user_id": uid, "rated": vec["indices"]}
        )

client.upload_points(COLL_SPARSE, points=gen_points())


#### Step 4: Implement the Collaborative Filtering Query Function
Goal: For a demo, pick a realistic user and use their ratings as the query so you’re guaranteed overlap!

In [29]:
from qdrant_client.models import SparseVector

def recommend_cf(my_ratings, k_users=10, top_n=10):
    indices, values = zip(*my_ratings.items())
    query_vec = SparseVector(indices=list(indices), values=list(values))
    users = client.query_points(
        collection_name=COLL_SPARSE,
        query=query_vec,
        using="ratings",
        limit=k_users
    ).points

    # Aggregate unseen paper recommendations
    from collections import Counter
    scores = Counter()
    for user in users:
        for pid in user.payload["rated"]:
            if pid not in my_ratings:
                scores[pid] += user.score
    return scores.most_common(top_n)


#### Step 5: Demo—Pick Example Ratings from Existing Data
Goal: Pick an actual user and use their ratings for guaranteed overlap.

In [30]:
example_user = ratings_df['user_id'].iloc[0]
example_ratings = dict(zip(
    ratings_df[ratings_df['user_id'] == example_user]['paper_id'],
    ratings_df[ratings_df['user_id'] == example_user]['rating']
))

# For clarity, you may want to only use a subset (e.g., 5 ratings) in your query:
my_ratings_demo = dict(list(example_ratings.items())[:5])
print("Demo query ratings:", my_ratings_demo)

recommendations = recommend_cf(my_ratings_demo)
print("Paper IDs and scores:", recommendations)


Demo query ratings: {43728: 0.6836861907765345, 41122: 1.0039615758421696, 40220: -0.6179070447076008, 1698: 1.8220113633283233, 51404: -1.3204309700132935}
Paper IDs and scores: [(34731, 6.9204383), (38173, 6.9204383), (10504, 6.9204383), (990, 6.9204383), (51757, 6.9204383), (50726, 6.9204383), (48285, 6.9204383), (28847, 6.9204383), (5353, 6.9204383), (45860, 6.9204383)]
