In [10]:
#### pip installs ####
%pip install langchain-experimental langchain-huggingface qdrant-client

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [11]:
#### Imports ####
import os
import uuid
import torch
from tqdm import tqdm
from dotenv import load_dotenv

# Reading Files #
from langchain_core.documents import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Embeddings #
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

# Database #
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, SparseVector, VectorParams, SparseVectorParams, Prefetch, FusionQuery

## Initializing Variables and Models

In [12]:
### Initializations ####
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
print("Using " + DEVICE)

load_dotenv()

# Text splitter
# Use a chunker to chunk text
# Different options here: length chunking, character break chunking, semantic chunking
# Using semantic chunking for best separation of different information to help retrieval
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
text_splitter = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_amount=0.3,
)

# Dense embedder
DENSE_MODEL_NAME = "intfloat/e5-base"
dense_model = SentenceTransformer(
    DENSE_MODEL_NAME,
    device=DEVICE,
)
DENSE_VECTOR_SIZE = dense_model.get_sentence_embedding_dimension()
print("Dense model " + DENSE_MODEL_NAME + " initialized")

# Sparse embedder
SPARSE_MODEL_NAME = "naver/splade_v2_max"
sparse_tokenizer = AutoTokenizer.from_pretrained(
    SPARSE_MODEL_NAME,
    device=DEVICE,
)
sparse_model = AutoModel.from_pretrained(SPARSE_MODEL_NAME).to(DEVICE)
SPARSE_VECTOR_SIZE = 50000  # Sparse embeddings can get very large
print("Sparse model " + SPARSE_MODEL_NAME + " initialized")
# Choices:
#    TF-IDF: Term frequency based
#    BM25: Probabilistic-based
#    SPLADE: Hybrid dense-sparse

# Database
COLLECTION_NAME = "class_materials"
SPARSE_COLLECTION_NAME = "class_materials_sparse"
DENSE_COLLECTION_NAME = "class_materials_dense"

Using mps
Dense model intfloat/e5-base initialized
Sparse model naver/splade_v2_max initialized


## Reading and Chunking FIles

In [13]:
#### Reading Files ####
'''
Reads the file at filepath and splits it into chunks.

Args:
    filepath (String): path to file to be read
    text_splitter: Text splitter to use
Returns:
    array of Document objects, each a chunk of file read
'''
def read_file(filepath, text_splitter):
    with open(filepath, "r", encoding="utf-8") as file:
        content = file.read()

    docs = [Document(page_content=content)]
    chunks = text_splitter.split_documents(docs)

    # Add metadata to each chunk
    for i, chunk in enumerate(chunks):
        # Generate a UUID for the chunk ID
        chunk_id = str(uuid.uuid4())
        # Store both the UUID and original path in metadata
        chunk.metadata["id"] = chunk_id
        chunk.metadata["filepath"] = filepath
        chunk.metadata["chunk_index"] = i
        chunk.metadata["title"] = f"{filepath}_{i}"  # Keep original title for reference
        
    file.close()

    print(f"Num Chunks: {len(chunks)}")
    print(f"Longest Chunk: {len(max(chunks, key=lambda x: len(x.page_content)).page_content)}")

    return chunks

## Embedding Texts + Vectorizing
- sparse_embed(text, sparse_tokenizer, sparse_model, sparse_vector_size, threshold)
- dense_embed(text, dense_model)
- vectorize(text, sparse_tokenizer, sparse_model, sparse_vector_size, dense_model)

In [18]:
#### Embeddings ####
'''
Sparse embeddings using given model

Args:
    text (String): string to be embedded
    sparse_tokenizer: Tokenizer for sparse embeddings
    sparse_model: Model for sparse embeddings
    sparse_vector_size: Size of sparse vectors
    threshold: Threshold for sparse embedding
Returns:
    sparse embedding of text
'''
def sparse_embed(text, sparse_tokenizer, sparse_model, sparse_vector_size, threshold=0.1):
    inputs = sparse_tokenizer(text, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        outputs = sparse_model(**inputs)

    # Directly apply ReLU to hidden states, then threshold
    sparse_embedding = torch.relu(outputs.last_hidden_state)

    # Apply a threshold to create sparsity (zero out small values)
    #sparse_embedding[sparse_embedding < threshold] = 0

    # Extract non-zero indices and values
    non_zero_indices = sparse_embedding.nonzero(as_tuple=True)[1]
    non_zero_values = sparse_embedding[0, non_zero_indices]

    # Ensure valid indices within bounds
    mask = non_zero_indices < sparse_vector_size
    valid_indices = non_zero_indices[mask].cpu().numpy().tolist()
    valid_values = non_zero_values[mask].cpu().numpy().tolist()

    return {
        "indices": valid_indices,
        "values": valid_values
    }

'''
Dense embeddings

Args:
    text (String): string to be embedded
    dense_model: Model for dense embeddings
Returns:
    dense embedding of text
'''
def dense_embed(text, dense_model):
    # Format required for E5
    formatted_text = f"query: {text}"
    return torch.tensor(dense_model.encode(formatted_text, normalize_embeddings=True)).to(DEVICE)

'''
Generate both sparse and dense vectors for text

Args:
    text (String): string to be embedded
    sparse_tokenizer: Tokenizer for sparse embeddings
    sparse_model: Model for sparse embeddings
    sparse_vector_size: Size of sparse vectors
    dense_model: Model for dense embeddings
Returns:
    tuple of (sparse_embedding, dense_embedding)
'''
def vectorize(text, sparse_tokenizer, sparse_model, sparse_vector_size, dense_model):
    # Get sparse embedding with vocabulary-sized vector
    sparse_embedding = sparse_embed(text, sparse_tokenizer, sparse_model, sparse_vector_size)
    
    # Get dense embedding
    dense_embedding = dense_embed(text, dense_model)

    return (sparse_embedding, dense_embedding)

## Database Setup
- start_database(recreate, sparse_vector_size, dense_vector_size, collection_name)
- upsert(client, chunks, sparse_tokenizer, sparse_model, sparse_vector_size, dense_model, collection_name)

In [19]:
#### Database ####
'''
Establishes a connection to Qdrant database

Args:
    recreate (bool): Whether to recreate the collection if it exists
    sparse_vector_size (int): Size of sparse vectors (should match vocab size of tokenizer)
    dense_vector_size (int): Size of dense vectors
    collection_name (str): Name of the collection to use
Returns:
    QdrantClient object with connection to database
'''
def start_database(recreate, sparse_vector_size, dense_vector_size, collection_name):
    try:
        client = QdrantClient(
            url="https://07de6745-b3ea-4156-9daf-a4cbbb339b92.us-east4-0.gcp.cloud.qdrant.io:6333", 
            api_key=os.getenv("QDRANT_API_KEY"),
        )
        print("Connected to Qdrant database")
    except Exception as e:
        print(e)
        return None

    print(f"Using sparse vector size: {sparse_vector_size}, dense vector size: {dense_vector_size}")

    # If recreate=True, recreate the collection
    if recreate:
        client.recreate_collection(
            collection_name=collection_name,
            vectors_config={
                "dense_vector": VectorParams(size=dense_vector_size, distance=Distance.COSINE)
            },
            sparse_vectors_config={
                "sparse_vector": SparseVectorParams(),
            },
        )

    # Create collection if it doesn't exist
    if not client.collection_exists(collection_name):
        client.create_collection(
            collection_name=collection_name,
            vectors_config={
                "dense_vector": VectorParams(size=dense_vector_size, distance=Distance.COSINE)
            },
            sparse_vectors_config={
                "sparse_vector": SparseVectorParams(),
            },
        )
        
    print("Existing collections:")
    print(client.get_collections())

    return client

'''
Upserts vectors of text chunks to database

Args:
    client (QdrantClient): client object with connection to database
    chunks ([Document]): array of Document objects to upsert
    sparse_tokenizer: Tokenizer for sparse embeddings
    sparse_model: Model for sparse embeddings
    sparse_vector_size: Size of sparse vectors
    dense_model: Model for dense embeddings
    collection_name: Name of the collection to use
'''
def upsert(client, chunks, sparse_tokenizer, sparse_model, sparse_vector_size, dense_model, collection_name):
    # Vectorize each chunk of text
    vectors = []
    for i, chunk in tqdm(enumerate(chunks), total=len(chunks), desc="Vectorizing Chunks"):
        vectors.append(vectorize(chunk.page_content, sparse_tokenizer, sparse_model, sparse_vector_size, dense_model))

    points = []
    for i, chunk in enumerate(chunks):
        sparse, dense = vectors[i]

        # Validate sparse vector
        assert "indices" in sparse and "values" in sparse, "Sparse vector must contain 'indices' and 'values'"
        assert len(sparse["indices"]) == len(sparse["values"]), "Sparse vector indices and values must have same length"

        points.append({
            "id": chunk.metadata["id"],
            "vector": {"sparse_vector": sparse, "dense_vector": dense.cpu().numpy()},
            "payload": {
                "text": chunk.page_content,
                "filepath": chunk.metadata["filepath"],
                "chunk_index": chunk.metadata["chunk_index"],
                "title": chunk.metadata["title"]
            }
        })

    # Upsert dense and sparse vectors to database
    client.upsert(
        collection_name=collection_name,
        points=points,
        wait=True
    )

## Context Retrieval
- sparse_query(client, query_sparse, top_k, collection_name)
- dense_query(client, query_dense, top_k, collection_name)
- weighted_score(sparse_score, dense_score, sparse_weight, dense_weight)
- combine_queries(sparse_results, dense_results)
- hybrid_query(query, top_k, sparse_tokenizer, sparse_model, sparse_vector, dense_model, collection_name)

In [20]:
#### Context Retrieval ####
'''
Perform a similarity search on sparse embeddings

Args:
    client (QdrantClient): client object with connection to database
    query_sparse: sparse embedding of query
    top_k: number of top results to return
    collection_name: Name of the collection to use
Returns:
    top_k results from similarity search on sparse embeddings
'''
def sparse_query(client, query_sparse, top_k, collection_name):
    results = client.query_points(
        collection_name=collection_name,
        query=SparseVector(query_sparse),
        using="sparse_vector",
        limit=top_k,
        with_payload="True",
    )
    print(results)
    return results

'''
Perform a similarity search on dense embeddings

Args:
    client (QdrantClient): client object with connection to database
    query_dense: dense embedding of query
    top_k: number of top results to return
    collection_name: Name of the collection to use
Returns:
    top_k results from similarity search on dense embeddings
'''
def dense_query(client, query_dense, top_k, collection_name):
    results = client.search(
        collection_name=collection_name,
        query_vector=query_dense,
        vector_name="dense_vector",
        limit=top_k,
        with_payload="True",
    )
    return results

'''
Calculates a weighted score for each query result

Args:
    sparse_score: similarity score of sparse embedding
    dense_score: similarity score of dense embedding
Returns:
    weighted score combining both sparse and dense scores
'''
def weighted_score(sparse_score, dense_score, sparse_weight=0.3, dense_weight=0.7):
    return (sparse_weight * sparse_score) + (dense_weight * dense_score)

'''
Combines sparse and dense query results

Args:
    sparse_results: results from sparse query
    dense_results: results from dense query
Returns:
    combined results from sparse and dense queries
'''
def combine_queries(sparse_results, dense_results):
    # Gather scores for all results
    all_results = {}
    for dense in dense_results:
        all_results[dense.id] = {"dense_score": dense.score, "sparse_score": 0, "text": dense.payload["text"]}

    for sparse in sparse_results:
        if sparse.id not in all_results:
            all_results[sparse.id] = {"dense_score": 0, "sparse_score": sparse.score, "text": sparse.payload["text"]}
        else:
            all_results[sparse.id]["sparse_score"] = sparse.score

    # Weighted results
    scored_results = {}
    for result in all_results:
        sparse_score = all_results[result]["sparse_score"]
        dense_score = all_results[result]["dense_score"]
        scored_results[result] = {
            "score": weighted_score(sparse_score, dense_score, sparse_weight=0.3, dense_weight=0.7),
            "text": all_results[result]["text"],
        }
        
    return scored_results

'''
Perform a hybrid query (sparse and dense) on vector database to provide as context to llm

Args:
    client (QdrantClient): client object with connection to database
    query (String): input from user asked to LLM
    top_k: number of top results to return
    sparse_tokenizer: Tokenizer for sparse embeddings
    sparse_model: Model for sparse embeddings
    sparse_vector_size: Size of sparse vectors
    dense_model: Model for dense embeddings
    collection_name: Name of the collection to use
Returns:
    list of text to serve as context for LLM
'''
def hybrid_query(client, query, top_k, sparse_tokenizer, sparse_model, sparse_vector_size, dense_model, collection_name):
    # Vectorize the query
    query_vector = vectorize(query, sparse_tokenizer, sparse_model, sparse_vector_size, dense_model)
    query_sparse = query_vector[0].cpu().numpy()
    query_dense = query_vector[1].cpu().numpy()
    
    # Query database using both sparse and dense embeddings
    sparse_results = sparse_query(client, query_sparse, top_k, collection_name)
    dense_results = dense_query(client, query_dense, top_k, collection_name)

    # Combine and calculate weighted scores for all results
    scored_results = combine_queries(sparse_results, dense_results)

    # Sort in descending order by combined score
    sorted_results = sorted(scored_results, key=lambda x: x["score"], reverse=True)
    
    # Return results
    text_results = []
    for i, result in enumerate(sorted_results):
        if i == top_k:
            break
        text_results.append(sorted_results[result]["text"])
        
    return text_results

In [None]:
#### Main Function ####
client = start_database(recreate=True, sparse_vector_size=SPARSE_VECTOR_SIZE, dense_vector_size=DENSE_VECTOR_SIZE, collection_name=COLLECTION_NAME)
chunks = read_file('./documents/cse156Lecture1Slides.txt', text_splitter)
upsert(client, chunks, sparse_tokenizer, sparse_model, SPARSE_VECTOR_SIZE, dense_model, COLLECTION_NAME)

Connected to Qdrant database
Using sparse vector size: 50000, dense vector size: 768


  client.recreate_collection(


Existing collections:
collections=[CollectionDescription(name='class_materials')]


In [None]:
#### Test Retrieval ####
query = "What is the main topic of the lecture?"
sparse_embed, dense_embed = vectorize(query, sparse_tokenizer, sparse_model, dense_model, SPARSE_VECTOR_SIZE)
print(sparse_embed)
sparse_results = sparse_query(client, sparse_embed, 5, COLLECTION_NAME)
dense_results = dense_query(client, dense_embed, 5, COLLECTION_NAME)
hybrid_results = hybrid_query(client, query, 5, sparse_tokenizer, sparse_model, dense_model, SPARSE_VECTOR_SIZE, COLLECTION_NAME)

