In [None]:
#### pip installs ####
%pip install langchain-experimental langchain-huggingface qdrant-client

In [48]:
#### Imports ####
import os
import uuid
import torch
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv

# Reading Files #
from langchain_core.documents import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Embeddings #
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel, pipeline

# Database #
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, SparseVector, VectorParams, SparseVectorParams, PointStruct

## Initializing Variables and Models

In [None]:
### Initializations ####
# Pick the compute device: CUDA if available, otherwise Apple MPS, otherwise CPU.
# DEVICE is read as a module-level global by the embedding functions defined later.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
print("Using " + DEVICE)

# Load environment variables from a .env file (start_database reads QDRANT_API_KEY via os.getenv)
load_dotenv()

# Text splitter to chunk texts
# Using semantic chunking for best separation of different information to help retrieval
# The MiniLM embeddings below are passed to the SemanticChunker to find chunk boundaries;
# the vectors stored in the database come from dense_model / sparse_model further down.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
text_splitter = SemanticChunker(
    embeddings=embeddings,
    # NOTE(review): if the splitter's threshold type is percentile-based, this amount is
    # presumably on a 0-100 scale, which would make 0.2 extremely aggressive -- confirm.
    breakpoint_threshold_amount=0.2, # Higher = fewer chunks
)

# Sparse embedder
# Choices:
#    TF-IDF: Term frequency based
#    BM25: Probabilistic-based
#    SPLADE: Hybrid dense-sparse
SPARSE_MODEL_NAME = "naver/splade_v2_distil"
# NOTE(review): tokenizers produce CPU tensors; the `device` kwarg here presumably has
# no effect (sparse_embed moves the encoded inputs to DEVICE itself) -- confirm and drop.
sparse_tokenizer = AutoTokenizer.from_pretrained(
    SPARSE_MODEL_NAME,
    device=DEVICE,
)
# NOTE(review): AutoModel exposes last_hidden_state, which is what sparse_embed reads.
# SPLADE term weights are normally taken from the masked-LM head logits -- verify that
# the hidden-state pooling used here is the intended sparse representation.
sparse_model = AutoModel.from_pretrained(SPARSE_MODEL_NAME).to(DEVICE)
# Upper bound on sparse indices: sparse_embed drops any index >= this value
SPARSE_VECTOR_SIZE = 50000  # Sparse embeddings can get very large
print("Sparse model " + SPARSE_MODEL_NAME + " initialized")

# Dense embedder
DENSE_MODEL_NAME = "intfloat/e5-base"
dense_model = SentenceTransformer(
    DENSE_MODEL_NAME,
    device=DEVICE,
)
# Dimension of the dense vectors; used as the size of the "dense_vector" field in Qdrant
DENSE_VECTOR_SIZE = dense_model.get_sentence_embedding_dimension()
print("Dense model " + DENSE_MODEL_NAME + " initialized")

# Database
# Name of the Qdrant collection that all cells below read from and write to
COLLECTION_NAME = "class_materials"

## Reading and Chunking Files

In [39]:
#### Reading Files ####
def read_file(filepath, text_splitter):
    '''
    Reads the file at filepath and splits it into chunks.

    Args:
        filepath (String): path to the UTF-8 text file to be read
        text_splitter: Text splitter to use; must expose split_documents(documents)
    Returns:
        list of Document objects, each a chunk of the file read. Every chunk gets
        "id" (fresh UUID string), "filepath", "chunk_index" and "title" metadata.
        The list is empty when the splitter produces no chunks (e.g. an empty file).
    '''
    # The context manager closes the file on exit; no explicit close() is needed
    with open(filepath, "r", encoding="utf-8") as file:
        content = file.read()

    chunks = text_splitter.split_documents([Document(page_content=content)])

    # Add metadata to each chunk
    for i, chunk in enumerate(chunks):
        # A UUID string is used because it later serves as the point id in the database
        chunk.metadata["id"] = str(uuid.uuid4())
        chunk.metadata["filepath"] = filepath
        chunk.metadata["chunk_index"] = i
        chunk.metadata["title"] = f"{filepath}_{i}"  # Keep original title for reference

    # default=0 keeps the summary from raising when there are no chunks at all
    longest_chunk = max((len(chunk.page_content) for chunk in chunks), default=0)
    print(f"Num Chunks: {len(chunks)}")
    print(f"Longest Chunk: {longest_chunk}")

    return chunks

## Embedding Texts + Vectorizing
- sparse_embed(text, sparse_tokenizer, sparse_model, sparse_vector_size, threshold, top_k)
- dense_embed(text, dense_model, prefix)
- vectorize(text, sparse_tokenizer, sparse_model, sparse_vector_size, dense_model, prefix)

In [40]:
#### Embeddings and Vectorizing ####
def sparse_embed(text, sparse_tokenizer, sparse_model, sparse_vector_size, threshold=0.1, top_k=500):
    '''
    Sparse embeddings using given model

    The text is tokenized without truncation and processed in windows of
    sparse_model.config.max_position_embeddings tokens. Each window is mean-pooled
    over its token positions and passed through ReLU; the windows are then merged
    with an element-wise maximum so that every index appears at most once in the
    result (a sparse vector must not contain duplicate indices).

    Args:
        text (String): string to be embedded
        sparse_tokenizer: Tokenizer for sparse embeddings
        sparse_model: Model for sparse embeddings; its output must expose last_hidden_state
        sparse_vector_size: Size of sparse vectors; indices >= this value are dropped
        threshold: Weights below this value are zeroed to enforce sparsity
        top_k: Maximum number of non-zero entries kept (the largest weights win)
    Returns:
        dict with "indices" (list of int, ascending) and "values" (list of float)
    '''
    # Run on whatever device the model lives on instead of relying on a global
    first_param = next(iter(sparse_model.parameters()), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    tokenized_text = sparse_tokenizer(
        text,
        truncation=False,
        return_tensors="pt",
    ).to(device)
    input_ids = tokenized_text["input_ids"].squeeze(0)

    # Split tokens into windows the model can accept
    window_size = sparse_model.config.max_position_embeddings

    merged = None
    with torch.no_grad():
        for start in range(0, len(input_ids), window_size):
            window = input_ids[start:start + window_size]
            outputs = sparse_model(input_ids=window.unsqueeze(0))

            # Mean pooling across token positions, then ReLU to keep weights non-negative
            pooled = torch.relu(outputs.last_hidden_state.mean(dim=1).squeeze(0))

            # Max-merge keeps, for every index, its strongest weight over all windows
            merged = pooled if merged is None else torch.maximum(merged, pooled)

    if merged is None:
        # No tokens at all: nothing to embed
        return {"indices": [], "values": []}

    # Apply the threshold to enforce sparsity
    merged = torch.where(merged < threshold, torch.zeros_like(merged), merged)

    # Non-zero positions that fall within the sparse_vector_size
    indices = merged.nonzero(as_tuple=True)[0]
    indices = indices[indices < sparse_vector_size]
    values = merged[indices]

    # Keep only the top-k most important dimensions for efficiency
    if len(indices) > top_k:
        keep = torch.topk(values, k=top_k).indices
        keep = keep.sort().values  # restore ascending index order
        indices = indices[keep]
        values = values[keep]

    # Move to CPU and convert to lists for Qdrant compatibility
    return {
        "indices": indices.cpu().tolist(),
        "values": values.cpu().tolist(),
    }

def dense_embed(text, dense_model, prefix="passage: "):
    '''
    Dense embeddings

    Args:
        text (String): string to be embedded
        dense_model: Model for dense embeddings; must expose encode(text, normalize_embeddings=...)
        prefix (String): text prepended to the input before encoding
    Returns:
        dense embedding of text as a torch tensor placed on DEVICE
    '''
    # E5 expects its input to start with a role prefix
    prefixed_input = prefix + text
    encoded = dense_model.encode(prefixed_input, normalize_embeddings=True)
    embedding = torch.tensor(encoded)
    return embedding.to(DEVICE)

def vectorize(text, sparse_tokenizer, sparse_model, sparse_vector_size, dense_model, prefix="passage: "):
    '''
    Generate both sparse and dense vectors for text

    Args:
        text (String): string to be embedded
        sparse_tokenizer: Tokenizer for sparse embeddings
        sparse_model: Model for sparse embeddings
        sparse_vector_size: Size of sparse vectors
        dense_model: Model for dense embeddings
        prefix (String): prefix forwarded to the dense embedder
    Returns:
        tuple of (sparse_embedding, dense_embedding)
    '''
    # The sparse embedding is computed first, then the dense one
    return (
        sparse_embed(text, sparse_tokenizer, sparse_model, sparse_vector_size),
        dense_embed(text, dense_model, prefix=prefix),
    )

## Database Setup
- start_database(recreate, sparse_vector_size, dense_vector_size, collection_name)
- upsert(client, chunks, sparse_tokenizer, sparse_model, sparse_vector_size, dense_model, collection_name)

In [41]:
#### Database ####
def start_database(recreate, sparse_vector_size, dense_vector_size, collection_name):
    '''
    Establishes a connection to Qdrant database

    The server URL is read from the QDRANT_URL environment variable (falling back to
    the project's default cluster) and the API key from QDRANT_API_KEY.

    Args:
        recreate (bool): Whether to recreate the collection if it exists
        sparse_vector_size (int): Size of sparse vectors (only reported; the sparse
            field is created without a fixed size)
        dense_vector_size (int): Size of dense vectors
        collection_name (str): Name of the collection to use
    Returns:
        QdrantClient object with connection to database, or None if the client
        could not be created
    '''
    try:
        client = QdrantClient(
            url=os.getenv(
                "QDRANT_URL",
                "https://07de6745-b3ea-4156-9daf-a4cbbb339b92.us-east4-0.gcp.cloud.qdrant.io:6333",
            ),
            api_key=os.getenv("QDRANT_API_KEY"),
        )
        print("Connected to Qdrant database")
    except Exception as e:
        print(e)
        return None

    print(f"Using sparse vector size: {sparse_vector_size}, dense vector size: {dense_vector_size}")

    collection_present = client.collection_exists(collection_name)

    # If recreate=True, drop the existing collection so it is rebuilt from scratch below
    # (explicit delete + create replaces the deprecated recreate_collection call)
    if recreate and collection_present:
        client.delete_collection(collection_name=collection_name)
        collection_present = False

    # Create collection if it doesn't exist; one named dense field and one named sparse field
    if not collection_present:
        client.create_collection(
            collection_name=collection_name,
            vectors_config={
                "dense_vector": VectorParams(size=dense_vector_size, distance=Distance.COSINE)
            },
            sparse_vectors_config={
                "sparse_vector": SparseVectorParams(),
            },
        )

    print("Existing collections:")
    print(client.get_collections())

    return client

def upsert(client, chunks, sparse_tokenizer, sparse_model, sparse_vector_size, dense_model, collection_name, batch_size=32):
    '''
    Upserts vectors of text chunks to database

    Chunks are vectorized one at a time and sent to the database in batches, so the
    number of network round trips is len(chunks) / batch_size instead of len(chunks).

    Args:
        client (QdrantClient): client object with connection to database
        chunks ([Document]): array of Document objects to upsert; each needs "id",
            "filepath", "chunk_index" and "title" metadata
        sparse_tokenizer: Tokenizer for sparse embeddings
        sparse_model: Model for sparse embeddings
        sparse_vector_size: Size of sparse vectors
        dense_model: Model for dense embeddings
        collection_name: Name of the collection to use
        batch_size (int): number of points sent per upsert request
    Raises:
        ValueError: if batch_size < 1 or a sparse vector is malformed
    '''
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    pending = []
    for chunk in tqdm(chunks, total=len(chunks), desc="Vectorize + Upsert"):
        sparse, dense = vectorize(chunk.page_content, sparse_tokenizer, sparse_model, sparse_vector_size, dense_model, prefix="passage: ")

        # Validate sparse vector
        if "indices" not in sparse or "values" not in sparse:
            raise ValueError("Sparse vector must contain 'indices' and 'values'")
        if len(sparse["indices"]) != len(sparse["values"]):
            raise ValueError("Sparse vector indices and values must have same length")

        pending.append(
            PointStruct(
                id=chunk.metadata["id"],
                vector={
                    # Explicit SparseVector, same form the sparse query uses
                    "sparse_vector": SparseVector(
                        indices=sparse["indices"],
                        values=sparse["values"],
                    ),
                    # Plain list of floats rather than a tensor/array
                    "dense_vector": dense.cpu().tolist(),
                },
                payload={
                    "text": chunk.page_content,
                    "filepath": chunk.metadata["filepath"],
                    "chunk_index": chunk.metadata["chunk_index"],
                    "title": chunk.metadata["title"],
                },
            )
        )

        if len(pending) >= batch_size:
            client.upsert(collection_name=collection_name, points=pending)
            pending = []

    # Flush the final, partially filled batch
    if pending:
        client.upsert(collection_name=collection_name, points=pending)

## Context Retrieval
- sparse_query(client, query_sparse, top_k, collection_name)
- dense_query(client, query_dense, top_k, collection_name)
- weighted_score(sparse_score, dense_score, sparse_weight, dense_weight)
- combine_queries(sparse_results, dense_results)
- hybrid_query(client, query, top_k, sparse_tokenizer, sparse_model, sparse_vector_size, dense_model, collection_name)

In [55]:
#### Context Retrieval ####
def normalize(results):
    '''
    Min-max normalize the scores of a query response to be between 0 and 1

    The scores are rewritten in place on results.points.

    Args:
        results: query response exposing a `points` list whose items have a `score`
    Returns:
        the same results object with normalized scores. An empty response is
        returned unchanged; when all scores are equal (including a single hit)
        every score becomes 1.0 instead of dividing by zero.
    '''
    points = results.points
    if not points:
        return results

    scores = [point.score for point in points]
    min_score = min(scores)
    score_range = max(scores) - min_score

    for point in points:
        if score_range > 0:
            point.score = (point.score - min_score) / score_range
        else:
            # Identical scores: every hit is equally the best match
            point.score = 1.0
    return results

def sparse_query(client, query_sparse, top_k, collection_name):
    '''
    Perform a similarity search on sparse embeddings

    Args:
        client (QdrantClient): client object with connection to database
        query_sparse: sparse embedding of query, a mapping with "indices" and "values"
        top_k: number of top results to return
        collection_name: Name of the collection to use
    Returns:
        top_k results from similarity search on sparse embeddings, passed through normalize
    '''
    sparse_vector = SparseVector(
        indices=query_sparse["indices"],
        values=query_sparse["values"],
    )
    # Search only the named sparse field of the collection
    raw_results = client.query_points(
        collection_name=collection_name,
        query=sparse_vector,
        using="sparse_vector",
        limit=top_k,
    )
    return normalize(raw_results)

def dense_query(client, query_dense, top_k, collection_name):
    '''
    Perform a similarity search on dense embeddings

    Args:
        client (QdrantClient): client object with connection to database
        query_dense: dense embedding of query
        top_k: number of top results to return
        collection_name: Name of the collection to use
    Returns:
        top_k results from similarity search on dense embeddings, passed through normalize
    '''
    # Search only the named dense field of the collection
    search_args = dict(
        collection_name=collection_name,
        query=query_dense,
        using="dense_vector",
        limit=top_k,
    )
    raw_results = client.query_points(**search_args)
    return normalize(raw_results)

def weighted_score(sparse_score, dense_score, sparse_weight=0.3, dense_weight=0.7):
    '''
    Calculates a weighted score for each query result

    Args:
        sparse_score: similarity score of sparse embedding
        dense_score: similarity score of dense embedding
        sparse_weight: multiplier applied to sparse_score
        dense_weight: multiplier applied to dense_score
    Returns:
        weighted score combining both sparse and dense scores
    '''
    sparse_part = sparse_weight * sparse_score
    dense_part = dense_weight * dense_score
    return sparse_part + dense_part

def combine_queries(sparse_results, dense_results, sparse_weight=0.3, dense_weight=0.7):
    '''
    Combines sparse and dense query results

    Points are matched by id. A point returned by only one of the two searches gets
    a score of 0 for the search that missed it.

    Args:
        sparse_results: results from sparse query (exposes `points` with id, score, payload)
        dense_results: results from dense query (same shape)
        sparse_weight: weight given to the sparse score (defaults to 0.3)
        dense_weight: weight given to the dense score (defaults to 0.7)
    Returns:
        list of {"score": combined score, "text": chunk text} dicts, dense hits first,
        followed by sparse-only hits, each group in the order it was returned
    '''
    # Gather scores for all results
    merged = {}
    for point in dense_results.points:
        merged[point.id] = {"dense_score": point.score, "sparse_score": 0, "text": point.payload["text"]}

    for point in sparse_results.points:
        entry = merged.get(point.id)
        if entry is None:
            merged[point.id] = {"dense_score": 0, "sparse_score": point.score, "text": point.payload["text"]}
        else:
            entry["sparse_score"] = point.score

    # Weighted results
    return [
        {
            "score": weighted_score(
                entry["sparse_score"],
                entry["dense_score"],
                sparse_weight=sparse_weight,
                dense_weight=dense_weight,
            ),
            "text": entry["text"],
        }
        for entry in merged.values()
    ]

def hybrid_query(client, query, top_k, sparse_tokenizer, sparse_model, sparse_vector_size, dense_model, collection_name):
    '''
    Perform a hybrid query (sparse and dense) on vector database to provide as context to llm

    Args:
        client (QdrantClient): client object with connection to database
        query (String): input from user asked to LLM
        top_k: number of top results to return
        sparse_tokenizer: Tokenizer for sparse embeddings
        sparse_model: Model for sparse embeddings
        sparse_vector_size: Size of sparse vectors
        dense_model: Model for dense embeddings
        collection_name: Name of the collection to use
    Returns:
        at most top_k combined results, highest combined score first
    '''
    # Embed the query with the "query: " prefix
    query_sparse, dense_tensor = vectorize(
        query, sparse_tokenizer, sparse_model, sparse_vector_size, dense_model, prefix="query: "
    )
    query_dense = dense_tensor.cpu().numpy()

    # Run both searches, each limited to top_k hits
    sparse_hits = sparse_query(client, query_sparse, top_k, collection_name)
    dense_hits = dense_query(client, query_dense, top_k, collection_name)

    # Merge the two result sets, rank by combined score and keep the best top_k
    merged = combine_queries(sparse_hits, dense_hits)
    return sorted(merged, key=lambda item: item["score"], reverse=True)[:top_k]

## Main Function
Read and upload all `.txt` files inside `./documents/`

In [None]:
#### Main Function ####
# Directory scanned for the text files to upload
DOCUMENTS_DIR = './documents'

client = start_database(recreate=True, sparse_vector_size=SPARSE_VECTOR_SIZE, dense_vector_size=DENSE_VECTOR_SIZE, collection_name=COLLECTION_NAME)
if client is None:
    # start_database returns None when the client could not be created; fail here
    # with a clear message instead of an AttributeError deep inside upsert
    raise RuntimeError("Could not connect to the Qdrant database; check QDRANT_API_KEY and the database URL")
print() # For spacing

# Only regular .txt files are uploaded; sorted so the upload order is reproducible
files = sorted(
    f for f in os.listdir(DOCUMENTS_DIR)
    if f.lower().endswith('.txt') and os.path.isfile(os.path.join(DOCUMENTS_DIR, f))
)

# Count from 1 so the progress line ends at N/N rather than (N-1)/N
for i, filename in enumerate(files, start=1):
    print(f"{i}/{len(files)} Uploading {filename} to database...")
    chunks = read_file(DOCUMENTS_DIR + '/' + filename, text_splitter)
    upsert(client, chunks, sparse_tokenizer, sparse_model, SPARSE_VECTOR_SIZE, dense_model, COLLECTION_NAME)

## Vector Search
Perform sparse, dense, and hybrid vector searches on database

In [None]:
#### Test Retrieval ####
query = "What is a bag of words?"

# Embed the query once for the standalone sparse and dense searches
query_sparse, query_dense = vectorize(query, sparse_tokenizer, sparse_model, SPARSE_VECTOR_SIZE, dense_model, prefix="query: ")
query_dense = query_dense.cpu().numpy()

# Query the DB three ways: sparse only, dense only, and hybrid
sparse_results = sparse_query(client, query_sparse, 5, COLLECTION_NAME)
dense_results = dense_query(client, query_dense, 5, COLLECTION_NAME)
hybrid_results = hybrid_query(client, query, 5, sparse_tokenizer, sparse_model, SPARSE_VECTOR_SIZE, dense_model, COLLECTION_NAME)

# Each section is (heading, ranked chunk texts); hybrid results are plain dicts,
# the other two are points whose text lives in the payload
sections = (
    ("Sparse results:", [point.payload['text'] for point in sparse_results.points]),
    ("Dense results:", [point.payload['text'] for point in dense_results.points]),
    ("Hybrid results", [entry['text'] for entry in hybrid_results]),
)

for heading, texts in sections:
    print(heading)
    for rank, text in enumerate(texts, start=1):
        print(f"Result rank {rank}:")
        print(text + '\n')