In [None]:
#### pip installs ####
# Installs only the packages listed on the line below. torch, transformers and
# sentence-transformers are imported later in this notebook but are not installed
# here, so they are presumably preinstalled in the runtime — TODO confirm.
# NOTE(review): versions are unpinned, so a re-run may pull different releases.
%pip install langchain-experimental langchain-huggingface qdrant-client

In [None]:
#### Imports ####
import os
import uuid

import numpy as np
import torch

# Reading Files #
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document

# Embeddings #
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

# Database #
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams

In [None]:
#### Initializations ####
# Run every model on the GPU when one is available, otherwise on the CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using " + DEVICE)

# Text splitter
# Options: length chunking, character-break chunking, semantic chunking.
# Semantic chunking is used because it keeps related information together,
# which helps retrieval.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
text_splitter = SemanticChunker(embeddings=embeddings)

# Dense embedder
DENSE_MODEL_NAME = "intfloat/e5-base"
dense_model = SentenceTransformer(
    DENSE_MODEL_NAME,
    device=DEVICE,  # same device as the sparse model instead of a hard-coded "cpu"
)
DENSE_VECTOR_SIZE = dense_model.get_sentence_embedding_dimension()
print("Dense model " + DENSE_MODEL_NAME + " initialized")

# Sparse embedder
# Choices:
#    TF-IDF: Term frequency based
#    BM25: Probabilistic-based
#    SPLADE: Hybrid dense-sparse
SPARSE_MODEL_NAME = "naver/splade_v2_max"
# Tokenizers have no device; only the model is moved to DEVICE
sparse_tokenizer = AutoTokenizer.from_pretrained(SPARSE_MODEL_NAME)
sparse_model = AutoModel.from_pretrained(SPARSE_MODEL_NAME).to(DEVICE)
# sparse_embed pools last_hidden_state over the token axis, so its output has
# exactly hidden_size entries; the collection must be created with that size.
# NOTE(review): AutoModel loads the encoder without the masked-LM head.
# Vocabulary-sized SPLADE term weights would need AutoModelForMaskedLM logits,
# with sparse_embed and this size changed together — confirm intent.
SPARSE_VECTOR_SIZE = sparse_model.config.hidden_size
print("Sparse model " + SPARSE_MODEL_NAME + " initialized")

# Database
COLLECTION_NAME = "class_materials"

In [None]:
#### Reading Files ####
def read_file(filepath, verbose=False):
    """
    Reads the file at filepath and splits it into chunks.

    Args:
        filepath (String): path to the UTF-8 text file to be read
        verbose (bool): when True, print each chunk's text followed by '---'
            (handy for inspecting the splitter; off by default because it
            floods the notebook output)
    Returns:
        list of Document objects, each a chunk of the file read. Every chunk
        gets metadata["title"] = "<filepath>_<chunk index>". A file that is
        empty or contains only whitespace yields [].
    Raises:
        UnicodeDecodeError: if the file is not valid UTF-8
    """
    # The context manager closes the file; no explicit close() is needed
    with open(filepath, "r", encoding="utf-8") as file:
        content = file.read()

    # Nothing to split or embed
    if not content.strip():
        return []

    chunks = text_splitter.split_documents([Document(page_content=content)])

    # Title each chunk so it can be traced back to its file and position
    for i, chunk in enumerate(chunks):
        chunk.metadata["title"] = f"{filepath}_{i}"
        if verbose:
            print(chunk.page_content)
            print('---')

    return chunks

In [None]:
#### Embeddings ####
def sparse_embed(text):
    """
    Embeds text with the sparse model (sparse_model / sparse_tokenizer).

    The model's token-level activations are passed through ReLU, summed over
    the token axis and log-scaled: log(1 + sum_tokens(relu(h))).

    NOTE(review): the activations read here are last_hidden_state, so the
    result is hidden-size wide rather than vocabulary-wide — confirm whether
    masked-LM logits were intended.

    Args:
        text (String): string to be embedded
    Returns:
        1-D numpy vector with the pooled embedding of text
    """
    # Truncate to the tokenizer's configured maximum so long chunks do not
    # overflow the model's position limit
    inputs = sparse_tokenizer(text, return_tensors="pt", truncation=True)
    # The model may live on the GPU; its inputs must be on the same device
    inputs = inputs.to(sparse_model.device)

    with torch.no_grad():
        outputs = sparse_model(**inputs)

    pooled = torch.log(1 + torch.relu(outputs.last_hidden_state).sum(dim=1))
    # squeeze(0) drops only the batch axis; .cpu() is required before .numpy()
    # when the tensor lives on the GPU
    return pooled.squeeze(0).cpu().numpy()

def dense_embed(text, prefix="query"):
    """
    Embeds text with the dense E5 model (dense_model).

    E5 expects every input to carry a role prefix: "query: " for search
    queries and "passage: " for documents that are being indexed.

    Args:
        text (String): string to be embedded
        prefix (String): E5 role prefix without the colon; defaults to
            "query". Pass "passage" when embedding chunks for storage.
    Returns:
        normalized dense embedding of text
    """
    # Format required for E5
    formatted_text = f"{prefix}: {text}"
    return dense_model.encode(formatted_text, normalize_embeddings=True)

In [None]:
#### Vectorizing Chunks ####
def vectorize(text):
    """
    Embeds text with both the sparse and the dense embedder.

    The two embeddings are returned side by side rather than concatenated:
    the collection stores them under separate vector names, and callers read
    the result as result[0] (sparse) and result[1] (dense).

    Args:
        text (String): string to be embedded
    Returns:
        tuple (sparse_embedding, dense_embedding)
    """
    sparse_embedding = sparse_embed(text)
    dense_embedding = dense_embed(text)
    return sparse_embedding, dense_embedding

In [None]:
#### Database ####
def start_database(recreate, url=None, api_key=None):
    """
    Establishes a connection to Qdrant database and makes sure the collection exists

    Args:
        recreate (bool): if True, drop an existing collection and create it empty
        url (String): Qdrant endpoint; defaults to the QDRANT_URL environment
            variable, then to the cluster URL hard-coded below
        api_key (String): Qdrant API key; defaults to the QDRANT_API_KEY
            environment variable, then to the "<your-token>" placeholder
    Returns:
        QdrantClient object with connection to database
    """
    if url is None:
        url = os.environ.get(
            "QDRANT_URL",
            "https://07de6745-b3ea-4156-9daf-a4cbbb339b92.us-east4-0.gcp.cloud.qdrant.io:6333",
        )
    if api_key is None:
        # Keep real keys out of the notebook: set QDRANT_API_KEY instead
        api_key = os.environ.get("QDRANT_API_KEY", "<your-token>")

    client = QdrantClient(url=url, api_key=api_key)

    # Dropping first lets one create_collection call serve both the "recreate"
    # and the "missing" case (recreate_collection is deprecated in the client)
    if recreate and client.collection_exists(COLLECTION_NAME):
        client.delete_collection(COLLECTION_NAME)

    # Create collection if it doesn't exist
    if not client.collection_exists(COLLECTION_NAME):
        client.create_collection(
            collection_name=COLLECTION_NAME,
            vectors_config={
                "sparse_vector": VectorParams(size=SPARSE_VECTOR_SIZE, distance=Distance.DOT),
                "dense_vector": VectorParams(size=DENSE_VECTOR_SIZE, distance=Distance.COSINE),
            },
        )

    print(client.get_collections())

    return client

def upsert(client, chunks):
    """
    Upserts vectors of text chunks to database

    Expects vectorize(text) to return the (sparse, dense) embedding pair.

    Args:
        client (QdrantClient): client object with connection to database
        chunks ([Document]): array of Document objects to upsert; each needs
            metadata["title"]. An empty list is a no-op.
    """
    if not chunks:
        return

    points = []
    for chunk in chunks:
        # Vectorize each chunk of text
        sparse_vec, dense_vec = vectorize(chunk.page_content)
        title = chunk.metadata["title"]
        points.append(
            PointStruct(
                # Qdrant only accepts unsigned integers or UUIDs as point ids,
                # so derive a deterministic UUID from the title; re-ingesting
                # the same chunk then overwrites it instead of duplicating it
                id=str(uuid.uuid5(uuid.NAMESPACE_URL, title)),
                # Plain lists of floats serialize cleanly, numpy arrays may not
                vector={
                    "sparse_vector": np.asarray(sparse_vec).tolist(),
                    "dense_vector": np.asarray(dense_vec).tolist(),
                },
                payload={"title": title, "text": chunk.page_content},
            )
        )

    # Upsert dense and sparse vectors to database
    client.upsert(collection_name=COLLECTION_NAME, points=points)

In [None]:
#### Context Retrieval ####
def sparse_query(client, query_sparse, top_k):
    """
    Perform a similarity search on sparse embeddings

    Args:
        client (QdrantClient): client object with connection to database
        query_sparse: sparse embedding of query
        top_k: number of top results to return
    Returns:
        top_k results from similarity search on sparse embeddings; each hit
        carries id, score and payload
    """
    response = client.query_points(
        collection_name=COLLECTION_NAME,
        query=np.asarray(query_sparse).tolist(),
        # The collection only has named vectors, so the one to search must be
        # selected by the name it was created with
        using="sparse_vector",
        limit=top_k,
        with_payload=True,
    )
    return response.points

def dense_query(client, query_dense, top_k):
    """
    Perform a similarity search on dense embeddings

    Args:
        client (QdrantClient): client object with connection to database
        query_dense: dense embedding of query
        top_k: number of top results to return
    Returns:
        top_k results from similarity search on dense embeddings; each hit
        carries id, score and payload
    """
    response = client.query_points(
        collection_name=COLLECTION_NAME,
        query=np.asarray(query_dense).tolist(),
        # The collection only has named vectors, so the one to search must be
        # selected by the name it was created with
        using="dense_vector",
        limit=top_k,
        with_payload=True,
    )
    return response.points

def weighted_score(sparse_score, dense_score, sparse_weight=0.3, dense_weight=0.7):
    """
    Blends a sparse and a dense similarity score into one number.

    Args:
        sparse_score: similarity score of sparse embedding
        dense_score: similarity score of dense embedding
        sparse_weight: multiplier applied to sparse_score (default 0.3)
        dense_weight: multiplier applied to dense_score (default 0.7)
    Returns:
        sparse_weight * sparse_score + dense_weight * dense_score
    """
    sparse_part = sparse_weight * sparse_score
    dense_part = dense_weight * dense_score
    return sparse_part + dense_part

def combine_queries(sparse_results, dense_results):
    """
    Merges sparse and dense query hits into one weighted score per point.

    Args:
        sparse_results: hits of the sparse search; each has id, score and payload["text"]
        dense_results: hits of the dense search; same attributes
    Returns:
        dict mapping point id -> {"score": weighted score, "text": chunk text}.
        A point found by only one of the searches scores 0 for the other.
    """
    # Start from the dense hits; the sparse score stays 0 until a sparse hit
    # with the same id shows up
    merged = {
        hit.id: {"dense_score": hit.score, "sparse_score": 0, "text": hit.payload["text"]}
        for hit in dense_results
    }

    for hit in sparse_results:
        entry = merged.get(hit.id)
        if entry is None:
            merged[hit.id] = {"dense_score": 0, "sparse_score": hit.score, "text": hit.payload["text"]}
        else:
            entry["sparse_score"] = hit.score

    # Collapse the two scores of every point into the weighted one
    return {
        point_id: {
            "score": weighted_score(entry["sparse_score"], entry["dense_score"]),
            "text": entry["text"],
        }
        for point_id, entry in merged.items()
    }

def hybrid_query(client, query, top_k):
    """
    Perform a hybrid query (sparse and dense) on vector database to provide as context to llm

    Expects vectorize(query) to return the (sparse, dense) embedding pair.

    Args:
        client (QdrantClient): client object with connection to database
        query (String): input from user asked to LLM
        top_k (int): number of results requested from each search and the
            maximum number of texts returned
    Returns:
        list of at most top_k texts to serve as context for LLM, best
        combined score first
    """
    # Vectorize the query
    query_sparse, query_dense = vectorize(query)

    # Query database using both sparse and dense embeddings
    sparse_results = sparse_query(client, query_sparse, top_k)
    dense_results = dense_query(client, query_dense, top_k)

    # Combine and calculate weighted scores for all results
    scored_results = combine_queries(sparse_results, dense_results)

    # scored_results maps id -> {"score", "text"}; rank the entries, not the
    # ids, in descending order of combined score
    ranked = sorted(scored_results.values(), key=lambda entry: entry["score"], reverse=True)

    return [entry["text"] for entry in ranked[:top_k]]

In [None]:
#### Main Function ####
# Rebuild the collection from scratch, then ingest every file under /kaggle/input
client = start_database(recreate=True)
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        filepath = os.path.join(dirname, filename)
        print(filepath)
        try:
            chunks = read_file(filepath)
        except UnicodeDecodeError:
            # read_file decodes as UTF-8; skip files that are not UTF-8 text
            # instead of aborting the whole ingest
            print("Skipping (not UTF-8 text): " + filepath)
            continue
        # upsert needs the database client as its first argument
        upsert(client, chunks)