# Generate embeddings from text files in the given directory and store in Qdrant

This program indexes RFC text files in a local directory into a locally running Qdrant Vector DB.

I downloaded RFCs from: https://www.rfc-editor.org/retrieve/bulk/

I placed them in my directory `~/data/RFCs_8501_latest`. Removed the `*.pdf` and kept only `*.txt`.

Start [Qdrant](https://github.com/qdrant/qdrant) in your laptop:

In [1]:
%pip install qdrant-client sentence_transformers

Note: you may need to restart the kernel to use updated packages.


In [12]:
import os
import glob
from typing import List, Dict, Any
import qdrant_client
from qdrant_client.http import models
from sentence_transformers import SentenceTransformer

def load_text_files(directory_path: str) -> Dict[str, str]:
    """Load all text files from a directory."""
    text_files = {}
    for file_path in glob.glob(os.path.join(directory_path, "*.txt")):
        file_name = os.path.basename(file_path)
        with open(file_path, "r", encoding="utf-8") as file:
            text_files[file_name] = file.read()
    return text_files

def generate_embeddings(texts: List[str], model_name: str = "all-MiniLM-L6-v2") -> List[List[float]]:
    """Generate embeddings for a list of texts using a sentence transformer model."""
    model = SentenceTransformer(model_name)
    embeddings = model.encode(texts)
    return embeddings.tolist()

def store_in_qdrant(
    client: qdrant_client.QdrantClient,
    collection_name: str,
    texts: Dict[str, str],
    embeddings: List[List[float]],
    vector_size: int
) -> None:
    """Store texts and their embeddings in Qdrant."""
    # Create collection if it doesn't exist
    collections = client.get_collections().collections
    collection_exists = any(collection.name == collection_name for collection in collections)
    
    if not collection_exists:
        client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(
                size=vector_size,
                distance=models.Distance.COSINE
            )
        )
    
    # Prepare points to upsert
    points = []
    for i, (file_name, text) in enumerate(texts.items()):
        points.append(
            models.PointStruct(
                id=i,
                vector=embeddings[i],
                payload={"file_name": file_name, "text": text}
            )
        )
    
    # Upsert points to the collection
    client.upsert(
        collection_name=collection_name,
        points=points
    )

def main():
    # Configuration
    directory_path = os.environ.get('HOME') + "/data/RFCs_8501_latest"
    collection_name = "text_embeddings"
    model_name = "all-MiniLM-L6-v2"  # You can use other models like "all-mpnet-base-v2" for better quality
    
    # Connect to Qdrant (local or cloud)
    client = qdrant_client.QdrantClient(
        url="http://localhost:6333",  # Change this if using Qdrant Cloud
        # api_key="your-api-key"  # Uncomment and add your API key if using Qdrant Cloud
    )
    
    # Load text files
    text_files = load_text_files(directory_path)
    
    if not text_files:
        print("No text files found in the specified directory.")
        return
    
    # Generate embeddings
    file_names = list(text_files.keys())
    texts = list(text_files.values())
    embeddings = generate_embeddings(texts, model_name)
    
    # Get vector size from the generated embeddings
    vector_size = len(embeddings[0])
    
    # Store in Qdrant
    store_in_qdrant(client, collection_name, text_files, embeddings, vector_size)
    
    print(f"Successfully stored {len(text_files)} text files as embeddings in Qdrant collection '{collection_name}'.")


main()

Successfully stored 389 text files as embeddings in Qdrant collection 'text_embeddings'.


Next, search the Vector DB to most matched files for the query:

In [3]:
import qdrant_client
from qdrant_client.models import QueryRequest
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any

def search_documents(
    query: str,
    client: qdrant_client.QdrantClient,
    collection_name: str,
    model_name: str = "all-MiniLM-L6-v2",
    limit: int = 5
) -> List[Dict[str, Any]]:
    """
    Search documents in Qdrant collection using semantic similarity.
    
    Args:
        query: The search query text
        client: Qdrant client instance
        collection_name: Name of the collection to search
        model_name: Name of the embedding model (should match the one used for indexing)
        limit: Maximum number of results to return
        
    Returns:
        List of matching documents with their metadata and similarity scores
    """
    # Generate embedding for the query
    model = SentenceTransformer(model_name)
    query_embedding = model.encode(query).tolist()
    
    # Search in Qdrant
    search_results = client.search( # deprecated, alternate use to be figured out!
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=limit
    )

    # search_results = client.query_points(
    #     collection_name=collection_name,
    #     query=query_embedding,
    #     limit=limit
    # )
    
    # Format results
    results = []
    for result in search_results:
        results.append({
            "file_name": result.payload.get("file_name"),
            "similarity_score": result.score,
            "text_preview": result.payload.get("text", "")[:200] + "..." if len(result.payload.get("text", "")) > 200 else result.payload.get("text", "")
        })
    
    return results

def main():
    # Configuration
    collection_name = "text_embeddings"
    model_name = "all-MiniLM-L6-v2"  # Should match the model used for indexing
    
    # Connect to Qdrant
    client = qdrant_client.QdrantClient(
        url="http://localhost:6333",  # Change this if using Qdrant Cloud
        # api_key="your-api-key"  # Uncomment and add your API key if using Qdrant Cloud
    )
    
    # User input for search query
    # query = input("Enter your search query: ")
    query = "imap"
    
    # Search documents
    results = search_documents(
        query=query,
        client=client,
        collection_name=collection_name,
        model_name=model_name
    )
    
    # Display results
    if results:
        print(f"\nFound {len(results)} relevant documents:")
        for i, result in enumerate(results, 1):
            print(f"\n--- Result {i} ---")
            print(f"File: {result['file_name']}")
            print(f"Similarity score: {result['similarity_score']:.4f}")
            print(f"Preview: {result['text_preview']}")
    else:
        print("No matching documents found.")

main()


Found 5 relevant documents:

--- Result 1 ---
File: rfc8514.txt
Similarity score: 0.4574
Preview: 





Internet Engineering Task Force (IETF)                          S. Bosch
Request for Comments: 8514                               Open Xchange Oy
Category: Standards Track                       ...

--- Result 2 ---
File: rfc8508.txt
Similarity score: 0.4520
Preview: 





Internet Engineering Task Force (IETF)                         S. Brandt
Request for Comments: 8508                                       Verizon
Category: Standards Track                       ...

--- Result 3 ---
File: rfc8970.txt
Similarity score: 0.4264
Preview: ﻿



Internet Engineering Task Force (IETF)                        M. Slusarz
Request for Comments: 8970                             Open-Xchange Inc.
Category: Standards Track                        ...

--- Result 4 ---
File: rfc8579.txt
Similarity score: 0.4140
Preview: 





Internet Engineering Task Force (IETF)                          S. Bosch


  search_results = client.search( # deprecated, alternate use to be figured out!
