# 04 - ETL Specification Ingestion

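This notebook loads raw design documents (Markdown and plain-text files) from the `design_docs/` folder, splits them into overlapping chunks, embeds them with an Ollama embedding model, and indexes them into Qdrant. Note that each run deletes and recreates the target collection, so the index always reflects the current contents of the folder.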

In [2]:
import os
import sys
import glob

# Add parent directory to path to import src
sys.path.append('..')

# Libraries
from langchain_community.document_loaders import DirectoryLoader, TextLoader, UnstructuredMarkdownLoader
# UPDATED IMPORT: Use langchain_text_splitters directly
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_qdrant import Qdrant
from langchain_ollama import OllamaEmbeddings
from qdrant_client import QdrantClient
from qdrant_client.http import models

# ---------------------------------------------------------------------------
# Settings used by the rest of this cell
# ---------------------------------------------------------------------------
DOCS_DIR = "../design_docs"            # folder scanned for .md / .txt sources
QDRANT_URL = "http://qdrant:6333"      # endpoint handed to QdrantClient
COLLECTION_NAME = "etl_specs"          # collection that receives the chunks
EMBEDDING_MODEL = "nomic-embed-text"   # Ensure this model is pulled in Ollama

# Show the resolved location so a wrong working directory is easy to spot.
docs_dir_abs = os.path.abspath(DOCS_DIR)
print(f"Checking Document Directory: {docs_dir_abs}")

# Create the folder on first run so the loaders below have something to scan.
docs_dir_missing = not os.path.exists(DOCS_DIR)
if docs_dir_missing:
    os.makedirs(DOCS_DIR)
    print("Created missing directory.")

# 1. Load Documents
print("Loading documents...")


def _load_matching(label, pattern, loader_cls, loader_kwargs=None):
    """Load every file under DOCS_DIR that matches ``pattern``.

    Args:
        label: Human-readable file kind used in the progress/error messages
            (e.g. "Markdown").
        pattern: Glob pattern passed to ``DirectoryLoader``.
        loader_cls: Loader class instantiated for each matching file.
        loader_kwargs: Optional keyword arguments forwarded to ``loader_cls``.

    Returns:
        The list of loaded documents, or an empty list if loading failed.
    """
    try:
        loader = DirectoryLoader(
            DOCS_DIR,
            glob=pattern,
            loader_cls=loader_cls,
            loader_kwargs=dict(loader_kwargs or {}),
        )
        loaded = loader.load()
    except Exception as e:
        # Deliberate best-effort: a failure for one file kind is reported and
        # must not prevent the other kind from being ingested.
        print(f"Error loading {label}: {e}")
        return []
    print(f"Loaded {len(loaded)} {label} files.")
    return loaded


# Load Markdown
md_docs = _load_matching("Markdown", "**/*.md", UnstructuredMarkdownLoader)

# Load Text Files (Supplementary Notes)
# autodetect_encoding lets TextLoader fall back to a detected encoding when the
# default one cannot decode a file, instead of aborting the whole text load.
txt_docs = _load_matching("Text", "**/*.txt", TextLoader, {"autodetect_encoding": True})

docs = [*md_docs, *txt_docs]
print(f"Total Documents to Index: {len(docs)}")

if docs:
    # 2. Split
    # Overlapping chunks (chunk_size=1000, chunk_overlap=200) so text that
    # straddles a chunk boundary is still present in full in one of the chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    print(f"Created {len(splits)} chunks.")

    # 3. Index
    print("Indexing to Qdrant...")
    embeddings = OllamaEmbeddings(base_url="http://host.docker.internal:11434", model=EMBEDDING_MODEL)

    # Probe the embedding model BEFORE touching the collection. This
    #  (a) yields the real vector size for EMBEDDING_MODEL instead of a
    #      hard-coded 768 (the value previously assumed for nomic-embed-text), and
    #  (b) fails fast if Ollama is unreachable or the model is not pulled, so the
    #      existing collection is not deleted and then left empty.
    vector_size = len(embeddings.embed_query("dimension probe"))

    # Initialize Client
    client = QdrantClient(url=QDRANT_URL)

    # Manual Collection Management: every run starts from an empty collection.
    # The existence check replaces a blanket `except Exception: pass`, so
    # connection errors surface here and the "Deleted" message is only printed
    # when a collection was actually removed.
    if client.collection_exists(COLLECTION_NAME):
        client.delete_collection(COLLECTION_NAME)
        print(f"Deleted existing collection: {COLLECTION_NAME}")

    print(f"Creating collection: {COLLECTION_NAME}")
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(size=vector_size, distance=models.Distance.COSINE),
    )

    # Instantiate Vectorstore directly with client
    # NOTE(review): newer langchain_qdrant releases appear to favour
    # QdrantVectorStore over Qdrant -- confirm against the installed version.
    qdrant = Qdrant(
        client=client,
        collection_name=COLLECTION_NAME,
        embeddings=embeddings,
    )

    # Add documents (embeds each chunk and upserts it into the collection)
    qdrant.add_documents(splits)

    print("Ingestion Complete! Data available in Qdrant.")
else:
    print("No documents found to ingest. Please add .md or .txt files to design_docs/")

Checking Document Directory: /workspace/design_docs
Loading documents...
Loaded 1 Markdown files.
Loaded 1 Text files.
Total Documents to Index: 2
Created 2 chunks.
Indexing to Qdrant...
Deleted existing collection: etl_specs
Creating collection: etl_specs
Ingestion Complete! Data available in Qdrant.
