In [1]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so the
# os.environ / os.getenv lookups in the following cells can resolve them.
load_dotenv()

True

In [2]:
import os

# --- Notebook configuration -------------------------------------------------
# Root page the crawl starts from.
URL = "https://erhaultimate.co.id/"

# Credential for the Firecrawl client; None when the variable is not defined.
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")

# Vector size used when the Qdrant collection is created.
VECTOR_SIZE = 768

In [3]:
from firecrawl import Firecrawl

# Firecrawl client used by the crawl cell; authenticated with the API key
# that the configuration cell read from the environment.
app = Firecrawl(api_key=FIRECRAWL_API_KEY)

In [4]:
# Upper bound on the number of pages fetched in one run. Crawling the entire
# domain plus subdomains without a cap can run (and bill) for a very long
# time; raise this value if the site legitimately has more pages.
CRAWL_PAGE_LIMIT = 100

# Blocking call: returns once the crawl job has finished (or the cap is hit).
results = app.crawl(
    url=URL,
    allow_subdomains=True,
    crawl_entire_domain=True,
    limit=CRAWL_PAGE_LIMIT,
)

# Show only how many pages came back instead of dumping every page's content
# into the notebook output.
len(results.data)

KeyboardInterrupt: 

In [None]:
from qdrant_client import QdrantClient

# Local Qdrant instance the notebook writes to.
qdrant_host = "http://localhost:6333"

# Target collection comes from the environment. Fail here with a clear message
# rather than letting a None collection name surface as an opaque error in the
# collection-creation or upsert cells.
collection_name = os.getenv("QDRANT_COLLECTION")
if not collection_name:
    raise RuntimeError(
        "QDRANT_COLLECTION is not set; define it in the environment or the .env file."
    )

qdrant_client = QdrantClient(url=qdrant_host)

In [None]:
from qdrant_client.http.models import VectorParams, Distance

# Create the target collection only when it does not exist yet, so re-running
# this cell leaves an existing collection alone.
collections = qdrant_client.get_collections()
existing_collections = list(map(lambda entry: entry.name, collections.collections))

collection_missing = collection_name not in existing_collections
if collection_missing:
    # Cosine-distance vectors of the size configured at the top of the notebook.
    cosine_vectors = VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=cosine_vectors,
    )

In [None]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# 1000-character chunks with 250 characters of overlap between neighbours.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=250
)

# One LangChain Document per crawled page, tagged with the page it came from
# so that chunks can later be attributed to their real source.
docs_to_process = []
for document in results.data:
    markdown = document.markdown
    # A page with no markdown (None or blank) has nothing to split or embed.
    if not markdown or not markdown.strip():
        continue

    # NOTE(review): assumes Firecrawl exposes the page address on the page's
    # metadata as `source_url` / `sourceURL` / `url` (object attribute or dict
    # key) - confirm against the installed SDK. Falls back to the crawl root.
    page_meta = getattr(document, "metadata", None)
    if isinstance(page_meta, dict):
        page_url = (
            page_meta.get("source_url")
            or page_meta.get("sourceURL")
            or page_meta.get("url")
        )
    else:
        page_url = getattr(page_meta, "source_url", None) or getattr(page_meta, "url", None)

    docs_to_process.append(
        Document(page_content=markdown, metadata={"source_url": page_url or URL})
    )

# split_documents copies each page's metadata onto the chunks cut from it.
chunks = splitter.split_documents(documents=docs_to_process)
chunk_texts = [chunk.page_content for chunk in chunks]
print(f"Split into {len(chunk_texts)} chunks.")

Split into 60 chunks.


In [None]:
import re

# Heading levels tracked as chunk metadata, shallowest first.
HEADER_LEVELS = ("h1", "h2", "h3", "h4", "h5")
# Chunks shorter than this after cleaning are treated as noise and dropped.
MIN_CHUNK_CHARS = 50

# Markdown ATX heading (1-5 '#') on its own line. Uses [ \t] instead of \s so a
# match never spills onto the next line, and '$' instead of '\n' so a heading
# on the last line of a chunk is still recognised.
_HEADING_RE = re.compile(r'^[ \t]*(#{1,5})[ \t]+(.+?)[ \t]*$', re.MULTILINE)
# Any heading line (all levels), including one that ends the chunk.
_HEADING_LINE_RE = re.compile(r'^[ \t]*#+[ \t]+.*(?:\n|$)', re.MULTILINE)

_LINKED_IMAGE_RE = re.compile(r'\[!\[.*?\]\(.*?\)\]\(.*?\)')
_IMAGE_RE = re.compile(r'!\[.*?\]\(.*?\)')
_NAV_LINK_RE = re.compile(
    r'\[(Show More|Previous|Next|FILTER|LIHAT SELENGKAPNYA|Send|Prev)\]\(.*?\)',
    re.IGNORECASE,
)
_LINK_RE = re.compile(r'\[(\*?.*?\*?)\]\(.*?\)')

# UI boilerplate. Single-word labels are matched as whole words only, so words
# that merely contain them (e.g. "Problems") are left intact.
_UI_NOISE_RE = re.compile(
    r'Your browser does not support the video tag\.'
    r'|Start Your Change!'
    r'|\b(?:Previous|Next|Categories|Problem|SEARCH)\b'
    r'|IDR\.'
    r'|LIHAT SELENGKAPNYA'
    r'|OUR JOURNEY',
    re.IGNORECASE,
)
# A line consisting solely of a bullet and a number (e.g. "- 3"). Anchored to
# the whole line so numeric ranges and real list items are not damaged and
# neighbouring lines are not glued together.
_PAGER_LINE_RE = re.compile(r'^[ \t]*-[ \t]*\d+[ \t]*$', re.MULTILINE)


def _track_headings(text, headers):
    """Update `headers` in place with the headings found in `text`.

    Headings are applied in document order; setting a level clears every
    deeper level, so after the call `headers` holds the heading path in
    effect at the end of `text`.
    """
    for match in _HEADING_RE.finditer(text):
        level = len(match.group(1))
        headers[f"h{level}"] = match.group(2).strip()
        for deeper in HEADER_LEVELS[level:]:
            headers[deeper] = None


def _clean_markdown(text):
    """Return `text` with images, links, headings and UI boilerplate removed."""
    # Linked images must go first: once the inner image is stripped the
    # linked-image pattern can no longer match.
    text = _LINKED_IMAGE_RE.sub('', text)
    text = _IMAGE_RE.sub('', text)
    text = _NAV_LINK_RE.sub('', text)
    text = _LINK_RE.sub(r'\1', text)            # keep link label, drop target
    text = re.sub(r'\\{2,}\n', ' ', text)       # backslash runs before a newline -> space
    text = _HEADING_LINE_RE.sub('', text)       # headings live in metadata instead
    text = _UI_NOISE_RE.sub('', text)
    text = _PAGER_LINE_RE.sub('', text)
    text = re.sub(r'\n\s*\d+K\s*\n', '\n', text)  # lines holding only a number + 'K'
    text = re.sub(r'(\n\s*){3,}', '\n\n', text)   # collapse runs of blank lines
    return text.strip()


current_headers = dict.fromkeys(HEADER_LEVELS)
processed_docs = []
previous_clean_text = ""
previous_source_url = None

for chunk_doc in chunks:
    chunk = chunk_doc.page_content
    # Prefer the page URL carried on the chunk; fall back to the crawl root
    # when the chunk has no source metadata.
    source_url = chunk_doc.metadata.get("source_url") or URL

    # Headings from one page must not label chunks of the next page.
    if source_url != previous_source_url:
        current_headers = dict.fromkeys(HEADER_LEVELS)
        previous_source_url = source_url

    _track_headings(chunk, current_headers)

    metadata = {level: title for level, title in current_headers.items() if title}
    metadata["source_url"] = source_url

    clean_text = _clean_markdown(chunk)

    # --- REDUNDANCY & QUALITY CHECK ---
    if len(clean_text) < MIN_CHUNK_CHARS:
        continue

    # Skip a chunk whose text is fully contained in the previously kept one.
    if clean_text in previous_clean_text:
        continue

    processed_docs.append({
        "text": clean_text,
        "metadata": metadata
    })

    previous_clean_text = clean_text

In [None]:
import uuid
from langchain_ollama.embeddings import OllamaEmbeddings
from qdrant_client.http.models import PointStruct

# Fail early with a clear message if the embedding model is not configured.
embedding_model_name = os.getenv("OLLAMA_EMBEDDING_MODEL")
if not embedding_model_name:
    raise RuntimeError(
        "OLLAMA_EMBEDDING_MODEL is not set; define it in the environment or the .env file."
    )

print("Loading model...")
ollama_embedding = OllamaEmbeddings(
    model=embedding_model_name,
    temperature=0
)

points = []
for i, doc in enumerate(processed_docs):
    print(f"Document {i+1}/{len(processed_docs)}")
    text = doc["text"]
    metadata = doc["metadata"]

    embedding = ollama_embedding.embed_query(text)
    # The collection is created with VECTOR_SIZE dimensions; catch a model
    # mismatch here instead of at upsert time.
    if len(embedding) != VECTOR_SIZE:
        raise ValueError(
            f"Embedding has {len(embedding)} dimensions but the collection expects {VECTOR_SIZE}."
        )

    # Deterministic id derived from source + text: re-running the notebook
    # overwrites the same points instead of inserting duplicates.
    point_id = str(
        uuid.uuid5(uuid.NAMESPACE_URL, f'{metadata.get("source_url", "")}\n{text}')
    )
    points.append(
        PointStruct(
            id=point_id,
            vector=embedding,
            # Store the chunk text alongside its metadata; without it a search
            # hit carries only headings and a URL, not the content itself.
            payload={**metadata, "text": text}
        )
    )

Loading model...
Document 1/57
Document 2/57
Document 3/57
Document 4/57
Document 5/57
Document 6/57
Document 7/57
Document 8/57
Document 9/57
Document 10/57
Document 11/57
Document 12/57
Document 13/57
Document 14/57
Document 15/57
Document 16/57
Document 17/57
Document 18/57
Document 19/57
Document 20/57
Document 21/57
Document 22/57
Document 23/57
Document 24/57
Document 25/57
Document 26/57
Document 27/57
Document 28/57
Document 29/57
Document 30/57
Document 31/57
Document 32/57
Document 33/57
Document 34/57
Document 35/57
Document 36/57
Document 37/57
Document 38/57
Document 39/57
Document 40/57
Document 41/57
Document 42/57
Document 43/57
Document 44/57
Document 45/57
Document 46/57
Document 47/57
Document 48/57
Document 49/57
Document 50/57
Document 51/57
Document 52/57
Document 53/57
Document 54/57
Document 55/57
Document 56/57
Document 57/57


In [None]:
# Write every embedded point to the collection in a single request.
# The returned update result is the cell's displayed output.
qdrant_client.upsert(
    collection_name=collection_name,
    points=points,
    wait=True,  # ask Qdrant to apply the write before the call returns
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)