From 3c10d83caf7e90105ac5eb032e7bf5e85302084b Mon Sep 17 00:00:00 2001 From: Charith Nuwan Bimsara <59943919+nuwangeek@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:35:41 +0530 Subject: [PATCH 1/2] Npm update (#381) * remove unwanted file * updated changes * fixed requested changes * fixed issue * service workflow implementation without calling service endpoints * fixed requested changes * fixed issues * protocol related requested changes * fixed requested changes * update time tracking * added time tracking and relocate input guardrail before tool classifier * fixed issue * fixed issue * added hybrid search for the service detection * update tool classifier * fixing merge conflicts * fixed issue * optimize first user query response generation time * fixed pr reviewed issues * service integration * context based response generation flow * fixed pr review suggested issues * removed service project layer * fixed issues * delete unnecessary files * added requested changes * update GUI/Dockerfile.dev to lock package.json --------- Co-authored-by: Thiru Dinesh <56014038+Thirunayan22@users.noreply.github.com> --- GUI/Dockerfile.dev | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GUI/Dockerfile.dev b/GUI/Dockerfile.dev index 48b7890e..613b6bc9 100644 --- a/GUI/Dockerfile.dev +++ b/GUI/Dockerfile.dev @@ -1,9 +1,9 @@ FROM node:22.0.0-alpine AS image WORKDIR /app -COPY ./package.json . +COPY ./package.json ./package-lock.json ./ FROM image AS build -RUN npm install --legacy-peer-deps --mode=development +RUN npm ci --legacy-peer-deps COPY . . 
RUN ./node_modules/.bin/vite build --mode=development From 35c4227fec036d654a704cefd45867806caa888a Mon Sep 17 00:00:00 2001 From: Charith Nuwan Bimsara <59943919+nuwangeek@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:36:46 +0530 Subject: [PATCH 2/2] [Bug]Fix issue in prompt config max character limit and source url validation (#384) * remove unwanted file * updated changes * fixed requested changes * fixed issue * service workflow implementation without calling service endpoints * fixed requested changes * fixed issues * protocol related requested changes * fixed requested changes * update time tracking * added time tracking and relocate input guardrail before tool classifier * fixed issue * fixed issue * added hybrid search for the service detection * update tool classifier * fixing merge conflicts * fixed issue * optimize first user query response generation time * fixed pr reviewed issues * service integration * context based response generation flow * fixed pr review suggested issues * removed service project layer * fixed issues * delete unnecessary files * added requested changes * Fixed issue in prompt config max character limit and source url validation * fixed formatting issue --------- Co-authored-by: Thiru Dinesh <56014038+Thirunayan22@users.noreply.github.com> --- GUI/src/pages/PromptConfigurations/index.tsx | 3 +- GUI/src/pages/TestModel/index.tsx | 1 + src/vector_indexer/document_loader.py | 59 +++++++++++++++----- 3 files changed, 49 insertions(+), 14 deletions(-) diff --git a/GUI/src/pages/PromptConfigurations/index.tsx b/GUI/src/pages/PromptConfigurations/index.tsx index 0c7b5113..3baf6b8e 100644 --- a/GUI/src/pages/PromptConfigurations/index.tsx +++ b/GUI/src/pages/PromptConfigurations/index.tsx @@ -74,9 +74,10 @@ const PromptConfigurations: FC = () => { setPromptText(e.target.value)} - minRows={10} + maxRows={15} />
diff --git a/GUI/src/pages/TestModel/index.tsx b/GUI/src/pages/TestModel/index.tsx index 7ca34b77..2fc116bd 100644 --- a/GUI/src/pages/TestModel/index.tsx +++ b/GUI/src/pages/TestModel/index.tsx @@ -143,6 +143,7 @@ const TestLLM: FC = () => { label="" name="" maxLength={1000} + maxRows={15} onChange={(e) => handleChange('text', e.target.value)} showMaxLength={true} /> diff --git a/src/vector_indexer/document_loader.py b/src/vector_indexer/document_loader.py index 5558a1fc..9e03b290 100644 --- a/src/vector_indexer/document_loader.py +++ b/src/vector_indexer/document_loader.py @@ -4,6 +4,8 @@ import json from pathlib import Path from typing import List +from urllib.parse import urlparse + from loguru import logger from vector_indexer.config.config_loader import VectorIndexerConfig @@ -20,10 +22,19 @@ class DocumentLoadError(Exception): class DocumentLoader: """Handles document discovery and loading from datasets folder.""" - def __init__(self, config: VectorIndexerConfig): + def __init__(self, config: VectorIndexerConfig) -> None: self.config = config self.datasets_path = Path(config.dataset_base_path) + @staticmethod + def _is_valid_url(url: str) -> bool: + """Validate that a URL has a proper scheme and network location.""" + try: + parsed = urlparse(url) + return parsed.scheme in ("http", "https") and bool(parsed.netloc) + except Exception: + return False + def discover_all_documents(self) -> List[DocumentInfo]: """ Optimized document discovery using pathlib.glob for better performance. 
@@ -88,22 +99,44 @@ def discover_all_documents(self) -> List[DocumentInfo]: # Check metadata file exists metadata_file = hash_dir / self.config.metadata_file - if metadata_file.exists(): - documents.append( - DocumentInfo( - document_hash=content_hash, # Use content hash consistently - cleaned_txt_path=str(cleaned_file), - source_meta_path=str(metadata_file), - dataset_collection=collection_name, - ) + if not metadata_file.exists(): + logger.warning( + f"Skipping document in {hash_dir.name}: " + f"missing {self.config.metadata_file}" ) - logger.debug( - f"Found document: {content_hash[:12]}... in collection: {collection_name}" + continue + + # Validate source_url before accepting the document + try: + with open(metadata_file, "r", encoding="utf-8") as mf: + meta = json.load(mf) + source_url = meta.get("source_url") or "" + except Exception as e: + logger.warning( + f"Skipping document in {hash_dir.name}: " + f"failed to read metadata: {e}" ) - else: + continue + + if not self._is_valid_url(source_url): logger.warning( - f"Skipping document in {hash_dir.name}: missing {self.config.metadata_file}" + f"Skipping document in {hash_dir.name}: " + f"invalid source_url '{source_url}'" + ) + continue + + documents.append( + DocumentInfo( + document_hash=content_hash, # Use content hash consistently + cleaned_txt_path=str(cleaned_file), + source_meta_path=str(metadata_file), + dataset_collection=collection_name, ) + ) + logger.debug( + f"Found document: {content_hash[:12]}... " + f"in collection: {collection_name}" + ) logger.info(f"Discovered {len(documents)} documents for processing") return documents