From 3c10d83caf7e90105ac5eb032e7bf5e85302084b Mon Sep 17 00:00:00 2001 From: Charith Nuwan Bimsara <59943919+nuwangeek@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:35:41 +0530 Subject: [PATCH 1/2] Npm update (#381) * remove unwanted file * updated changes * fixed requested changes * fixed issue * service workflow implementation without calling service endpoints * fixed requested changes * fixed issues * protocol related requested changes * fixed requested changes * update time tracking * added time tracking and relocate input guardrail before tool classifier * fixed issue * fixed issue * added hybrid search for the service detection * update tool classifier * fixing merge conflicts * fixed issue * optimize first user query response generation time * fixed pr reviewed issues * service integration * context based response generation flow * fixed pr review suggested issues * removed service project layer * fixed issues * delete unnecessary files * added requested changes * update GUI/Dockerfile.dev to lock package.json --------- Co-authored-by: Thiru Dinesh <56014038+Thirunayan22@users.noreply.github.com> --- GUI/Dockerfile.dev | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GUI/Dockerfile.dev b/GUI/Dockerfile.dev index 48b7890e..613b6bc9 100644 --- a/GUI/Dockerfile.dev +++ b/GUI/Dockerfile.dev @@ -1,9 +1,9 @@ FROM node:22.0.0-alpine AS image WORKDIR /app -COPY ./package.json . +COPY ./package.json ./package-lock.json ./ FROM image AS build -RUN npm install --legacy-peer-deps --mode=development +RUN npm ci --legacy-peer-deps COPY . . 
RUN ./node_modules/.bin/vite build --mode=development From 35c4227fec036d654a704cefd45867806caa888a Mon Sep 17 00:00:00 2001 From: Charith Nuwan Bimsara <59943919+nuwangeek@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:36:46 +0530 Subject: [PATCH 2/2] [Bug]Fix issue in prompt config max character limit and source url validation (#384) * remove unwanted file * updated changes * fixed requested changes * fixed issue * service workflow implementation without calling service endpoints * fixed requested changes * fixed issues * protocol related requested changes * fixed requested changes * update time tracking * added time tracking and relocate input guardrail before tool classifier * fixed issue * fixed issue * added hybrid search for the service detection * update tool classifier * fixing merge conflicts * fixed issue * optimize first user query response generation time * fixed pr reviewed issues * service integration * context based response generation flow * fixed pr review suggested issues * removed service project layer * fixed issues * delete unnecessary files * added requested changes * Fixed issue in prompt config max character limit and source url validation * fixed formatting issue --------- Co-authored-by: Thiru Dinesh <56014038+Thirunayan22@users.noreply.github.com> --- GUI/src/pages/PromptConfigurations/index.tsx | 3 +- GUI/src/pages/TestModel/index.tsx | 1 + src/vector_indexer/document_loader.py | 59 +++++++++++++++----- 3 files changed, 49 insertions(+), 14 deletions(-) diff --git a/GUI/src/pages/PromptConfigurations/index.tsx b/GUI/src/pages/PromptConfigurations/index.tsx index 0c7b5113..3baf6b8e 100644 --- a/GUI/src/pages/PromptConfigurations/index.tsx +++ b/GUI/src/pages/PromptConfigurations/index.tsx @@ -74,9 +74,10 @@ const PromptConfigurations: FC = () => { setPromptText(e.target.value)} - minRows={10} + maxRows={15} />
diff --git a/GUI/src/pages/TestModel/index.tsx b/GUI/src/pages/TestModel/index.tsx index 7ca34b77..2fc116bd 100644 --- a/GUI/src/pages/TestModel/index.tsx +++ b/GUI/src/pages/TestModel/index.tsx @@ -143,6 +143,7 @@ const TestLLM: FC = () => { label="" name="" maxLength={1000} + maxRows={15} onChange={(e) => handleChange('text', e.target.value)} showMaxLength={true} /> diff --git a/src/vector_indexer/document_loader.py b/src/vector_indexer/document_loader.py index 5558a1fc..9e03b290 100644 --- a/src/vector_indexer/document_loader.py +++ b/src/vector_indexer/document_loader.py @@ -4,6 +4,8 @@ import json from pathlib import Path from typing import List +from urllib.parse import urlparse + from loguru import logger from vector_indexer.config.config_loader import VectorIndexerConfig @@ -20,10 +22,19 @@ class DocumentLoadError(Exception): class DocumentLoader: """Handles document discovery and loading from datasets folder.""" - def __init__(self, config: VectorIndexerConfig): + def __init__(self, config: VectorIndexerConfig) -> None: self.config = config self.datasets_path = Path(config.dataset_base_path) + @staticmethod + def _is_valid_url(url: str) -> bool: + """Validate that a URL has a proper scheme and network location.""" + try: + parsed = urlparse(url) + return parsed.scheme in ("http", "https") and bool(parsed.netloc) + except Exception: + return False + def discover_all_documents(self) -> List[DocumentInfo]: """ Optimized document discovery using pathlib.glob for better performance. 
@@ -88,22 +99,44 @@ def discover_all_documents(self) -> List[DocumentInfo]: # Check metadata file exists metadata_file = hash_dir / self.config.metadata_file - if metadata_file.exists(): - documents.append( - DocumentInfo( - document_hash=content_hash, # Use content hash consistently - cleaned_txt_path=str(cleaned_file), - source_meta_path=str(metadata_file), - dataset_collection=collection_name, - ) + if not metadata_file.exists(): + logger.warning( + f"Skipping document in {hash_dir.name}: " + f"missing {self.config.metadata_file}" ) - logger.debug( - f"Found document: {content_hash[:12]}... in collection: {collection_name}" + continue + + # Validate source_url before accepting the document + try: + with open(metadata_file, "r", encoding="utf-8") as mf: + meta = json.load(mf) + source_url = meta.get("source_url") or "" + except Exception as e: + logger.warning( + f"Skipping document in {hash_dir.name}: " + f"failed to read metadata: {e}" ) - else: + continue + + if not self._is_valid_url(source_url): logger.warning( - f"Skipping document in {hash_dir.name}: missing {self.config.metadata_file}" + f"Skipping document in {hash_dir.name}: " + f"invalid source_url '{source_url}'" + ) + continue + + documents.append( + DocumentInfo( + document_hash=content_hash, # Use content hash consistently + cleaned_txt_path=str(cleaned_file), + source_meta_path=str(metadata_file), + dataset_collection=collection_name, ) + ) + logger.debug( + f"Found document: {content_hash[:12]}... " + f"in collection: {collection_name}" + ) logger.info(f"Discovered {len(documents)} documents for processing") return documents