From 1c6d542da8d83f125db59d0fe2077daf5767ae95 Mon Sep 17 00:00:00 2001
From: regCode <18012903+regCode@users.noreply.github.com>
Date: Mon, 22 Apr 2024 05:32:15 +0300
Subject: [PATCH] =?UTF-8?q?Modify=20the=20load=5Fdata=20method=20to=20avoi?=
 =?UTF-8?q?d=20duplicates=20and=20reduce=20the=20number=E2=80=A6=20(#12977?=
 =?UTF-8?q?)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../readers/web/whole_site/base.py            | 37 ++++++++++---------
 .../llama-index-readers-web/pyproject.toml    |  2 +-
 2 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/base.py
index 284bcb56a798f..2fab54fd5b0f9 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/base.py
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/whole_site/base.py
@@ -100,29 +100,30 @@ def load_data(self, base_url: str) -> List[Document]:
 
         while urls_to_visit:
             current_url, depth = urls_to_visit.pop(0)
-            print(f"Visiting: {current_url}, {len(urls_to_visit)} left")
-            if depth > self.max_depth:
-                continue
 
             try:
                 self.driver.get(current_url)
                 page_content = self.extract_content()
-
-                # links = self.driver.find_elements(By.TAG_NAME, 'a')
-                links = self.extract_links()
-                # clean all urls
-                links = [self.clean_url(link) for link in links]
-                # extract new links
-                links = [link for link in links if link not in added_urls]
-                print(f"Found {len(links)} new potential links")
-                for href in links:
-                    try:
-                        if href.startswith(self.prefix) and href not in added_urls:
-                            urls_to_visit.append((href, depth + 1))
-                            added_urls.add(href)
-                    except Exception:
-                        continue
+                added_urls.add(current_url)
+
+                next_depth = depth + 1
+                if next_depth <= self.max_depth:
+                    # links = self.driver.find_elements(By.TAG_NAME, 'a')
+                    links = self.extract_links()
+                    # clean all urls
+                    links = [self.clean_url(link) for link in links]
+                    # extract new links
+                    links = [link for link in links if link not in added_urls]
+                    print(f"Found {len(links)} new potential links")
+
+                    for href in links:
+                        try:
+                            if href.startswith(self.prefix) and href not in added_urls:
+                                urls_to_visit.append((href, next_depth))
+                                added_urls.add(href)
+                        except Exception:
+                            continue
 
                 documents.append(
                     Document(text=page_content, extra_info={"URL": current_url})
                 )
diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
index f9cf9a4620524..fe2e1b9daff9c 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
+++ b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
@@ -41,7 +41,7 @@ license = "MIT"
 maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
 name = "llama-index-readers-web"
 readme = "README.md"
-version = "0.1.9"
+version = "0.1.10"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
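
---
Note (an aside, not part of the patch): the rewritten loop is a plain
breadth-first crawl in which the `added_urls` set serves both as the dedupe
filter and as the record of fetched pages. Below is a minimal standalone
sketch of that pattern; `crawl` and `fetch_links` are hypothetical names, with
`fetch_links` standing in for the reader's Selenium-backed `extract_links`,
so this is a simplification of the patched `load_data`, not the library's API.

    from collections import deque
    from typing import Callable, List

    def crawl(base_url: str, max_depth: int, prefix: str,
              fetch_links: Callable[[str], List[str]]) -> List[str]:
        """Breadth-first crawl mirroring the patched load_data logic."""
        added_urls = {base_url}            # every enqueued URL is recorded here
        urls_to_visit = deque([(base_url, 0)])
        visited: List[str] = []

        while urls_to_visit:
            current_url, depth = urls_to_visit.popleft()
            visited.append(current_url)

            next_depth = depth + 1
            if next_depth > max_depth:     # as in the patch: fetch pages at the
                continue                   # depth limit but do not expand them

            for href in fetch_links(current_url):
                if href.startswith(prefix) and href not in added_urls:
                    added_urls.add(href)   # mark before enqueueing, so a URL
                    urls_to_visit.append((href, next_depth))  # is queued once

        return visited

Because membership in `added_urls` is checked before a link is enqueued
rather than after it is fetched, a page reached from many other pages is
downloaded at most once, which is the duplicate avoidance and request
reduction the subject line describes.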