Skip to content

Commit

Permalink
Modify the load_data method to avoid duplicates and reduce the number… (
Browse files Browse the repository at this point in the history
  • Loading branch information
regCode committed Apr 22, 2024
1 parent da397da commit 1c6d542
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -100,29 +100,30 @@ def load_data(self, base_url: str) -> List[Document]:

while urls_to_visit:
current_url, depth = urls_to_visit.pop(0)

print(f"Visiting: {current_url}, {len(urls_to_visit)} left")

if depth > self.max_depth:
continue
try:
self.driver.get(current_url)
page_content = self.extract_content()

# links = self.driver.find_elements(By.TAG_NAME, 'a')
links = self.extract_links()
# clean all urls
links = [self.clean_url(link) for link in links]
# extract new links
links = [link for link in links if link not in added_urls]
print(f"Found {len(links)} new potential links")
for href in links:
try:
if href.startswith(self.prefix) and href not in added_urls:
urls_to_visit.append((href, depth + 1))
added_urls.add(href)
except Exception:
continue
added_urls.add(current_url)

next_depth = depth + 1
if next_depth <= self.max_depth:
# links = self.driver.find_elements(By.TAG_NAME, 'a')
links = self.extract_links()
# clean all urls
links = [self.clean_url(link) for link in links]
# extract new links
links = [link for link in links if link not in added_urls]
print(f"Found {len(links)} new potential links")

for href in links:
try:
if href.startswith(self.prefix) and href not in added_urls:
urls_to_visit.append((href, next_depth))
added_urls.add(href)
except Exception:
continue

documents.append(
Document(text=page_content, extra_info={"URL": current_url})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ license = "MIT"
maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
name = "llama-index-readers-web"
readme = "README.md"
version = "0.1.9"
version = "0.1.10"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
Expand Down

0 comments on commit 1c6d542

Please sign in to comment.