Skip to content

Commit

Permalink
Adds `no_ssl` option to Trafilatura Web Loader (#14290)
Browse files Browse the repository at this point in the history
* add no_ssl option to Trafilatura web loader

* vbump

---------

Co-authored-by: Andrei Fajardo <andrei@nerdai.io>
  • Loading branch information
jonathanhliu21 and nerdai committed Jun 22, 2024
1 parent bc21df2 commit 40600da
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def load_data(
include_formatting=False,
include_links=False,
show_progress=False,
no_ssl=False,
**kwargs,
) -> List[Document]:
"""Load data from the urls.
Expand All @@ -42,6 +43,7 @@ def load_data(
include_formatting (bool, optional): Include formatting in the output. Defaults to False.
include_links (bool, optional): Include links in the output. Defaults to False.
show_progress (bool, optional): Show progress bar. Defaults to False.
no_ssl (bool, optional): Bypass SSL verification when fetching each URL (forwarded to `trafilatura.fetch_url`). Defaults to False.
kwargs: Additional keyword arguments for the `trafilatura.extract` function.
Returns:
Expand All @@ -61,7 +63,7 @@ def load_data(
else:
iterator = urls
for url in iterator:
downloaded = trafilatura.fetch_url(url)
downloaded = trafilatura.fetch_url(url, no_ssl=no_ssl)
response = trafilatura.extract(
downloaded,
include_comments=include_comments,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ license = "MIT"
maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
name = "llama-index-readers-web"
readme = "README.md"
version = "0.1.18"
version = "0.1.19"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
Expand Down

0 comments on commit 40600da

Please sign in to comment.