Merge pull request #259 from pyupio/enhancement/limit_loaded_content
Enhancement/limit loaded content
SCH227 committed Jan 16, 2023
2 parents 39fd77b + 8a4e185 commit e022850
Showing 7 changed files with 86 additions and 43 deletions.
45 changes: 37 additions & 8 deletions changelogs/changelogs.py
@@ -16,6 +16,8 @@
 
 GITHUB_API_TOKEN = os.environ.get("CHANGELOGS_GITHUB_API_TOKEN", False)
 
+CHARS_LIMIT = int(os.environ.get("CHARS_LIMIT", 2000000))  # default 2,000,000 chars (~2 MB at one byte per char)
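The limit is read from the environment once, at import time, so overriding it has to happen before the first import. A minimal sketch (the value is illustrative):

import os
os.environ["CHARS_LIMIT"] = "500000"  # hypothetical 500,000-char cap; set before importing

import changelogs  # CHARS_LIMIT is evaluated when changelogs.changelogs first loads
log = changelogs.get("requests")  # uses the overridden default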


 def _load_custom_functions(vendor, name):
     """
@@ -147,12 +149,13 @@ def check_switch_vendor(old_vendor, name, urls, _depth=0):
     return "", ""
 
 
-def get(name, vendor="pypi", functions={}, _depth=0):
+def get(name, vendor="pypi", functions={}, chars_limit=CHARS_LIMIT, _depth=0):
     """
     Tries to find a changelog for the given package.
     :param name: str, package name
     :param vendor: str, vendor
     :param functions: dict, custom functions
+    :param chars_limit: int, max number of chars to load per changelog entry
     :return: dict, changelog
     """
     fns = _bootstrap_functions(name=name, vendor=vendor, functions=functions)
@@ -173,7 +176,7 @@ def get(name, vendor="pypi", functions={}, _depth=0):
     )
 
     # load the content from the given urls and parse the changelog
-    content = fns["get_content"](session=session, urls=urls)
+    content = fns["get_content"](session=session, urls=urls, chars_limit=chars_limit)
     changelog = fns["parse"](
         name=name,
         content=content,
@@ -217,14 +220,36 @@ def get_commit_log(name, vendor='pypi', functions={}, _depth=0):
     )
 
 
-def get_content(session, urls):
+def get_limited_content_entry(session, url, chars_limit):
+    """
+    Loads the content for a URL entry, up to chars_limit characters.
+    :param session: requests Session instance
+    :param url: str, URL
+    :param chars_limit: int, max number of chars to load
+    :return: str, limited content
+    """
+    limited_content = ""
+    with session.get(url, stream=True) as resp:
+        if resp.status_code == 200:
+            try:
+                # Avoid https://github.com/psf/requests/issues/3359
+                if not resp.encoding:
+                    resp.encoding = 'utf-8'
+                limited_content = next(resp.iter_content(chunk_size=chars_limit,
+                                                         decode_unicode=True))
+            except StopIteration:
+                pass
+    return limited_content
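For illustration, a minimal sketch of calling the new helper directly (the URL is hypothetical; the session is a plain requests.Session, matching what the module passes in):

import requests

session = requests.Session()
# Streams the response and keeps only the first chunk: roughly chars_limit
# bytes are read and decoded to text, instead of downloading the whole body.
snippet = get_limited_content_entry(session, "https://example.com/CHANGELOG.md", 1000)
print(len(snippet))  # bounded by the limit, regardless of the real file size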


+def get_content(session, urls, chars_limit):
     """
     Loads the content from URLs, ignoring connection errors.
     :param session: requests Session instance
     :param urls: list, str URLs
+    :param chars_limit: int, max number of chars to load per changelog entry
     :return: str, content
     """
 
     content = ""
     for url in urls:
         try:
@@ -235,7 +260,6 @@ def get_content(session, urls):
                     logger.warning("Fetching release pages requires CHANGELOGS_GITHUB_API_TOKEN "
                                    "to be set")
                     continue
-
                 page = 0
                 exist_pages = True
                 headers = {
@@ -245,17 +269,22 @@ def get_content(session, urls):
                 while exist_pages:
                     resp = session.get(url, headers=headers, params={'page': page})
                     if resp.status_code == 200 and len(resp.json()) > 0:
+                        # These entries are limited by GitHub to 125 kB max
                         for item in resp.json():
                             if 'tag_name' in item and 'body' in item:
                                 content += "\n\n{}\n{}".format(item['tag_name'], item["body"])
                     else:
                         exist_pages = False
 
                     page += 1
 
             else:
                 resp = session.get(url)
                 if resp.status_code == 200:
-                    content += "\n\n" + resp.text
+                    content += "\n\n" + get_limited_content_entry(session, url, chars_limit)
+
+            # To avoid exceeding the content limit by accumulation
+            if len(content) > chars_limit:
+                break
 
         except requests.ConnectionError:
             pass
     return content
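A usage sketch under assumed inputs (both URLs hypothetical): each entry is individually capped by get_limited_content_entry, and the loop breaks once the accumulated content passes chars_limit, so the result can overshoot the limit by at most one capped entry plus separators:

import requests

session = requests.Session()
urls = [
    "https://example.com/CHANGELOG.md",  # hypothetical
    "https://example.com/HISTORY.rst",   # hypothetical
]
content = get_content(session, urls, chars_limit=2000)
# content may exceed 2000 slightly: the check runs after an entry is appended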
35 changes: 19 additions & 16 deletions changelogs/custom/pypi/newrelic.py
@@ -1,4 +1,5 @@
 from lxml import etree
+from changelogs.changelogs import get_limited_content_entry
 import sys
 
 
@@ -12,25 +13,27 @@ def get_urls(releases, **kwargs):
     return urls, set()
 
 
-def get_content(session, urls):
+def get_content(session, urls, chars_limit):
     log = ""
     for url in urls:
-        r = session.get(url)
-        if r.status_code == 200:
-            root = etree.HTML(r.content)
-            try:
-                article = root.xpath("//article/div[@class='content']")[0]
-                content = etree.tostring(article, method="text", encoding='utf-8')
-                if sys.version_info > (3, 0):
-                    content = content.decode("utf-8")
-                # remove first two lines
-                content = '\n'.join(content.split('\n')[2:-1])
-                log += "# {version}\n{content}\n\n".format(
-                    version=url.split("-")[-1],
-                    content=content,
+        limited_content_entry = get_limited_content_entry(session, url, chars_limit)
+        if limited_content_entry:
+            root = etree.HTML(limited_content_entry)
+        else:
+            continue
+        try:
+            article = root.xpath("//article/div[@class='content']")[0]
+            content = etree.tostring(article, method="text", encoding='utf-8')
+            if sys.version_info > (3, 0):
+                content = content.decode("utf-8")
+            # remove first two lines
+            content = '\n'.join(content.split('\n')[2:-1])
+            log += "# {version}\n{content}\n\n".format(
+                version=url.split("-")[-1],
+                content=content,
+            )
+        except IndexError:
+            pass
-            except IndexError:
-                pass
     return log
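The lxml extraction above can be exercised in isolation; a hedged sketch with inline HTML standing in for a fetched release-notes page:

from lxml import etree

html = "<article><div class='content'>title\nsubtitle\n- fixed a bug\n- improved docs\ntrailer</div></article>"
root = etree.HTML(html)
article = root.xpath("//article/div[@class='content']")[0]
text = etree.tostring(article, method="text", encoding="utf-8").decode("utf-8")
# drop the first two lines and the trailing one, as the parser above does
body = "\n".join(text.split("\n")[2:-1])  # "- fixed a bug\n- improved docs"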


25 changes: 14 additions & 11 deletions changelogs/custom/pypi/pbr.py
@@ -1,5 +1,6 @@
 from __future__ import unicode_literals
 from lxml import etree
+from changelogs.changelogs import get_limited_content_entry
 
 
 def get_urls(releases, **kwargs):
@@ -8,18 +9,20 @@ def get_urls(releases, **kwargs):
     }, set()
 
 
-def get_content(session, urls):
+def get_content(session, urls, chars_limit):
     log = ""
     for url in urls:
-        r = session.get(url)
-        if r.status_code == 200:
-            root = etree.HTML(r.content)
-            for item in root.xpath("//div[@class='section']"):
-                try:
-                    log += "{version}\n{content}\n\n".format(
-                        version=item.xpath("h3/text()")[0],
-                        content="\n".join(["- {}".format(li) for li in item.xpath("ul/li/text()")])
+        limited_content_entry = get_limited_content_entry(session, url, chars_limit)
+        if limited_content_entry:
+            root = etree.HTML(limited_content_entry)
+        else:
+            continue
+        for item in root.xpath("//div[@class='section']"):
+            try:
+                log += "{version}\n{content}\n\n".format(
+                    version=item.xpath("h3/text()")[0],
+                    content="\n".join(["- {}".format(li) for li in item.xpath("ul/li/text()")])
+                )
+            except IndexError:
+                pass
-                except IndexError:
-                    pass
     return log
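Similarly, the pbr parser's version/bullet assembly can be sketched against a minimal section (the HTML shape is assumed from the docs page it scrapes):

from lxml import etree

html = "<div class='section'><h3>1.2.3</h3><ul><li>first fix</li><li>second fix</li></ul></div>"
root = etree.HTML(html)
item = root.xpath("//div[@class='section']")[0]
version = item.xpath("h3/text()")[0]  # "1.2.3"
bullets = "\n".join(["- {}".format(li) for li in item.xpath("ul/li/text()")])
# bullets == "- first fix\n- second fix"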
8 changes: 4 additions & 4 deletions changelogs/custom/pypi/uwsgi.py
@@ -1,4 +1,5 @@
 import requests
+from changelogs.changelogs import get_limited_content_entry
 
 
 def get_urls(releases, **kwargs):
@@ -10,14 +11,13 @@ def get_urls(releases, **kwargs):
     return urls, set()
 
 
-def get_content(session, urls):
+def get_content(session, urls, chars_limit):
     content = {}
     for url in urls:
         v = url.rsplit("-", 1)[1].rsplit(".", 1)[0]
         try:
-            resp = session.get(url)
-            if resp.status_code == 200:
-                content[v] = resp.text.split("\n", 2)[-1]
+            resp_text = get_limited_content_entry(session, url, chars_limit)
+            content[v] = resp_text.split("\n", 2)[-1]
         except requests.ConnectionError:
             pass
     return content
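The version key is derived purely from the URL; a quick check of that rsplit chain (the URL shape is an assumption based on how get_urls builds it):

url = "https://raw.githubusercontent.com/unbit/uwsgi-docs/master/Changelog-2.0.21.rst"  # assumed shape
v = url.rsplit("-", 1)[1].rsplit(".", 1)[0]
# rsplit("-", 1)[1] -> "2.0.21.rst"; rsplit(".", 1)[0] -> "2.0.21"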
10 changes: 6 additions & 4 deletions changelogs/launchpad.py
@@ -52,17 +52,19 @@ def get_urls(session, name, data, find_changelogs_fn, **kwargs):
     return {"https://api.launchpad.net/1.0/{}/releases".format(name)}, set()
 
 
-def get_content(session, urls):
+def get_content(session, urls, chars_limit):
     """
     Loads the content from URLs, ignoring connection errors.
     :param session: requests Session instance
     :param urls: list, str URLs
+    :param chars_limit: int, max number of chars to load per changelog entry
     :return: str, content
     """
     for url in urls:
-        resp = session.get(url)
-        if resp.ok:
-            return resp.json()
+        with session.get(url, stream=True) as resp:
+            # Skip parsing when the body exceeds the limit: truncated JSON would not parse
+            if resp.ok and int(resp.headers.get('content-length', 0)) < chars_limit:
+                return resp.json()
     return {}
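A hedged sketch of the header check in isolation: Content-Length is inspected before any body is read, which is why the request is streamed. The header can be absent (e.g. chunked responses), in which case the size defaults to 0 here:

import requests

session = requests.Session()
with session.get("https://api.launchpad.net/1.0/pbr/releases", stream=True) as resp:  # example project
    size = int(resp.headers.get("content-length", 0))
    if resp.ok and size < 2000000:
        data = resp.json()  # only parsed when the declared size is within the limit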


5 changes: 5 additions & 0 deletions tests/test_changelogs.py
@@ -21,6 +21,11 @@ def session():
         return betamax_session
     monkeypatch.setattr("changelogs.changelogs.Session", session)
 
 
+def test_get_max_chars():
+    log = changelogs.get("requests", chars_limit=200)
+    assert len(str(log)) < 200 * 4  # streamed chars may be up to 4 bytes each after encoding
+
+
 def test_django_registration_redux():
     log = changelogs.get("django-registration-redux")

Large diffs are not rendered by default.
