Merge pull request #259 from pyupio/enhancement/limit_loaded_content
Enhancement/limit loaded content
SCH227 committed Jan 16, 2023
2 parents 39fd77b + 8a4e185 commit e022850
Showing 7 changed files with 86 additions and 43 deletions.
45 changes: 37 additions & 8 deletions changelogs/changelogs.py
@@ -16,6 +16,8 @@
 
 GITHUB_API_TOKEN = os.environ.get("CHANGELOGS_GITHUB_API_TOKEN", False)
 
+CHARS_LIMIT = int(os.environ.get("CHARS_LIMIT", 2000000))  # default 2,000,000 chars (~2 MB at one byte per char)
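The limit is read from the environment once, at import time, so overriding it has to happen before the first import. A minimal sketch (the value is illustrative):

import os
os.environ["CHARS_LIMIT"] = "500000"  # hypothetical 500,000-char cap; set before importing

import changelogs  # CHARS_LIMIT is evaluated when changelogs.changelogs first loads
log = changelogs.get("requests")  # uses the overridden default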


 def _load_custom_functions(vendor, name):
     """
@@ -147,12 +149,13 @@ def check_switch_vendor(old_vendor, name, urls, _depth=0):
     return "", ""
 
 
-def get(name, vendor="pypi", functions={}, _depth=0):
+def get(name, vendor="pypi", functions={}, chars_limit=CHARS_LIMIT, _depth=0):
     """
     Tries to find a changelog for the given package.
     :param name: str, package name
     :param vendor: str, vendor
     :param functions: dict, custom functions
+    :param chars_limit: int, max number of chars to load per changelog entry
     :return: dict, changelog
     """
     fns = _bootstrap_functions(name=name, vendor=vendor, functions=functions)
@@ -173,7 +176,7 @@ def get(name, vendor="pypi", functions={}, _depth=0):
     )
 
     # load the content from the given urls and parse the changelog
-    content = fns["get_content"](session=session, urls=urls)
+    content = fns["get_content"](session=session, urls=urls, chars_limit=chars_limit)
     changelog = fns["parse"](
         name=name,
         content=content,
@@ -217,14 +220,36 @@ def get_commit_log(name, vendor='pypi', functions={}, _depth=0):
     )
 
 
-def get_content(session, urls):
+def get_limited_content_entry(session, url, chars_limit):
+    """
+    Loads the content for a URL entry, up to chars_limit characters.
+    :param session: requests Session instance
+    :param url: str, URL
+    :param chars_limit: int, max number of chars to load
+    :return: str, limited content
+    """
+    limited_content = ""
+    with session.get(url, stream=True) as resp:
+        if resp.status_code == 200:
+            try:
+                # Avoid https://github.com/psf/requests/issues/3359
+                if not resp.encoding:
+                    resp.encoding = 'utf-8'
+                limited_content = next(resp.iter_content(chunk_size=chars_limit,
+                                                         decode_unicode=True))
+            except StopIteration:
+                pass
+    return limited_content
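For illustration, a minimal sketch of calling the new helper directly (the URL is hypothetical; the session is a plain requests.Session, matching what the module passes in):

import requests

session = requests.Session()
# Streams the response and keeps only the first chunk: roughly chars_limit
# bytes are read and decoded to text, instead of downloading the whole body.
snippet = get_limited_content_entry(session, "https://example.com/CHANGELOG.md", 1000)
print(len(snippet))  # bounded by the limit, regardless of the real file size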


+def get_content(session, urls, chars_limit):
     """
     Loads the content from URLs, ignoring connection errors.
     :param session: requests Session instance
     :param urls: list, str URLs
+    :param chars_limit: int, max number of chars to load per changelog entry
     :return: str, content
     """
 
     content = ""
     for url in urls:
         try:
@@ -235,7 +260,6 @@ def get_content(session, urls):
                     logger.warning("Fetching release pages requires CHANGELOGS_GITHUB_API_TOKEN "
                                    "to be set")
                     continue
-
                 page = 0
                 exist_pages = True
                 headers = {
@@ -245,17 +269,22 @@ def get_content(session, urls):
                 while exist_pages:
                     resp = session.get(url, headers=headers, params={'page': page})
                     if resp.status_code == 200 and len(resp.json()) > 0:
+                        # These entries are limited by GitHub to 125 kB max
                         for item in resp.json():
                             if 'tag_name' in item and 'body' in item:
                                 content += "\n\n{}\n{}".format(item['tag_name'], item["body"])
                     else:
                         exist_pages = False
 
                     page += 1
 
             else:
                 resp = session.get(url)
                 if resp.status_code == 200:
-                    content += "\n\n" + resp.text
+                    content += "\n\n" + get_limited_content_entry(session, url, chars_limit)
+
+            # To avoid exceeding the content limit by accumulation
+            if len(content) > chars_limit:
+                break
 
         except requests.ConnectionError:
             pass
     return content
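A usage sketch under assumed inputs (both URLs hypothetical): each entry is individually capped by get_limited_content_entry, and the loop breaks once the accumulated content passes chars_limit, so the result can overshoot the limit by at most one capped entry plus separators:

import requests

session = requests.Session()
urls = [
    "https://example.com/CHANGELOG.md",  # hypothetical
    "https://example.com/HISTORY.rst",   # hypothetical
]
content = get_content(session, urls, chars_limit=2000)
# content may exceed 2000 slightly: the check runs after an entry is appended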
35 changes: 19 additions & 16 deletions changelogs/custom/pypi/newrelic.py
@@ -1,4 +1,5 @@
 from lxml import etree
+from changelogs.changelogs import get_limited_content_entry
 import sys
 
 
@@ -12,25 +13,27 @@ def get_urls(releases, **kwargs):
     return urls, set()
 
 
-def get_content(session, urls):
+def get_content(session, urls, chars_limit):
     log = ""
     for url in urls:
-        r = session.get(url)
-        if r.status_code == 200:
-            root = etree.HTML(r.content)
-            try:
-                article = root.xpath("//article/div[@class='content']")[0]
-                content = etree.tostring(article, method="text", encoding='utf-8')
-                if sys.version_info > (3, 0):
-                    content = content.decode("utf-8")
-                # remove first two lines
-                content = '\n'.join(content.split('\n')[2:-1])
-                log += "# {version}\n{content}\n\n".format(
-                    version=url.split("-")[-1],
-                    content=content,
+        limited_content_entry = get_limited_content_entry(session, url, chars_limit)
+        if limited_content_entry:
+            root = etree.HTML(limited_content_entry)
+        else:
+            continue
+        try:
+            article = root.xpath("//article/div[@class='content']")[0]
+            content = etree.tostring(article, method="text", encoding='utf-8')
+            if sys.version_info > (3, 0):
+                content = content.decode("utf-8")
+            # remove first two lines
+            content = '\n'.join(content.split('\n')[2:-1])
+            log += "# {version}\n{content}\n\n".format(
+                version=url.split("-")[-1],
+                content=content,
+            )
+        except IndexError:
+            pass
-            except IndexError:
-                pass
     return log
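The lxml extraction above can be exercised in isolation; a hedged sketch with inline HTML standing in for a fetched release-notes page:

from lxml import etree

html = "<article><div class='content'>title\nsubtitle\n- fixed a bug\n- improved docs\ntrailer</div></article>"
root = etree.HTML(html)
article = root.xpath("//article/div[@class='content']")[0]
text = etree.tostring(article, method="text", encoding="utf-8").decode("utf-8")
# drop the first two lines and the trailing one, as the parser above does
body = "\n".join(text.split("\n")[2:-1])  # "- fixed a bug\n- improved docs"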


25 changes: 14 additions & 11 deletions changelogs/custom/pypi/pbr.py
@@ -1,5 +1,6 @@
 from __future__ import unicode_literals
 from lxml import etree
+from changelogs.changelogs import get_limited_content_entry
 
 
 def get_urls(releases, **kwargs):
@@ -8,18 +9,20 @@ def get_urls(releases, **kwargs):
     }, set()
 
 
-def get_content(session, urls):
+def get_content(session, urls, chars_limit):
     log = ""
     for url in urls:
-        r = session.get(url)
-        if r.status_code == 200:
-            root = etree.HTML(r.content)
-            for item in root.xpath("//div[@class='section']"):
-                try:
-                    log += "{version}\n{content}\n\n".format(
-                        version=item.xpath("h3/text()")[0],
-                        content="\n".join(["- {}".format(li) for li in item.xpath("ul/li/text()")])
+        limited_content_entry = get_limited_content_entry(session, url, chars_limit)
+        if limited_content_entry:
+            root = etree.HTML(limited_content_entry)
+        else:
+            continue
+        for item in root.xpath("//div[@class='section']"):
+            try:
+                log += "{version}\n{content}\n\n".format(
+                    version=item.xpath("h3/text()")[0],
+                    content="\n".join(["- {}".format(li) for li in item.xpath("ul/li/text()")])
+                )
+            except IndexError:
+                pass
-                except IndexError:
-                    pass
     return log
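Similarly, the pbr parser's version/bullet assembly can be sketched against a minimal section (the HTML shape is assumed from the docs page it scrapes):

from lxml import etree

html = "<div class='section'><h3>1.2.3</h3><ul><li>first fix</li><li>second fix</li></ul></div>"
root = etree.HTML(html)
item = root.xpath("//div[@class='section']")[0]
version = item.xpath("h3/text()")[0]  # "1.2.3"
bullets = "\n".join(["- {}".format(li) for li in item.xpath("ul/li/text()")])
# bullets == "- first fix\n- second fix"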
8 changes: 4 additions & 4 deletions changelogs/custom/pypi/uwsgi.py
@@ -1,4 +1,5 @@
 import requests
+from changelogs.changelogs import get_limited_content_entry
 
 
 def get_urls(releases, **kwargs):
@@ -10,14 +11,13 @@ def get_urls(releases, **kwargs):
     return urls, set()
 
 
-def get_content(session, urls):
+def get_content(session, urls, chars_limit):
     content = {}
     for url in urls:
         v = url.rsplit("-", 1)[1].rsplit(".", 1)[0]
         try:
-            resp = session.get(url)
-            if resp.status_code == 200:
-                content[v] = resp.text.split("\n", 2)[-1]
+            resp_text = get_limited_content_entry(session, url, chars_limit)
+            content[v] = resp_text.split("\n", 2)[-1]
         except requests.ConnectionError:
             pass
     return content
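The version key is derived purely from the URL; a quick check of that rsplit chain (the URL shape is an assumption based on how get_urls builds it):

url = "https://raw.githubusercontent.com/unbit/uwsgi-docs/master/Changelog-2.0.21.rst"  # assumed shape
v = url.rsplit("-", 1)[1].rsplit(".", 1)[0]
# rsplit("-", 1)[1] -> "2.0.21.rst"; rsplit(".", 1)[0] -> "2.0.21"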
10 changes: 6 additions & 4 deletions changelogs/launchpad.py
@@ -52,17 +52,19 @@ def get_urls(session, name, data, find_changelogs_fn, **kwargs):
     return {"https://api.launchpad.net/1.0/{}/releases".format(name)}, set()
 
 
-def get_content(session, urls):
+def get_content(session, urls, chars_limit):
     """
     Loads the content from URLs, ignoring connection errors.
     :param session: requests Session instance
     :param urls: list, str URLs
+    :param chars_limit: int, max number of chars to load per changelog entry
     :return: str, content
     """
     for url in urls:
-        resp = session.get(url)
-        if resp.ok:
-            return resp.json()
+        with session.get(url, stream=True) as resp:
+            # Skip parsing when the body exceeds the limit: truncated JSON would not parse
+            if resp.ok and int(resp.headers.get('content-length', 0)) < chars_limit:
+                return resp.json()
     return {}
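A hedged sketch of the header check in isolation: Content-Length is inspected before any body is read, which is why the request is streamed. The header can be absent (e.g. chunked responses), in which case the size defaults to 0 here:

import requests

session = requests.Session()
with session.get("https://api.launchpad.net/1.0/pbr/releases", stream=True) as resp:  # example project
    size = int(resp.headers.get("content-length", 0))
    if resp.ok and size < 2000000:
        data = resp.json()  # only parsed when the declared size is within the limit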


5 changes: 5 additions & 0 deletions tests/test_changelogs.py
@@ -21,6 +21,11 @@ def session():
         return betamax_session
     monkeypatch.setattr("changelogs.changelogs.Session", session)
 
 
+def test_get_max_chars():
+    log = changelogs.get("requests", chars_limit=200)
+    assert len(str(log)) < 200 * 4  # streamed chars may be up to 4 bytes each after encoding
+
+
 def test_django_registration_redux():
     log = changelogs.get("django-registration-redux")

Large diffs are not rendered by default.
