Added python_docs_urls and tests

Added a function that uses a regex to search a string for URLs matching docs.python.org/... It is used in the same way as the stackoverflow_urls function, but it simply returns a list of URL strings.
sopython · Nov 29, 2014 · 4cc6e6d · 4cc6e6d
1 parent 3aa0b0d
commit 4cc6e6d
Show file tree

Hide file tree

Showing 2 changed files with 69 additions and 7 deletions.
diff --git a/nidaba/features/_util/question.py b/nidaba/features/_util/question.py
@@ -104,19 +104,18 @@ def stackoverflow_urls(s):
              etc and the values are lists of url strings.
     """
 
-
-    flags = [re.IGNORECASE]
+    flags = re.IGNORECASE
 
     base = r'(?:https?://)?(?:www\.)?stackoverflow\.com/'
     q_string = r'({}q(?:uestions)?/(?:\d+)(?:/[\w-]+)?/?)'
     a_string = r'({}(?:questions/(?:\d+)/[\w-]*?/(?:\d+)(?:#\S+)?|a/(?:\d+)?/?(?:\d+)?))'
     c_string = r'({}q(?:uestions)?/(?:\d+)(?:/[\w-]+)?#comment(?:\d+)_(?:\d+))'
     u_string = r'({}u(?:sers)?/(?:\d+)/?(?:\w+)?)'
 
-    q_regex = re.compile(q_string.format(base), *flags)
-    a_regex = re.compile(a_string.format(base), *flags)
-    c_regex = re.compile(c_string.format(base), *flags)
-    u_regex = re.compile(u_string.format(base), *flags)
+    q_regex = re.compile(q_string.format(base), flags)
+    a_regex = re.compile(a_string.format(base), flags)
+    c_regex = re.compile(c_string.format(base), flags)
+    u_regex = re.compile(u_string.format(base), flags)
 
     # Have to be ran in a certain order to add the beginning index of the comments and answers urls.
     # Otherwise the questions url will match them accidentally. As such, we add the starting index
@@ -140,3 +139,22 @@ def stackoverflow_urls(s):
                 matches.add(m.start(0))
 
     return result
+
+
+def python_docs_urls(s):
+    """
+    Find urls that point at the Python docs (docs.python.org) inside a string.
+    :param s: Input string
+    :return: List of url strings in order of appearance, without trailing full stops
+    """
+
+    flags = re.IGNORECASE
+    # The dots of the host name are escaped so they only match a literal '.', not any character.
+    pattern = r"(?:https?://)?docs\.python\.org/?[\d\.]*/?[\w\.]*/?[\w\.]*[\w#.]*"
+    regex = re.compile(pattern, flags)
+    # Consider a url at the end of a sentence. The regex will inadvertently return the full stop (period)
+    # at the end of the url, even though it isn't part of the url. As such a list comp is used to rstrip
+    # full stops from the end of the strings.
+    result = [i.rstrip('.') for i in regex.findall(s)]
+
+    return result
diff --git a/nidaba/features/test/test_questions_util.py b/nidaba/features/test/test_questions_util.py
@@ -161,4 +161,48 @@ def test_stackoverflow_urls():
     # Testing multiple questions in the same string.
     d = question.stackoverflow_urls(' '.join(questions))
 
-    assert len(d['questions']) == 5
+    assert len(d['questions']) == 5
+
+def test_python_docs_urls():
+    """
+    Test the python_docs_urls function, which extracts urls to the Python documentation from a string
+    :return: None
+    """
+
+    empty = []
+
+    assert question.python_docs_urls('') == empty  # Empty string
+    assert question.python_docs_urls("some short sentence that\nreally shouldn't match") == empty  # No url
+    assert question.python_docs_urls('I love http://www.google.co.uk') == empty  # Non-matching url
+    assert question.python_docs_urls('http://stackoverflow.com/questions/tagged/python') == empty  # Non-matching SO url
+
+    # Various urls from docs.python.org; each one, passed alone, must come back unchanged as the first match
+    urls = ['docs.python.org',
+            'https://docs.python.org/3.2',
+            'http://docs.python.org/3.3',
+            'https://docs.python.org/3/index.html',
+            'https://docs.python.org/3/whatsnew/3.4.html',
+            'https://docs.python.org/3/library/functions.html',
+            'https://docs.python.org/2.6/library/functions.html#eval',
+            'https://docs.python.org/2.7/library/csv.html#csv.Error',
+            'https://docs.python.org/3.5/',
+            'https://docs.python.org/3/library/stdtypes.html#class.__mro__']
+
+    for url in urls:
+        print(url)
+        result = question.python_docs_urls(url)
+        print(result)
+        assert result[0] == url
+
+    s = """This is going to be a very long string! I love the Python docs at https://docs.python.org. I particularly
+           like the doc for https://docs.python.org/3/library/stdtypes.html#class.__mro__. Though you should also check
+           what is new in Python 3.5 here https://docs.python.org/3.5/whatsnew/3.5.html"""
+
+    result = question.python_docs_urls(s)
+
+    urls = ['https://docs.python.org',
+            'https://docs.python.org/3/library/stdtypes.html#class.__mro__',
+            'https://docs.python.org/3.5/whatsnew/3.5.html']
+
+    for url in urls:
+        assert url in result  # urls that end a sentence must be returned without the trailing full stop