Skip to content

Commit

Permalink
Added python_docs_urls and tests
Browse files Browse the repository at this point in the history
Added a function that uses a regex to search a string for urls matching docs.python.org/... It is similar in use to the stackoverflow_urls function, though it simply returns a list of url strings.
  • Loading branch information
Keiron Pizzey committed Nov 29, 2014
1 parent 3aa0b0d commit 4cc6e6d
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 7 deletions.
30 changes: 24 additions & 6 deletions nidaba/features/_util/question.py
Expand Up @@ -104,19 +104,18 @@ def stackoverflow_urls(s):
etc and the values are lists of url strings.
"""


flags = [re.IGNORECASE]
flags = re.IGNORECASE

base = r'(?:https?://)?(?:www\.)?stackoverflow\.com/'
q_string = r'({}q(?:uestions)?/(?:\d+)(?:/[\w-]+)?/?)'
a_string = r'({}(?:questions/(?:\d+)/[\w-]*?/(?:\d+)(?:#\S+)?|a/(?:\d+)?/?(?:\d+)?))'
c_string = r'({}q(?:uestions)?/(?:\d+)(?:/[\w-]+)?#comment(?:\d+)_(?:\d+))'
u_string = r'({}u(?:sers)?/(?:\d+)/?(?:\w+)?)'

q_regex = re.compile(q_string.format(base), *flags)
a_regex = re.compile(a_string.format(base), *flags)
c_regex = re.compile(c_string.format(base), *flags)
u_regex = re.compile(u_string.format(base), *flags)
q_regex = re.compile(q_string.format(base), flags)
a_regex = re.compile(a_string.format(base), flags)
c_regex = re.compile(c_string.format(base), flags)
u_regex = re.compile(u_string.format(base), flags)

# Have to be run in a certain order to add the beginning index of the comments and answers urls.
# Otherwise the questions url will match them accidentally. As such, we add the starting index
Expand All @@ -140,3 +139,22 @@ def stackoverflow_urls(s):
matches.add(m.start(0))

return result


def python_docs_urls(s):
    """
    Find urls that match the Python docs inside a string.

    Matching is case-insensitive and the ``http://`` / ``https://`` scheme is optional. After the
    ``docs.python.org`` host the pattern accepts an optional version number, up to two path
    components and an optional ``#fragment``. Trailing full stops are stripped from every match.

    :param s: Input string
    :return: List of urls (strings) in the order they appear in ``s``; empty list if none are found
    """

    flags = re.IGNORECASE
    # The dots in the host name are escaped so that they only match a literal '.'. Left unescaped,
    # '.' matches any character and look-alike text such as "docsxpythonyorg" would be returned.
    pattern = r"(?:https?://)?docs\.python\.org/?[\d\.]*/?[\w\.]*/?[\w\.]*[\w#.]*"
    regex = re.compile(pattern, flags)

    # Consider a url at the end of a sentence. The regex will inadvertently return the full stop (period)
    # at the end of the url, even though it isn't part of the url. As such a list comp is used to rstrip
    # full stops from the end of the strings.
    result = [i.rstrip('.') for i in regex.findall(s)]

    return result
46 changes: 45 additions & 1 deletion nidaba/features/test/test_questions_util.py
Expand Up @@ -161,4 +161,48 @@ def test_stackoverflow_urls():
# Testing multiple questions in the same string.
d = question.stackoverflow_urls(' '.join(questions))

assert len(d['questions']) == 5
assert len(d['questions']) == 5

def test_python_docs_urls():
    """
    Exercise question.python_docs_urls, which extracts urls to the Python documentation from a string.
    :return: None
    """

    # Inputs without any docs.python.org url must produce an empty list.
    non_matching = (
        '',  # Empty string
        "some short sentence that\nreally shouldn't match",  # No url
        'I love http://www.google.co.uk',  # Non-matching url
        'http://stackoverflow.com/questions/tagged/python',  # Non-matching SO url
    )
    for text in non_matching:
        assert question.python_docs_urls(text) == []

    # Each of these docs.python.org urls, passed on its own, must come back as the first match.
    matching = [
        'docs.python.org',
        'https://docs.python.org/3.2',
        'http://docs.python.org/3.3',
        'https://docs.python.org/3/index.html',
        'https://docs.python.org/3/whatsnew/3.4.html',
        'https://docs.python.org/3/library/functions.html',
        'https://docs.python.org/2.6/library/functions.html#eval',
        'https://docs.python.org/2.7/library/csv.html#csv.Error',
        'https://docs.python.org/3.5/',
        'https://docs.python.org/3/library/stdtypes.html#class.__mro__',
    ]
    for candidate in matching:
        print(candidate)
        found = question.python_docs_urls(candidate)
        print(found)
        assert found[0] == candidate

    # Several urls embedded in running text, two of them directly followed by a full stop.
    s = """This is going to be a very long string! I love the Python docs at https://docs.python.org. I particularly
like the doc for https://docs.python.org/3/library/stdtypes.html#class.__mro__. Though you should also check
what is new in Python 3.5 here https://docs.python.org/3.5/whatsnew/3.5.html"""

    found = question.python_docs_urls(s)

    expected = (
        'https://docs.python.org',
        'https://docs.python.org/3/library/stdtypes.html#class.__mro__',
        'https://docs.python.org/3.5/whatsnew/3.5.html',
    )
    for candidate in expected:
        assert candidate in found

0 comments on commit 4cc6e6d

Please sign in to comment.