diff --git a/linehaul/events/parser.py b/linehaul/events/parser.py index 7c249d5..47d3504 100644 --- a/linehaul/events/parser.py +++ b/linehaul/events/parser.py @@ -25,7 +25,7 @@ from pyparsing import printables as _printables, restOfLine from pyparsing import ParseException -from linehaul.ua import parser as user_agents +from linehaul.ua import UserAgent, parser as user_agents logger = logging.getLogger(__name__) @@ -152,7 +152,7 @@ class Download: tls_protocol = attr.ib(type=Optional[str], default=None) tls_cipher = attr.ib(type=Optional[str], default=None) country_code = attr.ib(type=Optional[str], default=None) - details = attr.ib(type=Optional[user_agents.UserAgent], default=None) + details = attr.ib(type=Optional[UserAgent], default=None) def _value_or_none(value): diff --git a/linehaul/ua/__init__.py b/linehaul/ua/__init__.py index 164f68b..7beff50 100644 --- a/linehaul/ua/__init__.py +++ b/linehaul/ua/__init__.py @@ -9,3 +9,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from linehaul.ua.datastructures import UserAgent + + +__all__ = ["UserAgent"] diff --git a/linehaul/ua/datastructures.py b/linehaul/ua/datastructures.py new file mode 100644 index 0000000..8d93244 --- /dev/null +++ b/linehaul/ua/datastructures.py @@ -0,0 +1,65 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional + +import attr + + +@attr.s(slots=True, frozen=True) +class Installer: + + name = attr.ib(type=Optional[str], default=None) + version = attr.ib(type=Optional[str], default=None) + + +@attr.s(slots=True, frozen=True) +class Implementation: + + name = attr.ib(type=Optional[str], default=None) + version = attr.ib(type=Optional[str], default=None) + + +@attr.s(slots=True, frozen=True) +class LibC: + + lib = attr.ib(type=Optional[str], default=None) + version = attr.ib(type=Optional[str], default=None) + + +@attr.s(slots=True, frozen=True) +class Distro: + + name = attr.ib(type=Optional[str], default=None) + version = attr.ib(type=Optional[str], default=None) + id = attr.ib(type=Optional[str], default=None) + libc = attr.ib(type=Optional[LibC], default=None) + + +@attr.s(slots=True, frozen=True) +class System: + + name = attr.ib(type=Optional[str], default=None) + release = attr.ib(type=Optional[str], default=None) + + +@attr.s(slots=True, frozen=True) +class UserAgent: + + installer = attr.ib(type=Optional[Installer], default=None) + python = attr.ib(type=Optional[str], default=None) + implementation = attr.ib(type=Optional[Implementation], default=None) + distro = attr.ib(type=Optional[Distro], default=None) + system = attr.ib(type=Optional[System], default=None) + cpu = attr.ib(type=Optional[str], default=None) + openssl_version = attr.ib(type=Optional[str], default=None) + setuptools_version = attr.ib(type=Optional[str], default=None) diff --git a/linehaul/ua/impl.py b/linehaul/ua/impl.py new file mode 100644 index 0000000..68cbffb --- /dev/null +++ b/linehaul/ua/impl.py @@ -0,0 +1,133 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import abc +import logging +import re + + +logger = logging.getLogger(__name__) + + +class UnableToParse(Exception): + pass + + +class UserAgentParser(metaclass=abc.ABCMeta): + @property + @abc.abstractmethod + def name(self): + """ + Returns the name of this parser, useful for things like logging etc. + """ + + @abc.abstractmethod + def __call__(self, ua): + """ + Actually parses the user agent, returning a dictionary containing all of the + relevant parsed information. If this method is unable to parse the user agent + then it can raise a ``UnableToParse`` exception to indicate that it can't parse + the given UA. + """ + + +class CallbackUserAgentParser(UserAgentParser): + def __init__(self, callback, *, name=None): + if name is None: + name = callback.__name__ + + self._callback = callback + self._name = name + + @property + def name(self): + return self._name + + def __call__(self, ua): + return self._callback(ua) + + +def ua_parser(fn): + return CallbackUserAgentParser(fn) + + +class RegexUserAgentParser(UserAgentParser): + def __init__(self, regexes, handler, *, name=None): + if name is None: + name = handler.__name__ + + self._regexes = [ + re.compile(regex) if isinstance(regex, str) else regex for regex in regexes + ] + self._handler = handler + self._name = name + + @property + def name(self): + return self._name + + def __call__(self, user_agent): + for regex in self._regexes: + matched = regex.search(user_agent) + + # If we've matched this particular regex, then we'll break the loop here and + # go onto finishing parsing.
+ if matched is not None: + break + else: + # None of our regexes matched. + raise UnableToParse + + # We need to build up the args, and kwargs of our function, we call any unnamed + # group an arg, and pass them in, in order, and we call any named group a kwarg + # and we pass them in by name. + group_to_name = {v: k for k, v in matched.re.groupindex.items()} + args, kwargs = [], {} + for i, value in enumerate(matched.groups(), start=1): + name = group_to_name.get(i) + if name is not None: + kwargs[name] = value + else: + args.append(value) + + # Finally, we'll call our handler with our parsed arguments, and return whatever + # result it gives us. + return self._handler(*args, **kwargs) + + +def regex_ua_parser(*regexes): + def deco(fn): + return RegexUserAgentParser(regexes, fn) + + return deco + + +class ParserSet: + def __init__(self): + self._parsers = set() + + def register(self, parser): + self._parsers.add(parser) + return parser + + def __call__(self, user_agent): + for parser in self._parsers: + try: + return parser(user_agent) + except UnableToParse: + pass + except Exception: + logger.error( + "Error parsing %r as a %s.", user_agent, parser.name, exc_info=True + ) + + raise UnableToParse diff --git a/linehaul/ua/parser.py b/linehaul/ua/parser.py index 7bf88fd..f220614 100644 --- a/linehaul/ua/parser.py +++ b/linehaul/ua/parser.py @@ -14,15 +14,14 @@ import logging import re -from typing import Optional - -import attr -import attr.validators import cattr import packaging.version from packaging.specifiers import SpecifierSet +from linehaul.ua.datastructures import UserAgent +from linehaul.ua.impl import ParserSet, UnableToParse, ua_parser, regex_ua_parser + logger = logging.getLogger(__name__) @@ -31,433 +30,283 @@ class UnknownUserAgentError(ValueError): pass -@attr.s(slots=True, frozen=True) -class Installer: - - name = attr.ib(type=Optional[str], default=None) - version = attr.ib(type=Optional[str], default=None) - - -@attr.s(slots=True, frozen=True) 
-class Implementation: - - name = attr.ib(type=Optional[str], default=None) - version = attr.ib(type=Optional[str], default=None) - - -@attr.s(slots=True, frozen=True) -class LibC: - - lib = attr.ib(type=Optional[str], default=None) - version = attr.ib(type=Optional[str], default=None) - - -@attr.s(slots=True, frozen=True) -class Distro: - - name = attr.ib(type=Optional[str], default=None) - version = attr.ib(type=Optional[str], default=None) - id = attr.ib(type=Optional[str], default=None) - libc = attr.ib(type=Optional[LibC], default=None) - - -@attr.s(slots=True, frozen=True) -class System: +# Note: This is a ParserSet, not a ParserList, parsers that have been registered with +# it may be called in any order. That means that all of our parsers need to be +# ordering independent. +_parser = ParserSet() - name = attr.ib(type=Optional[str], default=None) - release = attr.ib(type=Optional[str], default=None) +@_parser.register +@ua_parser +def Pip6UserAgent(user_agent): + # We're only concerned about pip user agents. + if not user_agent.startswith("pip/"): + raise UnableToParse -@attr.s(slots=True, frozen=True) -class UserAgent: + # This format was brand new in pip 6.0, so we'll need to restrict it + # to only versions of pip newer than that. 
+ version_str = user_agent.split()[0].split("/", 1)[1] + version = packaging.version.parse(version_str) + if version not in SpecifierSet(">=6", prereleases=True): + raise UnableToParse - installer = attr.ib(type=Optional[Installer], default=None) - python = attr.ib(type=Optional[str], default=None) - implementation = attr.ib(type=Optional[Implementation], default=None) - distro = attr.ib(type=Optional[Distro], default=None) - system = attr.ib(type=Optional[System], default=None) - cpu = attr.ib(type=Optional[str], default=None) - openssl_version = attr.ib(type=Optional[str], default=None) - setuptools_version = attr.ib(type=Optional[str], default=None) + try: + return json.loads(user_agent.split(maxsplit=1)[1]) + except (json.JSONDecodeError, UnicodeDecodeError, IndexError): + raise UnableToParse from None -class Parser: - @staticmethod - def pip_6_format(user_agent): - # We're only concerned about pip user agents. - if not user_agent.startswith("pip/"): - return - - # This format was brand new in pip 6.0, so we'll need to restrict it - # to only versions of pip newer than that. - version_str = user_agent.split()[0].split("/", 1)[1] - version = packaging.version.parse(version_str) - if version not in SpecifierSet(">=6", prereleases=True): - return - - try: - return json.loads(user_agent.split(maxsplit=1)[1]) - except json.JSONDecodeError: - return - - @staticmethod - def pip_1_4_format(user_agent): - # We're only concerned about pip user agents. 
- if not user_agent.startswith("pip/"): - return - - # This format was brand new in pip 1.4, and went away in pip 6.0, so - # we'll need to restrict it to only versions of pip between 1.4 and 6.0 - version_str = user_agent.split()[0].split("/", 1)[1] - version = packaging.version.parse(version_str) - if version not in SpecifierSet(">=1.4,<6", prereleases=True): - return - - _, impl, system = user_agent.split(maxsplit=2) - - data = { - "installer": {"name": "pip", "version": version_str}, - "implementation": {"name": impl.split("/", 1)[0]}, - } - - if not impl.endswith("/Unknown"): - data["implementation"]["version"] = impl.split("/", 1)[1] - - if not system.startswith("Unknown/"): - data.setdefault("system", {})["name"] = system.split("/", 1)[0] - - if not system.endswith("/Unknown"): - data.setdefault("system", {})["release"] = system.split("/", 1)[1] - - if data["implementation"]["name"].lower() == "cpython" and data[ - "implementation" - ].get("version"): - data["python"] = data["implementation"]["version"] - - return data - - _distribute_re = re.compile( - r"^Python-urllib/(?P<python>\d\.\d) distribute/(?P<version>\S+)$" +@_parser.register +@regex_ua_parser( + ( + r"^pip/(?P<version>\S+) (?P<impl_name>\S+)/(?P<impl_version>\S+) " + r"(?P<system_name>\S+)/(?P<system_release>\S+)$" ) +) +def Pip1_4UserAgent(*, version, impl_name, impl_version, system_name, system_release): + # This format was brand new in pip 1.4, and went away in pip 6.0, so + # we'll need to restrict it to only versions of pip between 1.4 and 6.0.
+ if version not in SpecifierSet(">=1.4,<6", prereleases=True): + raise UnableToParse - @classmethod - def distribute_format(cls, user_agent): - m = cls._distribute_re.search(user_agent) - if m is None: - return + data = {"installer": {"name": "pip", "version": version}} - return { - "installer": {"name": "distribute", "version": m.group("version")}, - "python": m.group("python"), - } + if impl_name.lower() != "unknown": + data.setdefault("implementation", {})["name"] = impl_name - _setuptools_re = re.compile( - r"^Python-urllib/(?P<python>\d\.\d) setuptools/(?P<version>\S+)$" - ) + if impl_version.lower() != "unknown": + data.setdefault("implementation", {})["version"] = impl_version - _setuptools_new_re = re.compile( - r"^setuptools/(?P<version>\S+) Python-urllib/(?P<python>\d\.\d)$" - ) + if system_name.lower() != "unknown": + data.setdefault("system", {})["name"] = system_name - @classmethod - def setuptools_format(cls, user_agent): - m = cls._setuptools_re.search(user_agent) - if m is None: - m = cls._setuptools_new_re.search(user_agent) - if m is None: - return + if system_release.lower() != "unknown": + data.setdefault("system", {})["release"] = system_release - return { - "installer": {"name": "setuptools", "version": m.group("version")}, - "python": m.group("python"), - } + if impl_name.lower() == "cpython": + data["python"] = impl_version - _pex_re = re.compile(r"pex/(?P<version>\S+)$") + return data - @classmethod - def pex_format(cls, user_agent): - m = cls._pex_re.search(user_agent) - if m is None: - return - return {"installer": {"name": "pex", "version": m.group("version")}} +@_parser.register +@regex_ua_parser(r"^Python-urllib/(?P<python>\d\.\d) distribute/(?P<version>\S+)$") +def DistributeUserAgent(*, python, version): + return {"installer": {"name": "distribute", "version": version}, "python": python} - _conda_re = re.compile(r"^conda/(?P<version>\S+)(?: .+)?$") - @classmethod - def conda_format(cls, user_agent): - m = cls._conda_re.search(user_agent) - if m is None: - return +@_parser.register +@regex_ua_parser(
+ r"^Python-urllib/(?P<python>\d\.\d) setuptools/(?P<version>\S+)$", + r"^setuptools/(?P<version>\S+) Python-urllib/(?P<python>\d\.\d)$", +) +def SetuptoolsUserAgent(*, python, version): + return {"installer": {"name": "setuptools", "version": version}, "python": python} - return {"installer": {"name": "conda", "version": m.group("version")}} - _bazel_re = re.compile(r"^Bazel/(?P<version>.+)$") +@_parser.register +@regex_ua_parser(r"pex/(?P<version>\S+)$") +def PexUserAgent(*, version): + return {"installer": {"name": "pex", "version": version}} - @classmethod - def bazel_format(cls, user_agent): - m = cls._bazel_re.search(user_agent) - if m is None: - return - version = m.group("version") - if version.startswith("release "): - version = version[8:] +@_parser.register +@regex_ua_parser(r"^conda/(?P<version>\S+)(?: .+)?$") +def CondaUserAgent(*, version): + return {"installer": {"name": "conda", "version": version}} - return {"installer": {"name": "Bazel", "version": version}} - _bandersnatch_re = re.compile(r"^bandersnatch/(?P<version>\S+) \(.+\)$") +@_parser.register +@regex_ua_parser(r"^Bazel/(?:release\s+)?(?P<version>.+)$") +def BazelUserAgent(*, version): + return {"installer": {"name": "Bazel", "version": version}} - @classmethod - def bandersnatch_format(cls, user_agent): - m = cls._bandersnatch_re.search(user_agent) - if m is None: - return - return {"installer": {"name": "bandersnatch", "version": m.group("version")}} +@_parser.register +@regex_ua_parser(r"^bandersnatch/(?P<version>\S+) \(.+\)$") +def BandersnatchUserAgent(*, version): + return {"installer": {"name": "bandersnatch", "version": version}} - _devpi_re = re.compile(r"devpi-server/(?P<version>\S+) \(.+\)$") - @classmethod - def devpi_format(cls, user_agent): - m = cls._devpi_re.search(user_agent) - if m is None: - return +@_parser.register +@regex_ua_parser(r"devpi-server/(?P<version>\S+) \(.+\)$") +def DevPIUserAgent(*, version): + return {"installer": {"name": "devpi", "version": version}} - return {"installer": {"name": "devpi", "version": m.group("version")}} - _z3c_pypimirror_re =
re.compile(r"^z3c\.pypimirror/(?P<version>\S+)$") +@_parser.register +@regex_ua_parser(r"^z3c\.pypimirror/(?P<version>\S+)$") +def Z3CPyPIMirrorUserAgent(*, version): + return {"installer": {"name": "z3c.pypimirror", "version": version}} - @classmethod - def z3c_pypimirror_format(cls, user_agent): - m = cls._z3c_pypimirror_re.search(user_agent) - if m is None: - return - return {"installer": {"name": "z3c.pypimirror", "version": m.group("version")}} +@_parser.register +@regex_ua_parser(r"^Artifactory/(?P<version>\S+)$") +def ArtifactoryUserAgent(*, version): + return {"installer": {"name": "Artifactory", "version": version}} - _artifactory_re = re.compile(r"^Artifactory/(?P<version>\S+)$") - @classmethod - def artifactory_format(cls, user_agent): - m = cls._artifactory_re.search(user_agent) - if m is None: - return +@_parser.register +@regex_ua_parser(r"^Nexus/(?P<version>\S+)") +def NexusUserAgent(*, version): + return {"installer": {"name": "Nexus", "version": version}} - return {"installer": {"name": "Artifactory", "version": m.group("version")}} - _nexus_re = re.compile(r"^Nexus/(?P<version>\S+)") +@_parser.register +@regex_ua_parser(r"^pep381client(?:-proxy)?/(?P<version>\S+)$") +def PEP381ClientUserAgent(*, version): + return {"installer": {"name": "pep381client", "version": version}} - @classmethod - def nexus_format(cls, user_agent): - m = cls._nexus_re.search(user_agent) - if m is None: - return - return {"installer": {"name": "Nexus", "version": m.group("version")}} +# TODO: We should probably consider not parsing this specially, and moving it to +# just the same as we treat browsers, since we don't really know anything +# about it-- including whether or not the version of Python mentioned is +# the one they're going to install it into or not. The one real sticking +# point is that before pip 1.4, pip just used the default urllib2 UA, so +# right now we're counting pip 1.4 in here... but pip 1.4 usage is probably +# low enough not to worry about that any more.
+@_parser.register +@regex_ua_parser(r"^Python-urllib/(?P<python>\d\.\d)$") +def URLLib2UserAgent(*, python): + return {"python": python} - _pep381client_re = re.compile(r"^pep381client(?:-proxy)?/(?P<version>\S+)$") - @classmethod - def pep381client_format(cls, user_agent): - m = cls._pep381client_re.search(user_agent) - if m is None: - return +# TODO: We should probably consider not parsing this specially, and moving it to +# just the same as we treat browsers, since we don't really know anything +# about it and the version of requests isn't very useful in general. +@_parser.register +@regex_ua_parser(r"^python-requests/(?P<version>\S+)(?: .+)?$") +def RequestsUserAgent(*, version): + return {"installer": {"name": "requests", "version": version}} - return {"installer": {"name": "pep381client", "version": m.group("version")}} - @staticmethod - def urllib2_format(user_agent): - # This isn't really a format exactly, prior to pip 1.4 pip used urllib2 - # and it didn't bother to change the default user agent. This means - # we'll miscount this version as higher than it actually is, however - # I'm not sure there is any better way around that. - if not user_agent.startswith("Python-urllib/"): - return - - # Some projects (like setuptools) add an additional item to the end of - # the urllib string. We want to make sure this is _only_ Python-urllib - if len(user_agent.split()) > 1: - return - - return {"python": user_agent.split("/", 1)[1]} - - _requests_re = re.compile(r"^python-requests/(?P<version>\S+)(?: .+)?$") - - @classmethod - def requests_format(cls, user_agent): - # Older versions of devpi used requests without modifying the user - # agent. However this could also just be someone using requests to - # download things from PyPI naturally. This means we can't count this - # as anything other than requests, but it might be something else.
- m = cls._requests_re.search(user_agent) - if m is None: - return - - return {"installer": {"name": "requests", "version": m.group("version")}} - - _os_re = re.compile( - r""" - (?: - ^fetch\ libfetch/\S+$ | - ^libfetch/\S+$ | - ^OpenBSD\ ftp$ | - ^Homebrew\ | - ^MacPorts/? | - ^NetBSD-ftp/ | - ^slapt-get | - ^pypi-install/ | - ^slackrepo$ | - ^PTXdist | - ^GARstow/ | - ^xbps/ - ) - """, - re.VERBOSE, +@_parser.register +@regex_ua_parser( + ( + r"^Homebrew/(?P<version>\S+) " + r"\(Macintosh; Intel (?:Mac OS X|macOS) (?P<osx_version>[^)]+)\)(?: .+)?$" ) - - @classmethod - def os_format(cls, user_agent): - m = cls._os_re.search(user_agent) - if m is None: - return - - return {"installer": {"name": "OS"}} - - _homebrew_re = re.compile( +) +def HomebrewUserAgent(*, version, osx_version): + return { + "installer": {"name": "Homebrew", "version": version}, + "distro": {"name": "OS X", "version": osx_version}, + } + + +# TODO: It would be nice to maybe break more of these apart to try and get more insight +# into the OSs that people are installing packages into (similar to Homebrew). +@_parser.register +@regex_ua_parser( + re.compile( r""" - ^ - Homebrew/(?P<version>\S+) - \s+ - \(Macintosh;\ Intel\ Mac\ OS\ X\ (?P<osx_version>[^)]+)\) - """, + (?: + ^fetch\ libfetch/\S+$ | + ^libfetch/\S+$ | + ^OpenBSD\ ftp$ | + ^MacPorts/?
| + ^NetBSD-ftp/ | + ^slapt-get | + ^pypi-install/ | + ^slackrepo$ | + ^PTXdist | + ^GARstow/ | + ^xbps/ + ) + """, re.VERBOSE, ) +) +def OSUserAgent(): + return {"installer": {"name": "OS"}} - @classmethod - def homebrew_format(cls, user_agent): - m = cls._homebrew_re.search(user_agent) - if m is None: - return - - return { - "installer": {"name": "Homebrew", "version": m.group("version")}, - "distro": {"name": "OS X", "version": m.group("osx_version")}, - } - _browser_re = re.compile( - r""" - ^ - (?: - Mozilla | - Safari | - wget | - curl | - Opera | - aria2 | - AndroidDownloadManager | - com\.apple\.WebKit\.Networking/ | - FDM\ \S+ | - URL/Emacs | - Firefox/ | - UCWEB | - Links | - ^okhttp | - ^Apache-HttpClient - ) - (?:/|$) - """, - re.IGNORECASE | re.VERBOSE, - ) - - @classmethod - def browser_format(cls, user_agent): - m = cls._browser_re.search(user_agent) - if m is None: - return - - return {"installer": {"name": "Browser"}} - - _ignore_re = re.compile( +@_parser.register +@regex_ua_parser( + re.compile( r""" + ^ (?: - ^Datadog\ Agent/ | - ^\(null\)$ | - ^WordPress/ | - ^Chef\ (?:Client|Knife)/ | - ^Ruby$ | - ^Slackbot-LinkExpanding | - ^TextualInlineMedia/ | - ^WeeChat/ | - ^Download\ Master$ | - ^Java/ | - ^Go\ \d\.\d\ package\ http$ | - ^Go-http-client/ | - ^GNU\ Guile$ | - ^github-olee$ | - ^YisouSpider$ | - ^Apache\ Ant/ | - ^Salt/ | - ^ansible-httpget$ | - ^ltx71\ -\ \(http://ltx71.com/\) | - ^Scrapy/ | - ^spectool/ | - Nutch | - ^AWSBrewLinkChecker/ | - ^Y!J-ASR/ | - ^NSIS_Inetc\ \(Mozilla\)$ | - ^Debian\ uscan | - ^Pingdom\.com_bot_version_\d+\.\d+_\(https?://www.pingdom.com/\)$ | - ^MauiBot\ \(crawler\.feedback\+dc@gmail\.com\)$ + Mozilla | + Safari | + wget | + curl | + Opera | + aria2 | + AndroidDownloadManager | + com\.apple\.WebKit\.Networking/ | + FDM\ \S+ | + URL/Emacs | + Firefox/ | + UCWEB | + Links | + ^okhttp | + ^Apache-HttpClient ) - """, - re.VERBOSE, + (?:/|$) + """, + re.IGNORECASE | re.VERBOSE, ) - - @classmethod - def 
ignored(cls, user_agent): - m = cls._ignore_re.search(user_agent) - return m is not None - - @classmethod - def parse(cls, user_agent): - formats = [ - cls.pip_6_format, - cls.pip_1_4_format, - cls.setuptools_format, - cls.distribute_format, - cls.pex_format, - cls.conda_format, - cls.bazel_format, - cls.bandersnatch_format, - cls.z3c_pypimirror_format, - cls.devpi_format, - cls.artifactory_format, - cls.nexus_format, - cls.pep381client_format, - cls.urllib2_format, - cls.requests_format, - cls.homebrew_format, - cls.os_format, - cls.browser_format, - ] - - for format in formats: - try: - data = format(user_agent) - except Exception as exc: - logger.warning( - "Error parsing %r as %s", user_agent, format.__name__, exc_info=True - ) - data = None - - if data is not None: - return cattr.structure(data, UserAgent) - - if cls.ignored(user_agent): - return - - raise UnknownUserAgentError(user_agent) - - -parse = Parser.parse +) +def BrowserUserAgent(): + return {"installer": {"name": "Browser"}} + + +# TODO: It would be kind of nice to implement this as just another parser, that returns +# None instead of a dictionary. However given that there is no inherent ordering +# in a ParserSet, and we want this to always go last (just in case an ignore +# pattern is overly broad) we can't do that. It would be nice to make it possible +# to register a parser with an explicit location in the parser set.
+_ignore_re = re.compile( + r""" + (?: + ^Datadog\ Agent/ | + ^\(null\)$ | + ^WordPress/ | + ^Chef\ (?:Client|Knife)/ | + ^Ruby$ | + ^Slackbot-LinkExpanding | + ^TextualInlineMedia/ | + ^WeeChat/ | + ^Download\ Master$ | + ^Java/ | + ^Go\ \d\.\d\ package\ http$ | + ^Go-http-client/ | + ^GNU\ Guile$ | + ^github-olee$ | + ^YisouSpider$ | + ^Apache\ Ant/ | + ^Salt/ | + ^ansible-httpget$ | + ^ltx71\ -\ \(http://ltx71.com/\) | + ^Scrapy/ | + ^spectool/ | + Nutch | + ^AWSBrewLinkChecker/ | + ^Y!J-ASR/ | + ^NSIS_Inetc\ \(Mozilla\)$ | + ^Debian\ uscan | + ^Pingdom\.com_bot_version_\d+\.\d+_\(https?://www.pingdom.com/\)$ | + ^MauiBot\ \(crawler\.feedback\+dc@gmail\.com\)$ + ) + """, + re.VERBOSE, +) + + +def parse(user_agent): + try: + return cattr.structure(_parser(user_agent), UserAgent) + except UnableToParse: + # If we were not able to parse the user agent, then we have two options, we can + # either raise an `UnknownUserAgentError` or we can return None to explicitly + # say that we opted not to parse this user agent. To determine which option we + # pick we'll match against a regex of UAs to ignore, if we match then we'll + # return a None to indicate to our caller that we couldn't parse this UA, but + # that it was an expected inability to parse. Otherwise we'll raise an + # `UnknownUserAgentError` to indicate that it was an unexpected inability to + # parse.
+ if _ignore_re.search(user_agent) is not None: + return None + + raise UnknownUserAgentError from None diff --git a/requirements/tests.in b/requirements/tests.in index 7ae6973..a0505ef 100644 --- a/requirements/tests.in +++ b/requirements/tests.in @@ -1,3 +1,4 @@ hypothesis pytest pytest-cov +pyyaml diff --git a/requirements/tests.txt b/requirements/tests.txt index cf4de6d..78977b9 100644 --- a/requirements/tests.txt +++ b/requirements/tests.txt @@ -7,4 +7,5 @@ pluggy==0.6.0 # via pytest py==1.5.4 # via pytest pytest-cov==2.5.1 pytest==3.6.3 +pyyaml==3.13 six==1.11.0 # via more-itertools, pytest diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..438cad5 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,29 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os.path + +import pytest + + +def pytest_collection_modifyitems(items): + for item in items: + if not hasattr(item, "module"): # e.g.: DoctestTextfile + continue + + module_path = os.path.relpath( + item.module.__file__, os.path.commonprefix([__file__, item.module.__file__]) + ) + + module_root_dir = module_path.split(os.pathsep)[0] + if module_root_dir.startswith("unit"): + item.add_marker(pytest.mark.unit) diff --git a/tests/strategies.py b/tests/strategies.py index 2984800..abe6813 100644 --- a/tests/strategies.py +++ b/tests/strategies.py @@ -13,6 +13,9 @@ from hypothesis import strategies as st +INF = float("inf") + + @st.composite def line_delimited_data(draw, max_line_size, min_lines=1): n = draw(max_line_size) @@ -36,3 +39,151 @@ def chunked(draw, source): chunk_sizes += [len(data)] return [data[u:v] for u, v in zip(chunk_sizes, chunk_sizes[1:])] + + +def _none_for_inf(v): + if v is INF: + return None + return v + + +@st.composite +def version(draw, min_digits=1, max_digits=None, min_version=None, max_version=None): + min_version_digits = None if min_version is None else len(min_version.split(".")) + max_version_digits = None if max_version is None else len(max_version.split(".")) + + if min_digits < 1: + raise ValueError("Minimum digits must be >= 1") + if max_digits is None: + # To determine our maximum number of digits, we're going to take the larger of + # our default of 10 greater than the minimum, or the number of digits in the min + # and max versions. + max_digits = max( + filter(None, [min_digits + 10, min_version_digits, max_version_digits]) + ) + if min_digits > max_digits: + raise ValueError("Maximum digits must be greater than the minimum digits.") + if min_version_digits is not None and min_version_digits > max_digits: + raise ValueError( + "Cannot have a minimum version with more digits than the maximum number " + "of digits." 
+ ) + if max_version_digits is not None and max_version_digits > max_digits: + raise ValueError( + "Cannot have a maximum version with more digits than the maximum number " + "of digits." + ) + + num_digits = draw(st.integers(min_value=min_digits, max_value=max_digits)) + + if min_version is not None: + min_version = [int(i) for i in min_version.split(".")] + else: + min_version = [0] + + # We need to pad out the minimum version so that it matches our number of digits. + min_version += [0 for _ in range(num_digits - len(min_version))] + + if max_version is not None: + # If we were given a max range, then we want to pad it out to zeros to match + # the number of digits we're trying to generate. + max_version = [int(i) for i in max_version.split(".")] + max_version += [0 for _ in range(num_digits - len(max_version))] + else: + # If we were not given a max range, we want to have an infinite top end. + max_version = [INF] * num_digits + + if min_version > max_version: + raise ValueError("The mininum version *MUST* be less than the maximum version.") + + # The very first version strategy we can have, is simply matching whatever the + # minimum version is. + version_strategies = [st.tuples(*[st.just(i) for i in min_version])] + + # Now we have to build up a list of possible versions besides our basic one. + while min_version: + # We're going to start with incrementing the rightmost digit in our version. + incrementing_part = min_version.pop() + + # If the number of digits we would require to handle a version that is + # larger than this minimum version is greater than the number of digits + # we're trying to generate in a version, then we'll skip it and move onto + # the next one. + # Note: We add one to this to account for the incrementing_part that we removed + # from this list earlier. + if len(min_version) + 1 > num_digits: + continue + + # We're going to be making a version that has the same prefix as min_version, + # but the incrementing part is one higher.
If doing that would make the version + # number we're just about to generate greater than our maximum version, then + # we'll break out of the loop. Any further incrementing will continue to be + # too large of a version number. + if min_version + [incrementing_part + 1] > max_version[: len(min_version) + 1]: + break + + # We're going to limit our generated version by the right most digit in our + # maximum version. + max_incrementing_part = max_version[len(min_version)] + + # Build up a parts that is all of the preceding digits, sans the final + # digit, e.g. if our minimum version is 1.5.6.0, then we want 1, 5, 6. + # We know this is safe with the maximum version, because if it wasn't, then + # we would have bailed out earlier. + parts = [st.just(i) for i in min_version] + + # If there are any values where the incrementing part will *always* mean that + # any version number we generate, no matter what gets generated for the padded + # versions, then we'll create strategies to deal with those first. + if min_version + [incrementing_part + 1] < max_version[: len(min_version) + 1]: + # if incrementing_part + 1 < max_incrementing_part: + if ( + max_incrementing_part is INF + or min_version != max_version[: len(min_version)] + ): + max_incr_value = None + else: + max_incr_value = max_incrementing_part - 1 + subparts = [ + st.integers(min_value=incrementing_part + 1, max_value=max_incr_value) + ] + + # At this point, we know we can just blindly generate any padding we want, + # because our leading digits will ensure that we are *always* less than + # our maximum version. + # Note: We have to subtract an extra 1 from our number of needed parts to + # complete our padding, because of the one we generated above. + subparts += [ + st.integers(min_value=0) for _ in range(num_digits - len(parts) - 1) + ] + + # Now we're going to create a hypothesis tuple from our prefix parts, and + # our subparts, and add it to our list of strategies to try.
+ version_strategies.append(st.tuples(*parts + subparts)) + + # Finally, we will generate a strategy that sets the incrementing part and all + # padded parts maximum value to be equal to the maximum value for that part in + # our maximum value. The only special case here is that Infinity values in our + # maximum values need to be translated to None for hypothesis. We need one + # special case here, if our max_incrementing_part is inf, then this case should + # already have been handled up above. + if ( + max_incrementing_part is not INF + and min_version == max_version[: len(min_version)] + ): + parts += [st.just(max_incrementing_part)] + + parts += [ + st.integers(min_value=0, max_value=_none_for_inf(max_version[i])) + for i in range(len(parts), num_digits) + ] + + # Create a hypothesis tuple from our parts, and add it to our list of + # strategies to try. + version_strategies.append(st.tuples(*parts)) + + version = draw(st.one_of(version_strategies)) + + # Now that we have a list of version strategies, we'll draw from one of those + # possible strategies, and join the parts together to create a verison number. + return ".".join(str(i) for i in version) diff --git a/tests/test_strategies.py b/tests/test_strategies.py new file mode 100644 index 0000000..622249a --- /dev/null +++ b/tests/test_strategies.py @@ -0,0 +1,140 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class TestVersionStrategy:
    """Property tests for the ``st_version`` hypothesis strategy.

    Each test draws a version string from ``st_version`` under some
    combination of ``min_version`` / ``max_version`` / ``min_digits`` /
    ``max_digits`` and checks the drawn value honours those bounds.
    """

    # Building blocks shared by the ``@given`` decorators below. They are
    # evaluated once, in class scope, and only consumed by the decorators.
    _positive = st.integers(min_value=1)
    _triple = st.tuples(_positive, _positive, _positive)

    # Two three-part versions, sorted so the smaller one comes first, each
    # rendered as a dotted string: ``[min_version, max_version]``.
    _ordered_version_pair = st.tuples(_triple, _triple).map(
        lambda pair: [".".join(map(str, parts)) for parts in sorted(pair)]
    )

    # Two digit counts in ascending order: ``[min_digits, max_digits]``.
    _ordered_digit_pair = st.tuples(
        st.integers(min_value=1, max_value=100),
        st.integers(min_value=1, max_value=100),
    ).map(sorted)

    @staticmethod
    def _ver_2_list(version):
        """Turn ``"1.2.0.0"`` into ``[1, 2]``.

        Trailing zero components are dropped so that versions which differ
        only in zero padding compare equal as lists.
        """
        components = [int(piece) for piece in version.split(".")]
        while components and components[-1] == 0:
            components.pop()
        return components

    @given(st.data(), _triple.map(lambda parts: ".".join(map(str, parts))))
    def test_greater_than_minimum(self, data, min_version):
        drawn = data.draw(st_version(min_version=min_version))
        assert self._ver_2_list(drawn) >= self._ver_2_list(min_version)

    @given(
        st.data(),
        st.tuples(st.integers(min_value=10), _positive, _positive).map(
            lambda parts: ".".join(map(str, parts))
        ),
    )
    def test_less_than_maximum(self, data, max_version):
        drawn = data.draw(st_version(max_version=max_version))
        assert self._ver_2_list(drawn) <= self._ver_2_list(max_version)

    @given(st.data(), _ordered_version_pair)
    def test_inbetween_min_and_max(self, data, versions):
        lower, upper = versions
        drawn = data.draw(st_version(min_version=lower, max_version=upper))

        lower_parts = self._ver_2_list(lower)
        drawn_parts = self._ver_2_list(drawn)
        upper_parts = self._ver_2_list(upper)
        assert lower_parts <= drawn_parts <= upper_parts

    @given(st.data(), st.integers(min_value=1, max_value=100))
    def test_produces_with_more_digits_than_min(self, data, min_digits):
        drawn = data.draw(st_version(min_digits=min_digits))
        assert len(drawn.split(".")) >= min_digits

    @given(st.data(), st.integers(min_value=2, max_value=100))
    def test_produces_with_less_digits_than_max(self, data, max_digits):
        drawn = data.draw(st_version(max_digits=max_digits))
        assert len(drawn.split(".")) <= max_digits

    @given(st.data(), _ordered_digit_pair)
    def test_produces_inbetween_min_and_max_digits(self, data, digits):
        fewest, most = digits
        drawn = data.draw(st_version(min_digits=fewest, max_digits=most))
        assert fewest <= len(drawn.split(".")) <= most

    @given(st.data(), _ordered_version_pair, _ordered_digit_pair)
    def test_mixture(self, data, versions, digits):
        lower, upper = versions
        fewest, most = digits

        # Discard examples whose version bounds already have more components
        # than the maximum number of digits we are asking for.
        # TODO: Can we remove these assumptions?
        assume(len(lower.split(".")) <= most)
        assume(len(upper.split(".")) <= most)

        drawn = data.draw(
            st_version(
                min_digits=fewest,
                max_digits=most,
                min_version=lower,
                max_version=upper,
            )
        )

        lower_parts = self._ver_2_list(lower)
        drawn_parts = self._ver_2_list(drawn)
        upper_parts = self._ver_2_list(upper)
        assert lower_parts <= drawn_parts <= upper_parts
        assert fewest <= len(drawn.split(".")) <= most
/dev/null +++ b/tests/unit/ua/fixtures/conda.yml @@ -0,0 +1,5 @@ +- ua: conda/4.3.16 + result: + installer: + name: conda + version: 4.3.16 diff --git a/tests/unit/ua/fixtures/devpi.yml b/tests/unit/ua/fixtures/devpi.yml new file mode 100644 index 0000000..cf1905f --- /dev/null +++ b/tests/unit/ua/fixtures/devpi.yml @@ -0,0 +1,5 @@ +- ua: 'devpi-server/4.6.0 (py3.7.0; darwin)' + result: + installer: + name: devpi + version: 4.6.0 diff --git a/tests/unit/ua/fixtures/distribute.yml b/tests/unit/ua/fixtures/distribute.yml new file mode 100644 index 0000000..5071c55 --- /dev/null +++ b/tests/unit/ua/fixtures/distribute.yml @@ -0,0 +1,6 @@ +- ua: Python-urllib/3.6 distribute/0.6.21 + result: + installer: + name: distribute + version: 0.6.21 + python: '3.6' diff --git a/tests/unit/ua/fixtures/homebrew.yml b/tests/unit/ua/fixtures/homebrew.yml new file mode 100644 index 0000000..0810ff1 --- /dev/null +++ b/tests/unit/ua/fixtures/homebrew.yml @@ -0,0 +1,8 @@ +- ua: Homebrew/0.9.9 (Macintosh; Intel macOS 10.12.0) curl/7.43.0 + result: + installer: + name: Homebrew + version: 0.9.9 + distro: + name: OS X + version: 10.12.0 diff --git a/tests/unit/ua/fixtures/ignored.yml b/tests/unit/ua/fixtures/ignored.yml new file mode 100644 index 0000000..dafb9f6 --- /dev/null +++ b/tests/unit/ua/fixtures/ignored.yml @@ -0,0 +1,2 @@ +- ua: (null) + result: null diff --git a/tests/unit/ua/fixtures/nexus.yml b/tests/unit/ua/fixtures/nexus.yml new file mode 100644 index 0000000..694d23b --- /dev/null +++ b/tests/unit/ua/fixtures/nexus.yml @@ -0,0 +1,5 @@ +- ua: Nexus/3.6.0-02 (OSS; Linux; 4.4.0-62-generic; amd64; 1.8.0_144) + result: + installer: + name: Nexus + version: 3.6.0-02 diff --git a/tests/unit/ua/fixtures/os.yml b/tests/unit/ua/fixtures/os.yml new file mode 100644 index 0000000..71da2b1 --- /dev/null +++ b/tests/unit/ua/fixtures/os.yml @@ -0,0 +1,4 @@ +- ua: fetch libfetch/2.0 + result: + installer: + name: OS diff --git a/tests/unit/ua/fixtures/pep381client.yml 
b/tests/unit/ua/fixtures/pep381client.yml new file mode 100644 index 0000000..8437c98 --- /dev/null +++ b/tests/unit/ua/fixtures/pep381client.yml @@ -0,0 +1,11 @@ +- ua: pep381client/1.0.1 + result: + installer: + name: pep381client + version: 1.0.1 + +- ua: pep381client-proxy/0.5.6 + result: + installer: + name: pep381client + version: 0.5.6 diff --git a/tests/unit/ua/fixtures/pex.yml b/tests/unit/ua/fixtures/pex.yml new file mode 100644 index 0000000..6d2c401 --- /dev/null +++ b/tests/unit/ua/fixtures/pex.yml @@ -0,0 +1,5 @@ +- ua: pex/1.4.5 + result: + installer: + name: pex + version: 1.4.5 diff --git a/tests/unit/ua/fixtures/pip.yml b/tests/unit/ua/fixtures/pip.yml new file mode 100644 index 0000000..fc1a956 --- /dev/null +++ b/tests/unit/ua/fixtures/pip.yml @@ -0,0 +1,33 @@ +# Pip 6 Format +- ua: 'pip/18.0 {"cpu":"x86_64","distro":{"name":"macOS","version":"10.13.6"},"implementation":{"name":"PyPy","version":"6.0.0"},"installer":{"name":"pip","version":"18.0"},"openssl_version":"LibreSSL 2.6.2","python":"3.5.3","setuptools_version":"40.0.0","system":{"name":"Darwin","release":"17.7.0"}}' + result: + installer: + name: pip + version: '18.0' + python: 3.5.3 + implementation: + name: PyPy + version: 6.0.0 + distro: + name: macOS + version: 10.13.6 + system: + name: Darwin + release: 17.7.0 + cpu: x86_64 + openssl_version: LibreSSL 2.6.2 + setuptools_version: 40.0.0 + +# Pip 1.4 Format +- ua: 'pip/1.4.1 CPython/2.7.14 Darwin/17.7.0' + result: + installer: + name: pip + version: 1.4.1 + python: 2.7.14 + implementation: + name: CPython + version: 2.7.14 + system: + name: Darwin + release: 17.7.0 diff --git a/tests/unit/ua/fixtures/requests.yml b/tests/unit/ua/fixtures/requests.yml new file mode 100644 index 0000000..9ec586f --- /dev/null +++ b/tests/unit/ua/fixtures/requests.yml @@ -0,0 +1,11 @@ +- ua: python-requests/2.19.1 + result: + installer: + name: requests + version: 2.19.1 + +- ua: 'python-requests/2.7.0 CPython/3.7.0 Darwin/17.7.0' + result: + installer: 
+ name: requests + version: 2.7.0 diff --git a/tests/unit/ua/fixtures/setuptools.yml b/tests/unit/ua/fixtures/setuptools.yml new file mode 100644 index 0000000..80f54d3 --- /dev/null +++ b/tests/unit/ua/fixtures/setuptools.yml @@ -0,0 +1,13 @@ +- ua: setuptools/1.0 Python-urllib/3.6 + result: + installer: + name: setuptools + version: '1.0' + python: '3.6' + +- ua: Python-urllib/3.6 setuptools/1.0 + result: + installer: + name: setuptools + version: '1.0' + python: '3.6' diff --git a/tests/unit/ua/fixtures/urllib2.yml b/tests/unit/ua/fixtures/urllib2.yml new file mode 100644 index 0000000..e256f9e --- /dev/null +++ b/tests/unit/ua/fixtures/urllib2.yml @@ -0,0 +1,3 @@ +- ua: Python-urllib/2.7 + result: + python: '2.7' diff --git a/tests/unit/ua/fixtures/z3c-pypimirror.yml b/tests/unit/ua/fixtures/z3c-pypimirror.yml new file mode 100644 index 0000000..0494166 --- /dev/null +++ b/tests/unit/ua/fixtures/z3c-pypimirror.yml @@ -0,0 +1,5 @@ +- ua: z3c.pypimirror/1.5.6 + result: + installer: + name: z3c.pypimirror + version: 1.5.6 diff --git a/tests/unit/ua/test_impl.py b/tests/unit/ua/test_impl.py new file mode 100644 index 0000000..7c2b5ae --- /dev/null +++ b/tests/unit/ua/test_impl.py @@ -0,0 +1,187 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class TestRegexUserAgentParser:
    """Tests for ``impl.RegexUserAgentParser``.

    Covers how the parser gets its ``name``, which inputs it accepts or
    rejects for a given list of regexes, and how regex captures are handed to
    the handler (positional groups as ``*args``, named groups as
    ``**kwargs``).
    """

    def test_undefined_name(self):
        # With no explicit ``name=``, the handler function's name is expected.
        def FakeUserAgent():
            pass

        parser = impl.RegexUserAgentParser([], FakeUserAgent)
        assert parser.name == "FakeUserAgent"

    def test_explicit_name(self):
        parser = impl.RegexUserAgentParser([], lambda i: i, name="MyName")
        assert parser.name == "MyName"

    @pytest.mark.parametrize(
        ("regexes", "user_agent"),
        [
            ([r"^Foo Bar$"], "Foo Bar"),
            ([re.compile(r"^Foo Bar$")], "Foo Bar"),
            ([r"^Bar Foo$", re.compile(r"^Foo Bar$")], "Foo Bar"),
            ([r"^Bar Foo$", re.compile(r"^Foo Bar$")], "Bar Foo"),
        ],
    )
    def test_valid(self, regexes, user_agent):
        # Both plain pattern strings and pre-compiled patterns are accepted,
        # and a match on any one of the regexes is enough.
        result = object()
        parser = impl.RegexUserAgentParser(regexes, lambda: result)
        assert parser(user_agent) is result

    @pytest.mark.parametrize(
        ("regexes", "user_agent"),
        [
            ([], "literally anything"),
            ([r"^A Test String$"], "totally not a test string"),
            ([r"^One$", re.compile(r"^Two$")], "Three"),
        ],
    )
    def test_invalid(self, regexes, user_agent):
        # The handler is ``None`` on purpose: it must never be reached when
        # none of the regexes match.
        parser = impl.RegexUserAgentParser(regexes, None, name="AName")
        with pytest.raises(impl.UnableToParse):
            parser(user_agent)

    def test_positional_captures(self):
        def handler(*args):
            return list(args)

        parser = impl.RegexUserAgentParser([r"^Foo (.+)$"], handler)
        assert parser("Foo Bar") == ["Bar"]

    def test_named_captures(self):
        def handler(**kwargs):
            return kwargs

        # The group name must be spelled out as ``(?P<thing>...)``; a bare
        # ``(?P...)`` is not valid regex syntax and fails to compile.
        parser = impl.RegexUserAgentParser([r"^Foo (?P<thing>.+)$"], handler)
        assert parser("Foo Bar") == {"thing": "Bar"}

    def test_mixed_captures(self):
        def handler(*args, **kwargs):
            return list(args), kwargs

        # Unnamed groups are expected positionally, named groups by keyword;
        # the names here have to match the keys asserted below.
        parser = impl.RegexUserAgentParser(
            [r"^(\S+) (?P<thing>\S+) (\S+) (?P<another>\S+)$"], handler
        )
        assert parser("Foo Bar Widget Frob") == (
            ["Foo", "Widget"],
            {"thing": "Bar", "another": "Frob"},
        )
test_error_while_parsing(self, caplog): + def raiser(inp): + raise ValueError("Oh No") + + raiser.name = "OhNoName" + + parser = impl.ParserSet() + parser.register(raiser) + + with pytest.raises(impl.UnableToParse): + parser("anything") + + assert caplog.record_tuples == [ + ( + "linehaul.ua.impl", + logging.ERROR, + "Error parsing 'anything' as a OhNoName.", + ) + ] diff --git a/tests/unit/ua/test_parser.py b/tests/unit/ua/test_parser.py index 579b73d..4e5f295 100644 --- a/tests/unit/ua/test_parser.py +++ b/tests/unit/ua/test_parser.py @@ -10,4 +10,158 @@ # See the License for the specific language governing permissions and # limitations under the License. -from linehaul.ua.parser import parse +import json +import os.path + +import cattr +import pytest +import yaml + +from hypothesis import given, strategies as st + +from linehaul.ua import parser +from linehaul.ua.datastructures import UserAgent + +from ...strategies import version as st_version + + +FIXTURE_DIR = os.path.join(os.path.dirname(__file__), "fixtures") + + +def _load_ua_fixtures(fixture_dir): + fixtures = os.listdir(fixture_dir) + for filename in fixtures: + with open(os.path.join(fixture_dir, filename), "r") as fp: + fixtures = yaml.safe_load(fp.read()) + for fixture in fixtures: + ua = fixture.pop("ua") + result = fixture.pop("result") + expected = ( + cattr.structure(result, UserAgent) + if isinstance(result, dict) + else result + ) + assert fixture == {} + yield ua, expected + + +@pytest.mark.parametrize(("ua", "expected"), _load_ua_fixtures(FIXTURE_DIR)) +def test_user_agent_parsing(ua, expected): + assert parser.parse(ua) == expected + + +def _is_valid_json(item): + try: + json.loads(item) + except Exception: + return False + + return True + + +class TestPip6UserAgent: + @given(st.text().filter(lambda i: not i.startswith("pip/"))) + def test_not_pip(self, ua): + with pytest.raises(parser.UnableToParse): + parser.Pip6UserAgent(ua) + + @given(st_version(max_version="6")) + def 
class TestPip1_4UserAgent:
    """Property tests for ``parser.Pip1_4UserAgent``.

    The user agents built here have the shape
    ``pip/<version> <impl_name>/<impl_version> <system_name>/<system_release>``.
    """

    # Arbitrary non-empty token with no control characters, no separators
    # (whitespace) and no ``/``, so it cannot break the ``name/version``
    # structure of the user agent string.
    _token = st.text(
        min_size=1,
        alphabet=st.characters(
            blacklist_categories=["Cc", "Z"], blacklist_characters="/"
        ),
    )

    @given(st.text().filter(lambda text: not text.startswith("pip/")))
    def test_not_pip(self, ua):
        with pytest.raises(parser.UnableToParse):
            parser.Pip1_4UserAgent(ua)

    @given(st.one_of(st_version(max_version="1.3"), st_version(min_version="6")))
    def test_invalid_version(self, version):
        # Versions on either side of the 1.4 - 5 range used by the tests
        # below are expected to be rejected.
        user_agent = f"pip/{version} Unknown/Unknown Unknown/Unknown"
        with pytest.raises(parser.UnableToParse):
            parser.Pip1_4UserAgent(user_agent)

    @given(st_version(min_version="1.4", max_version="5"))
    def test_no_other_data(self, version):
        # A bare ``pip/<version>`` without the trailing pairs must not parse.
        with pytest.raises(parser.UnableToParse):
            parser.Pip1_4UserAgent(f"pip/{version}")

    @given(
        st_version(min_version="1.4", max_version="5"),
        st.one_of(st.just("Unknown"), st.just("Cpython"), st.just("PyPy"), _token),
        st.one_of(st.just("Unknown"), st_version(), _token),
        st.one_of(
            st.just("Unknown"),
            st.just("Darwin"),
            st.just("Linux"),
            st.just("Windows"),
            _token,
        ),
        st.one_of(
            st.just("Unknown"),
            st.just("17.7.0"),
            st.just("NT"),
            st_version(),
            _token,
        ),
    )
    def test_valid_data(
        self, version, impl_name, impl_version, system_name, system_release
    ):
        ua = f"pip/{version} {impl_name}/{impl_version} {system_name}/{system_release}"

        expected = {"installer": {"name": "pip", "version": version}}

        # A value of "unknown" (in any letter case) is expected to be left
        # out of the parsed result altogether.
        optional_fields = (
            ("implementation", "name", impl_name),
            ("implementation", "version", impl_version),
            ("system", "name", system_name),
            ("system", "release", system_release),
        )
        for section, field, value in optional_fields:
            if value.lower() != "unknown":
                expected.setdefault(section, {})[field] = value

        # For CPython the implementation version is also expected to be
        # reported as the Python version.
        if impl_name.lower() == "cpython":
            expected["python"] = impl_version

        assert parser.Pip1_4UserAgent(ua) == expected