Skip to content

Commit

Permalink
Merge pull request #29 from pogzyb/fix-get-domain-and-tld
Browse files Browse the repository at this point in the history
v0.4.0 fixes and improvements
  • Loading branch information
pogzyb committed Oct 8, 2021
2 parents 6577e91 + d9fbecd commit 44c7418
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 147 deletions.
2 changes: 1 addition & 1 deletion asyncwhois/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
'rdap_domain_lookup',
'aio_rdap_domain_lookup'
]
__version__ = '0.4.0'
__version__ = '0.4.1'


def lookup(url: str, timeout: int = 10) -> PyWhoIs:
Expand Down
267 changes: 133 additions & 134 deletions asyncwhois/parser.py
Original file line number Diff line number Diff line change
def _init_parser(tld: str) -> BaseParser:
    """
    Retrieves the parser instance which can most accurately extract
    key/value pairs from the whois server output for the given `tld`.

    :param tld: the top level domain
    :return: instance of BaseParser or a BaseParser sub-class
    """
    # The cases specified below do not follow a common format, and so must be
    # parsed with custom defined regex sub classes. The mapping stores the
    # parser *classes* (not instances) so that only the single parser that is
    # actually needed gets constructed, instead of eagerly instantiating all
    # ~65 parsers on every call.
    tld_parsers = {
        'ae': RegexAE,
        'ar': RegexAR,
        'at': RegexAT,
        'au': RegexAU,
        'aw': RegexAW,
        'ax': RegexAX,
        'be': RegexBE,
        'br': RegexBR,
        'by': RegexBY,
        'cc': RegexCC,
        'ch': RegexCH,
        'cl': RegexCL,
        'cn': RegexCN,
        'cr': RegexCR,
        'cz': RegexCZ,
        'de': RegexDE,
        'dk': RegexDK,
        'edu': RegexEDU,
        'ee': RegexEE,
        'eu': RegexEU,
        'fi': RegexFI,
        'fr': RegexFR,
        'ge': RegexGE,
        'gg': RegexGG,
        'gq': RegexGQ,
        'hk': RegexHK,
        'hr': RegexHR,
        'id': RegexID,
        'ie': RegexIE,
        'il': RegexIL,
        'ir': RegexIR,
        'is': RegexIS,
        'it': RegexIT,
        'jp': RegexJP,
        'kg': RegexKG,
        'kr': RegexKR,
        'kz': RegexKZ,
        'li': RegexLI,
        'lu': RegexLU,
        'lv': RegexLV,
        'ma': RegexMA,
        'ml': RegexML,
        'mx': RegexMX,
        'nl': RegexNL,
        'no': RegexNO,
        'nu': RegexNU,
        'nz': RegexNZ,
        'om': RegexOM,
        'pe': RegexPE,
        'pl': RegexPL,
        'pt': RegexPT,
        'rf': RegexRF,
        'ro': RegexRO,
        'ru': RegexRU,
        'sa': RegexSA,
        'se': RegexSE,
        'si': RegexSI,
        'sk': RegexSK,
        'su': RegexSU,
        'tk': RegexTK,
        'tr': RegexTR,
        'tw': RegexTW,
        'ua': RegexUA,
        'uk': RegexUK,
        've': RegexVE
    }
    # The BaseParser can handle all "Generic" and some "Country-Code" TLDs.
    # If the parsed output of lookup is not what you expect or even incorrect,
    # check for and then modify the existing Regex subclass or create a new one.
    return tld_parsers.get(tld, BaseParser)()


# ==============================
Expand Down Expand Up @@ -1586,3 +1524,64 @@ def parse(self, blob: str) -> Dict[str, Any]:
if addresses:
parsed_output[BaseKeys.REGISTRANT_ADDRESS] = ', '.join(addresses)
return parsed_output


class RegexML(BaseParser):
    """Parser for `.ml` (Point ML) whois server output.

    The `.ml` registry does not use the common ``Key: value`` layout for the
    domain name, status, name servers, or contact blocks, so those fields
    need custom handling on top of the base regex table.
    """

    _ml_expressions = {
        BaseKeys.EXPIRES: r'Record will expire on: *(.+)',
        BaseKeys.CREATED: r'Domain registered: *(.+)',
        BaseKeys.DOMAIN_NAME: r'Domain name:\n*(.+)\sis\s',
        BaseKeys.STATUS: r'Domain name:\n.+\sis\s*(.+)'
    }

    def __init__(self):
        super().__init__()
        self.update_reg_expressions(self._ml_expressions)

    def parse(self, blob: str) -> Dict[str, Any]:
        """Extract whois key/value pairs from a raw `.ml` server response.

        :param blob: the raw whois query output
        :return: dict mapping BaseKeys to the parsed values
        """
        parsed_output = super().parse(blob)
        parsed_output[BaseKeys.NAME_SERVERS] = self.find_multiline_match('Domain nameservers:', blob)
        for contact in ('Admin', 'Billing', 'Owner', 'Tech'):
            # isolate the appropriate contact block
            contact_blob = re.search(f'{contact} contact:\n(.+)\n\n', blob, re.DOTALL)
            if contact_blob:
                if contact == 'Owner':
                    # map "owner" to registrant
                    contact = 'Registrant'
                for key in ('Organization', 'Name', 'Address', 'Zipcode', 'City',
                            'State', 'Country', 'Phone', 'Fax', 'E-mail'):
                    # special case: Email -> E-mail
                    if key == 'E-mail':
                        attr_name = f'{contact}_Email'.upper()
                    else:
                        attr_name = f'{contact}_{key}'.upper()
                    # BUGFIX: getattr without a default raises AttributeError for
                    # contact/field combinations that BaseKeys does not define;
                    # default to None so the guard below actually skips them.
                    base_key = getattr(BaseKeys, attr_name, None)
                    if not base_key:
                        continue
                    # update parser dict
                    parsed_output[base_key] = self.find_match(f'{key}: *(.+)', contact_blob.group(0))
        date_format = '%m/%d/%Y'  # example: 05/28/2013
        # BUGFIX: only convert dates that were actually found; calling
        # strptime(None, ...) would raise TypeError when a field is missing.
        for date_key in (BaseKeys.EXPIRES, BaseKeys.CREATED):
            if parsed_output.get(date_key):
                parsed_output[date_key] = datetime.datetime.strptime(parsed_output[date_key], date_format)
        return parsed_output


class RegexOM(BaseParser):
    """Parser for `.om` (Oman) whois server output."""

    _om_expressions = {
        BaseKeys.REGISTRAR: r'Registrar Name: *(.+)',
        BaseKeys.UPDATED: r'Last Modified: *(.+)',
    }
    # The Registrant and Tech contact sections share an identical field
    # layout, so their patterns are generated instead of written out twice.
    # Note the attribute spelling ("Organization") differs from the label
    # used in the whois output ("Organisation").
    for _role in ('Registrant', 'Tech'):
        for _attr_field, _label in (('CITY', 'City'),
                                    ('COUNTRY', 'Country'),
                                    ('ORGANIZATION', 'Organisation'),
                                    ('NAME', 'Name'),
                                    ('EMAIL', 'Email')):
            _base_key = getattr(BaseKeys, f'{_role.upper()}_{_attr_field}')
            _om_expressions[_base_key] = rf'{_role} Contact {_label}: *(.+)'
    _om_expressions[BaseKeys.NAME_SERVERS] = r'Name Server: *(.+)'
    # drop the loop temporaries from the class namespace
    del _role, _attr_field, _label, _base_key

    def __init__(self):
        super().__init__()
        self.update_reg_expressions(self._om_expressions)
13 changes: 9 additions & 4 deletions asyncwhois/pywhois.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ async def _aio_get_domain_and_tld(self, url: str):
if len(tld.split('.')) > 1:
tld = tld.split('.')[-1]

self.subdomain = extract_result.subdomain
self.domain = extract_result.domain
self.suffix = extract_result.suffix
self.tld = tld

return extract_result.domain, tld

def _get_domain_and_tld(self, url: str):
Expand Down Expand Up @@ -200,12 +205,12 @@ async def _aio_from_url(cls, url: str, timeout: int):
@classmethod
async def _aio_rdap_domain_from_url(cls, url: str, http_client: Any = None):
"""
Performs an RDAP query by leveraging whodap.aio_lookup_domain;
Performs an RDAP query by leveraging `whodap.aio_lookup_domain`;
stores the resulting RDAP output into "query_output" and a WHOIS friendly
key/value pair dictionary into "parser_output".
:param url: the given url to search
:param http_client_kws: keyword arguments passed directly to the underlying httpx client
:param http_client: the underlying httpx client to pass to `whodap.aio_lookup_domain`
:return: initialized instance of PyWhoIs
"""
pywhois = cls()
Expand All @@ -224,12 +229,12 @@ async def _aio_rdap_domain_from_url(cls, url: str, http_client: Any = None):
@classmethod
def _rdap_domain_from_url(cls, url: str, http_client: Any = None):
"""
Performs an RDAP query by leveraging whodap.lookup_domain;
Performs an RDAP query by leveraging `whodap.lookup_domain`;
stores the resulting RDAP output into "query_output" and a WHOIS friendly
key/value pair dictionary into "parser_output".
:param url: the given url to search
:param http_client_kws: keyword arguments passed directly to the underlying httpx client
:param http_client: the underlying httpx client to pass to `whodap.lookup_domain`
:return: initialized instance of PyWhoIs
"""
pywhois = cls()
Expand Down
20 changes: 12 additions & 8 deletions asyncwhois/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,25 +46,26 @@ def _run(self):
self.server = self._find_match(regex=r"refer: *(.+)", blob=iana_result_blob)
if not self.server:
raise QueryError(f"Could not find a whois server for {self.domain}")

# connect to <server>:43
with self._create_connection((self.server, self._whois_port), self.timeout) as conn:
# save output into "query_output"
self.query_output = self._send_and_recv(conn, data)
# check for "authoritative" whois server via regex
whois_server = self._find_match(regex=r"WHOIS server: *(.+)", blob=self.query_output)
if whois_server:
whois_server = self._find_match(regex=r"whois server: *(.+)", blob=self.query_output)
whois_server = whois_server.replace(' ', '').rstrip(':')
if whois_server and self.server != whois_server:
# if there is a more authoritative source; connect and re-query
self.server = whois_server
with self._create_connection((self.server, self._whois_port), self.timeout) as conn:
# save output into "query_output"
self.query_output = self._send_and_recv(conn, data)

except ConnectionRefusedError:
raise QueryError(f'')

except ConnectionResetError:
server = self.server or self._iana_server
raise QueryError(f'"Connection reset by peer" when communicating with {server}:43')
except socket.timeout:
server = self.server or self._iana_server
raise QueryError(f'Socket timed out when attempting to reach {server}:43')

@staticmethod
def _send_and_recv(conn: socket.socket, data: str) -> str:
Expand All @@ -83,7 +84,9 @@ def _create_connection(address: Tuple[str, int], timeout: int) -> socket.socket:
try:
return socket.create_connection(address=address, timeout=timeout)
except socket.timeout:
raise QueryError(f'Could not reach WHOIS server at {address[0]}:{address[1]}')
raise QueryError(f'Connection timeout for WHOIS server at {address[0]}:{address[1]}')
except socket.gaierror:
raise QueryError(f'Could not get address information for {address[0]}:{address[1]}')


class AsyncWhoIsQuery(Query):
Expand Down Expand Up @@ -119,7 +122,8 @@ async def _run(self) -> None:

reader, writer = await self._create_connection((self.server, self._whois_port), self.timeout)
self.query_output = await self._send_and_recv(reader, writer, data)
whois_server = self._find_match(regex=r"WHOIS server: *(.+)", blob=self.query_output)
whois_server = self._find_match(regex=r"whois server: *(.+)", blob=self.query_output)
whois_server = whois_server.replace(' ', '').rstrip(':')
if whois_server:
self.server = whois_server
reader, writer = await self._create_connection((self.server, self._whois_port), self.timeout)
Expand Down

0 comments on commit 44c7418

Please sign in to comment.