Skip to content

Commit

Permalink
Merge pull request #29 from pogzyb/fix-get-domain-and-tld
Browse files Browse the repository at this point in the history
v0.4.0 fixes and improvements
  • Loading branch information
pogzyb committed Oct 8, 2021
2 parents 6577e91 + d9fbecd commit 44c7418
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 147 deletions.
2 changes: 1 addition & 1 deletion asyncwhois/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
'rdap_domain_lookup',
'aio_rdap_domain_lookup'
]
__version__ = '0.4.0'
__version__ = '0.4.1'


def lookup(url: str, timeout: int = 10) -> PyWhoIs:
Expand Down
267 changes: 133 additions & 134 deletions asyncwhois/parser.py
Original file line number Diff line number Diff line change
def _init_parser(tld: str) -> BaseParser:
    """
    Retrieves the parser instance which can most accurately extract
    key/value pairs from the whois server output for the given `tld`.

    :param tld: the top level domain
    :return: instance of BaseParser or a BaseParser sub-class
    """
    # The cases specified below do not follow a common format, and so must be
    # parsed with custom defined regex sub classes. The mapping stores the
    # parser *classes* (not instances) so that only the single parser that is
    # actually needed gets constructed, instead of eagerly instantiating all
    # ~65 parsers on every call.
    tld_parsers = {
        'ae': RegexAE,
        'ar': RegexAR,
        'at': RegexAT,
        'au': RegexAU,
        'aw': RegexAW,
        'ax': RegexAX,
        'be': RegexBE,
        'br': RegexBR,
        'by': RegexBY,
        'cc': RegexCC,
        'ch': RegexCH,
        'cl': RegexCL,
        'cn': RegexCN,
        'cr': RegexCR,
        'cz': RegexCZ,
        'de': RegexDE,
        'dk': RegexDK,
        'edu': RegexEDU,
        'ee': RegexEE,
        'eu': RegexEU,
        'fi': RegexFI,
        'fr': RegexFR,
        'ge': RegexGE,
        'gg': RegexGG,
        'gq': RegexGQ,
        'hk': RegexHK,
        'hr': RegexHR,
        'id': RegexID,
        'ie': RegexIE,
        'il': RegexIL,
        'ir': RegexIR,
        'is': RegexIS,
        'it': RegexIT,
        'jp': RegexJP,
        'kg': RegexKG,
        'kr': RegexKR,
        'kz': RegexKZ,
        'li': RegexLI,
        'lu': RegexLU,
        'lv': RegexLV,
        'ma': RegexMA,
        'ml': RegexML,
        'mx': RegexMX,
        'nl': RegexNL,
        'no': RegexNO,
        'nu': RegexNU,
        'nz': RegexNZ,
        'om': RegexOM,
        'pe': RegexPE,
        'pl': RegexPL,
        'pt': RegexPT,
        'rf': RegexRF,
        'ro': RegexRO,
        'ru': RegexRU,
        'sa': RegexSA,
        'se': RegexSE,
        'si': RegexSI,
        'sk': RegexSK,
        'su': RegexSU,
        'tk': RegexTK,
        'tr': RegexTR,
        'tw': RegexTW,
        'ua': RegexUA,
        'uk': RegexUK,
        've': RegexVE
    }
    # The BaseParser can handle all "Generic" and some "Country-Code" TLDs.
    # If the parsed output of lookup is not what you expect or even incorrect,
    # check for and then modify the existing Regex subclass or create a new one.
    return tld_parsers.get(tld, BaseParser)()


# ==============================
Expand Down Expand Up @@ -1586,3 +1524,64 @@ def parse(self, blob: str) -> Dict[str, Any]:
if addresses:
parsed_output[BaseKeys.REGISTRANT_ADDRESS] = ', '.join(addresses)
return parsed_output


class RegexML(BaseParser):
    """Parser for `.ml` (Point ML) whois server output.

    The `.ml` registry does not use the common ``Key: value`` layout for the
    domain name, status, name servers, or contact blocks, so those fields
    need custom handling on top of the base regex table.
    """

    _ml_expressions = {
        BaseKeys.EXPIRES: r'Record will expire on: *(.+)',
        BaseKeys.CREATED: r'Domain registered: *(.+)',
        BaseKeys.DOMAIN_NAME: r'Domain name:\n*(.+)\sis\s',
        BaseKeys.STATUS: r'Domain name:\n.+\sis\s*(.+)'
    }

    def __init__(self):
        super().__init__()
        self.update_reg_expressions(self._ml_expressions)

    def parse(self, blob: str) -> Dict[str, Any]:
        """Extract whois key/value pairs from a raw `.ml` server response.

        :param blob: the raw whois query output
        :return: dict mapping BaseKeys to the parsed values
        """
        parsed_output = super().parse(blob)
        parsed_output[BaseKeys.NAME_SERVERS] = self.find_multiline_match('Domain nameservers:', blob)
        for contact in ('Admin', 'Billing', 'Owner', 'Tech'):
            # isolate the appropriate contact block
            contact_blob = re.search(f'{contact} contact:\n(.+)\n\n', blob, re.DOTALL)
            if contact_blob:
                if contact == 'Owner':
                    # map "owner" to registrant
                    contact = 'Registrant'
                for key in ('Organization', 'Name', 'Address', 'Zipcode', 'City',
                            'State', 'Country', 'Phone', 'Fax', 'E-mail'):
                    # special case: Email -> E-mail
                    if key == 'E-mail':
                        attr_name = f'{contact}_Email'.upper()
                    else:
                        attr_name = f'{contact}_{key}'.upper()
                    # BUGFIX: getattr without a default raises AttributeError for
                    # contact/field combinations that BaseKeys does not define;
                    # default to None so the guard below actually skips them.
                    base_key = getattr(BaseKeys, attr_name, None)
                    if not base_key:
                        continue
                    # update parser dict
                    parsed_output[base_key] = self.find_match(f'{key}: *(.+)', contact_blob.group(0))
        date_format = '%m/%d/%Y'  # example: 05/28/2013
        # BUGFIX: only convert dates that were actually found; calling
        # strptime(None, ...) would raise TypeError when a field is missing.
        for date_key in (BaseKeys.EXPIRES, BaseKeys.CREATED):
            if parsed_output.get(date_key):
                parsed_output[date_key] = datetime.datetime.strptime(parsed_output[date_key], date_format)
        return parsed_output


class RegexOM(BaseParser):
    """Parser for `.om` (Oman) whois server output."""

    _om_expressions = {
        BaseKeys.REGISTRAR: r'Registrar Name: *(.+)',
        BaseKeys.UPDATED: r'Last Modified: *(.+)',
    }
    # The Registrant and Tech contact sections share an identical field
    # layout, so their patterns are generated instead of written out twice.
    # Note the attribute spelling ("Organization") differs from the label
    # used in the whois output ("Organisation").
    for _role in ('Registrant', 'Tech'):
        for _attr_field, _label in (('CITY', 'City'),
                                    ('COUNTRY', 'Country'),
                                    ('ORGANIZATION', 'Organisation'),
                                    ('NAME', 'Name'),
                                    ('EMAIL', 'Email')):
            _base_key = getattr(BaseKeys, f'{_role.upper()}_{_attr_field}')
            _om_expressions[_base_key] = rf'{_role} Contact {_label}: *(.+)'
    _om_expressions[BaseKeys.NAME_SERVERS] = r'Name Server: *(.+)'
    # drop the loop temporaries from the class namespace
    del _role, _attr_field, _label, _base_key

    def __init__(self):
        super().__init__()
        self.update_reg_expressions(self._om_expressions)
13 changes: 9 additions & 4 deletions asyncwhois/pywhois.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ async def _aio_get_domain_and_tld(self, url: str):
if len(tld.split('.')) > 1:
tld = tld.split('.')[-1]

self.subdomain = extract_result.subdomain
self.domain = extract_result.domain
self.suffix = extract_result.suffix
self.tld = tld

return extract_result.domain, tld

def _get_domain_and_tld(self, url: str):
Expand Down Expand Up @@ -200,12 +205,12 @@ async def _aio_from_url(cls, url: str, timeout: int):
@classmethod
async def _aio_rdap_domain_from_url(cls, url: str, http_client: Any = None):
"""
Performs an RDAP query by leveraging whodap.aio_lookup_domain;
Performs an RDAP query by leveraging `whodap.aio_lookup_domain`;
stores the resulting RDAP output into "query_output" and a WHOIS friendly
key/value pair dictionary into "parser_output".
:param url: the given url to search
:param http_client_kws: keyword arguments passed directly to the underlying httpx client
:param http_client: the underlying httpx client to pass to `whodap.aio_lookup_domain`
:return: initialized instance of PyWhoIs
"""
pywhois = cls()
Expand All @@ -224,12 +229,12 @@ async def _aio_rdap_domain_from_url(cls, url: str, http_client: Any = None):
@classmethod
def _rdap_domain_from_url(cls, url: str, http_client: Any = None):
"""
Performs an RDAP query by leveraging whodap.lookup_domain;
Performs an RDAP query by leveraging `whodap.lookup_domain`;
stores the resulting RDAP output into "query_output" and a WHOIS friendly
key/value pair dictionary into "parser_output".
:param url: the given url to search
:param http_client_kws: keyword arguments passed directly to the underlying httpx client
:param http_client: the underlying httpx client to pass to `whodap.lookup_domain`
:return: initialized instance of PyWhoIs
"""
pywhois = cls()
Expand Down
20 changes: 12 additions & 8 deletions asyncwhois/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,25 +46,26 @@ def _run(self):
self.server = self._find_match(regex=r"refer: *(.+)", blob=iana_result_blob)
if not self.server:
raise QueryError(f"Could not find a whois server for {self.domain}")

# connect to <server>:43
with self._create_connection((self.server, self._whois_port), self.timeout) as conn:
# save output into "query_output"
self.query_output = self._send_and_recv(conn, data)
# check for "authoritative" whois server via regex
whois_server = self._find_match(regex=r"WHOIS server: *(.+)", blob=self.query_output)
if whois_server:
whois_server = self._find_match(regex=r"whois server: *(.+)", blob=self.query_output)
whois_server = whois_server.replace(' ', '').rstrip(':')
if whois_server and self.server != whois_server:
# if there is a more authoritative source; connect and re-query
self.server = whois_server
with self._create_connection((self.server, self._whois_port), self.timeout) as conn:
# save output into "query_output"
self.query_output = self._send_and_recv(conn, data)

except ConnectionRefusedError:
raise QueryError(f'')

except ConnectionResetError:
server = self.server or self._iana_server
raise QueryError(f'"Connection reset by peer" when communicating with {server}:43')
except socket.timeout:
server = self.server or self._iana_server
raise QueryError(f'Socket timed out when attempting to reach {server}:43')

@staticmethod
def _send_and_recv(conn: socket.socket, data: str) -> str:
Expand All @@ -83,7 +84,9 @@ def _create_connection(address: Tuple[str, int], timeout: int) -> socket.socket:
try:
return socket.create_connection(address=address, timeout=timeout)
except socket.timeout:
raise QueryError(f'Could not reach WHOIS server at {address[0]}:{address[1]}')
raise QueryError(f'Connection timeout for WHOIS server at {address[0]}:{address[1]}')
except socket.gaierror:
raise QueryError(f'Could not get address information for {address[0]}:{address[1]}')


class AsyncWhoIsQuery(Query):
Expand Down Expand Up @@ -119,7 +122,8 @@ async def _run(self) -> None:

reader, writer = await self._create_connection((self.server, self._whois_port), self.timeout)
self.query_output = await self._send_and_recv(reader, writer, data)
whois_server = self._find_match(regex=r"WHOIS server: *(.+)", blob=self.query_output)
whois_server = self._find_match(regex=r"whois server: *(.+)", blob=self.query_output)
whois_server = whois_server.replace(' ', '').rstrip(':')
if whois_server:
self.server = whois_server
reader, writer = await self._create_connection((self.server, self._whois_port), self.timeout)
Expand Down

0 comments on commit 44c7418

Please sign in to comment.