## Parsing the Mozilla PSL
The PSL is composed of three sections:
- ICANN DOMAINS country code
- ICANN DOMAINS generic
- PRIVATE DOMAINS

The TLDs of the first two sections eTLDs should be included in the `tld-list` of IANA, which separates Country-Code and Generic TLD.

The last one instead is available only in the Mozilla list, because these eTLDs are less *standard*.

In [1]:
# getting the IANA list with category Country-Code and Generic

import requests
from bs4 import BeautifulSoup
import pandas as pd
import os, datetime, time
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

def get_file(fname, fetch_func):
    if os.path.exists(fname):
        ts = os.path.getmtime(fname)
        tdelta = datetime.datetime.utcnow() - datetime.datetime.utcfromtimestamp(ts)
        if tdelta.days < 7:
            return
    flines = fetch_func(fname)
    with open(fname, 'w') as f:
        f.writelines(flines)
    return


get_file(
    'iana.csv',
    lambda fname: pd.read_html('https://www.iana.org/domains/root/db', attrs = {'id': 'tld-table'})[0].to_csv()
)
get_file(
    'tldlist.csv',
    lambda fname: pd.read_csv('https://tld-list.com/df/tld-list-details.csv').to_csv()
)
get_file(
    'public_suffix_list.dat',
    lambda fname: requests.get(
        'https://publicsuffix.org/list/public_suffix_list.dat',
        verify=False,
        allow_redirects=True,
        timeout=5
    ).text
)


df_iana = pd.read_csv('iana.csv', index_col=0)

In [2]:
# getting the IANA list with category Country-Code and Generic

df_iana = pd.read_csv('iana.csv', index_col=0)
df_tldlist = pd.read_csv('tldlist.csv', index_col=0)
        
df_iana = df_iana.rename(columns={
    'Domain': 'tld', 'Type': 'type', 'TLD Manager': 'manager'
})

df_tldlist = df_tldlist.rename(columns={'Punycode': 'punycode'})

if (df_iana['tld'].apply(lambda tld: tld.count('.') > 1)).sum() > 0:
    raise 'Unexpected: TLDs should have only one point each.'

# cleaning TLDs from points and special right-to-left character
df_iana['tld'] = df_iana.tld.str.replace('.', '', n=1, regex=False)
df_iana['tld'] = df_iana.tld.str.replace('\u200f', '', n=1, regex=False)
df_iana['tld'] = df_iana.tld.str.replace('\u200e', '', n=1, regex=False)

# converting Type labels to IANA naming convention
df_tldlist['Type'] = df_tldlist['Type'].str.replace('gTLD', 'generic', regex=False)
df_tldlist['Type'] = df_tldlist['Type'].str.replace('ccTLD', 'country-code', regex=False)
df_tldlist['Type'] = df_tldlist['Type'].str.replace('grTLD', 'generic-restricted',regex=False)
df_tldlist['Type'] = df_tldlist['Type'].str.replace('sTLD', 'sponsored', regex=False)


df = df_iana.merge(df_tldlist, left_on='tld', right_on='TLD', how='outer')

# check TLDs types are the same except for 'music' and pakistan-one TLD
if df[(~(df.Type == df['type']))].shape[0] > 2:
    print(df[(~(df.Type == df['type']))])
    raise 'Error: Types not coincided'

# check TLDs are the same except for 'music' TLD
iana_notin_tldlist = df[df.TLD.isna()].tld
if iana_notin_tldlist.shape[0] > 0:
    print(f'Warning: IANA has {iana_notin_tldlist.shape[0]} TLDs not contained in tldlist')
tldlist_notin_iana = df[df.tld.isna()].TLD
if tldlist_notin_iana.shape[0] > 0:
    print(f'Warning: tldlist has {tldlist_notin_iana.shape[0]} TLDs not contained in IANA')

# merging not shared TLDs
nans = df.tld.isna()
df['tld'].values[nans] = df[nans].TLD
df['type'].values[nans] = df[nans].Type
df['manager'].values[nans] = df[nans].Sponsor

# creating a column with TLD and punycode when TLD is in a not-english like language
# punicode_isna = df['punycode'].isna()
# df['punycode'].values[punicode_isna] = df[punicode_isna].tld

df_tld = df[['tld', 'punycode', 'type', 'manager']].copy()

df_tld[df_tld.tld == '嘉里大酒店']



Unnamed: 0,tld,punycode,type,manager
1553,嘉里大酒店,xn--w4r85el8fhu5dnra,generic,Kerry Trading Co. Limited


In [60]:
# getting the PSL list, considering the sections defined above, and parsing the comments.
import re


with open('public_suffix_list.dat', 'r') as f:
    psl_lines = [ l.replace('\n', '') for l in f.readlines()]


sections_delimiters = [
    '// ===BEGIN ICANN DOMAINS===',
    '// newGTLDs',
    '// ===BEGIN PRIVATE DOMAINS==='
]
sections_names = [
    'icann',
    'icann-new',
    'private-domains'
]

regex_punycode = r'^\/\/ (xn--.*?) .*$'
regex_comment = r'^\/\/ (?!Submitted)(.*?)(?: : )(.*?)$'

line_start = 1 + psl_lines.index('// ===BEGIN ICANN DOMAINS===')

sd = 0
manager = None
punycode = None
values = []
last_tld = ''
punycode_found = False
for i in range(line_start, len(psl_lines)):
    line = psl_lines[i]
    if len(line) == 0: continue
    if sd+1 < len(sections_delimiters) and line.find(sections_delimiters[sd+1]) == 0:
        sd += 1
    if line.find('//') == 0:
        punycode_match = re.match(regex_punycode, line)
        if punycode_match is not None:
            punycode_found = True
            punycode = punycode_match[1]
        else:
            first_comment_match = re.match(regex_comment, line)
            if first_comment_match is not None:
                manager = first_comment_match[1]
        continue
        
    tld = line
    tld = tld[tld.rfind('.')+1:]
    
    if punycode == 'xn--mgba3a4fra.ir':
        print(punycode_found, tld, f'last: {last_tld}', last_tld != tld and not punycode_found)
    
    if last_tld != tld and not punycode_found:
        punycode = None
    
    punycode_found = False
    
    values.append([ sections_names[sd], tld, punycode, line, manager ])
    
    last_tld = tld
    pass

df_etld = pd.DataFrame(values, columns=['type', 'tld', 'punycode', 'suffix', 'manager'])

df_etld = df_etld[['type', 'tld', 'punycode', 'suffix', 'manager']].reset_index()

True ir last: ir False
False is last: ir True


In [73]:
# the merge will be done with the tld column

df = df_etld.merge(df_tld, left_on='tld', right_on='tld', suffixes=['_etld', '_tld'], how='outer')

df['from_psl'] = ~df['type_etld'].isna()
df['from_iana'] = ~df['type_tld'].isna()

if ((df['from_psl'] == False) & (df['from_iana'] == False)).sum() > 0:
    print('Warning: something is wrong')
    

df = df.reset_index(drop=True)

df['type'] = df['type_tld']

icann_pd = (df['type_etld'] == 'private-domains')
df['type'].values[icann_pd] = 'private-domains'


icann_pd = (df['type_tld'].isna() & df['punycode_etld'].isna())
df['type'].values[icann_pd] = 'other'

icann_pd = (df['type_tld'].isna() & (~df['punycode_etld'].isna()))
df['type'].values[icann_pd] = 'orphan-punycode'


df = df[['suffix', 'tld', 'punycode_tld', 'punycode_etld', 'from_psl', 'from_iana', 'type', 'type_tld', 'type_etld', 'manager_etld', 'manager_tld']]

suffix_na = df.suffix.isna()
if df[suffix_na].shape[0] > 0:
    print(f'Info: there are {df[suffix_na].shape[0]} NaN Suffixes')
df.suffix.values[suffix_na] = df.tld[suffix_na]

tld_na = df.tld.isna()
if df[tld_na].shape[0] > 0:
    print(f'Warning: there are {df[tld_na].shape[0]} NaN TLDs')

df = df.fillna('')

df.to_csv('tld_and_suffixes.csv')

df[df['from_psl'] ^ df['from_iana']].to_csv('differents.csv')

Info: there are 105 NaN Suffixes
