## Parsing the Mozilla PSL
The PSL is composed of three sections:
- ICANN DOMAINS country code
- ICANN DOMAINS generic
- PRIVATE DOMAINS

The TLDs of the first two sections eTLDs should be included in the `tld-list` of IANA, which separates Country-Code and Generic TLD.

The last one instead is available only in the Mozilla list, because these eTLDs are less *standard*.

In [1]:
# getting the IANA list with category Country-Code and Generic

import requests
from bs4 import BeautifulSoup
import pandas as pd
import os, datetime, time
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

ds = pd.read_csv('/home/princio/Desktop/malware_detection/nn/nn/dataset_training.csv')

def get_file(fname, fetch_func):
    if os.path.exists(fname):
        ts = os.path.getmtime(fname)
        tdelta = datetime.datetime.utcnow() - datetime.datetime.utcfromtimestamp(ts)
        if tdelta.days < 7:
            return
    flines = fetch_func(fname)
    with open(fname, 'w') as f:
        f.writelines(flines)
    return


get_file(
    'iana.csv',
    lambda fname: pd.read_html('https://www.iana.org/domains/root/db', attrs = {'id': 'tld-table'})[0].to_csv()
)
get_file(
    'tldlist.csv',
    lambda fname: pd.read_csv('https://tld-list.com/df/tld-list-details.csv').to_csv()
)
get_file(
    'public_suffix_list.dat',
    lambda fname: requests.get(
        'https://publicsuffix.org/list/public_suffix_list.dat',
        verify=False,
        allow_redirects=True,
        timeout=5
    ).text
)


df_iana = pd.read_csv('iana.csv', index_col=0)

In [2]:
# getting the IANA list with category Country-Code and Generic

df_iana = pd.read_csv('iana.csv', index_col=0)
df_tldlist = pd.read_csv('tldlist.csv', index_col=0)
        
df_iana = df_iana.rename(columns={
    'Domain': 'tld', 'Type': 'type', 'TLD Manager': 'manager'
})

df_tldlist = df_tldlist.rename(columns={'Punycode': 'punycode'})

if (df_iana['tld'].apply(lambda tld: tld.count('.') > 1)).sum() > 0:
    raise 'Unexpected: TLDs should have only one point each.'

# cleaning TLDs from points and special right-to-left character
df_iana['tld'] = df_iana.tld.str.replace('.', '', n=1, regex=False)
df_iana['tld'] = df_iana.tld.str.replace('\u200f', '', n=1, regex=False)
df_iana['tld'] = df_iana.tld.str.replace('\u200e', '', n=1, regex=False)

# converting Type labels to IANA naming convention
df_tldlist['Type'] = df_tldlist['Type'].str.replace('gTLD', 'generic', regex=False)
df_tldlist['Type'] = df_tldlist['Type'].str.replace('ccTLD', 'country-code', regex=False)
df_tldlist['Type'] = df_tldlist['Type'].str.replace('grTLD', 'generic-restricted',regex=False)
df_tldlist['Type'] = df_tldlist['Type'].str.replace('sTLD', 'sponsored', regex=False)


df = df_iana.merge(df_tldlist, left_on='tld', right_on='TLD', how='outer')

# check TLDs types are the same except for 'music' and pakistan-one TLD
if df[(~(df.Type == df['type']))].shape[0] > 2:
    print(df[(~(df.Type == df['type']))])
    raise 'Error: Types not coincided'

# check TLDs are the same except for 'music' TLD
iana_notin_tldlist = df[df.TLD.isna()].tld
if iana_notin_tldlist.shape[0] > 0:
    print(f'Warning: IANA has {iana_notin_tldlist.shape[0]} TLDs not contained in tldlist')
tldlist_notin_iana = df[df.tld.isna()].TLD
if tldlist_notin_iana.shape[0] > 0:
    print(f'Warning: tldlist has {tldlist_notin_iana.shape[0]} TLDs not contained in IANA')

# merging not shared TLDs
nans = df.tld.isna()
df['tld'].values[nans] = df[nans].TLD
df['type'].values[nans] = df[nans].Type
df['manager'].values[nans] = df[nans].Sponsor

# creating a column with TLD and punycode when TLD is in a not-english like language
# punicode_isna = df['punycode'].isna()
# df['punycode'].values[punicode_isna] = df[punicode_isna].tld

df_tld = df[['tld', 'punycode', 'type', 'manager']].copy()

df_tld[df_tld.tld == '嘉里大酒店']



Unnamed: 0,tld,punycode,type,manager
1553,嘉里大酒店,xn--w4r85el8fhu5dnra,generic,Kerry Trading Co. Limited


In [3]:
# getting the PSL list, considering the sections defined above, and parsing the comments.
import re


with open('public_suffix_list.dat', 'r') as f:
    psl_lines = [ l.replace('\n', '') for l in f.readlines()]


sections_delimiters = [
    '// ===BEGIN ICANN DOMAINS===',
    '// newGTLDs',
    '// ===BEGIN PRIVATE DOMAINS==='
]
sections_names = [
    'icann',
    'icann-new',
    'private-domains'
]

regex_punycode = r'^\/\/ (xn--.*?) .*$'
regex_comment = r'^\/\/ (?!Submitted)(.*?)(?: : )(.*?)$'

line_start = 1 + psl_lines.index('// ===BEGIN ICANN DOMAINS===')

sd = 0
manager = None
punycode = None
values = []
last_tld = ''
punycode_found = False
for i in range(line_start, len(psl_lines)):
    line = psl_lines[i]
    if len(line) == 0: continue
    if sd+1 < len(sections_delimiters) and line.find(sections_delimiters[sd+1]) == 0:
        sd += 1
    if line.find('//') == 0:
        punycode_match = re.match(regex_punycode, line)
        if punycode_match is not None:
            punycode_found = True
            punycode = punycode_match[1]
        else:
            first_comment_match = re.match(regex_comment, line)
            if first_comment_match is not None:
                manager = first_comment_match[1]
        continue
        
    tld = line
    tld = tld[tld.rfind('.')+1:]
    
    if punycode == 'xn--mgba3a4fra.ir':
        print(punycode_found, tld, f'last: {last_tld}', last_tld != tld and not punycode_found)
    
    if last_tld != tld and not punycode_found:
        punycode = None
    
    punycode_found = False
    
    values.append([ sections_names[sd], tld, punycode, line, manager ])
    
    last_tld = tld
    pass

df_etld = pd.DataFrame(values, columns=['type', 'tld', 'punycode', 'suffix', 'manager'])

df_etld = df_etld[['type', 'tld', 'punycode', 'suffix', 'manager']].reset_index()

True ir last: ir False
False is last: ir True


In [4]:
# the merge will be done with the tld column

df = df_etld.merge(df_tld, left_on='tld', right_on='tld', suffixes=['_etld', '_tld'], how='outer')

df['from_psl'] = ~df['type_etld'].isna()
df['from_iana'] = (~df['type_tld'].isna()) & (df['tld'] == df['suffix'])

if ((df['from_psl'] == False) & (df['from_iana'] == False)).sum() > 0:
    print('Warning: something is wrong')

df = df.reset_index(drop=True)

df['type'] = df['type_tld']

icann_pd = (df['type_etld'] == 'private-domains')
df['type'].values[icann_pd] = 'private-domains'


icann_pd = (df['type_tld'].isna() & df['punycode_etld'].isna())
df['type'].values[icann_pd] = 'other'

icann_pd = (df['type_tld'].isna() & (~df['punycode_etld'].isna()))
df['type'].values[icann_pd] = 'orphan-punycode'


df = df[['suffix', 'tld', 'punycode_tld', 'punycode_etld', 'from_psl', 'from_iana', 'type', 'type_tld', 'type_etld', 'manager_etld', 'manager_tld']]

suffix_na = df.suffix.isna()
if df[suffix_na].shape[0] > 0:
    print(f'Info: there are {df[suffix_na].shape[0]} NaN Suffixes')
df.suffix.values[suffix_na] = df.tld[suffix_na]

tld_na = df.tld.isna()
if df[tld_na].shape[0] > 0:
    print(f'Warning: there are {df[tld_na].shape[0]} NaN TLDs')

df = df.fillna('')

df.to_csv('tld_and_suffixes.csv')

df[df['from_psl'] ^ df['from_iana']].to_csv('differents.csv')

df_etld = df.copy()

Info: there are 106 NaN Suffixes


In [5]:
import time

times = {}
starts = {}
def tstart(label):
    starts[label] = time.time()
def tstop(label):
    if label not in times: times[label] = 0
    times[label] += time.time() - starts[label]

ds['tld'] = ds.dn.apply(lambda dn: dn[1 + dn.rfind('.'):])

# ds_label = ('@@.@@.@@.' + ds['dn']).str.rsplit('.', 3, expand=True).replace(r'@@(?:\.@@)*', None, regex=True)
# ds_label = ds_label[ds_label.columns[::-1]]

# df_label = ('@@.@@.@@.' + df['suffix']).str.rsplit('.', 3, expand=True).replace(r'@@(?:\.@@)*', None, regex=True)
# df_label = df_label[df_label.columns[::-1]]

df = df_etld.copy()

masks = {}
df['suffix2'] = '.' + df.suffix.str.replace(r'^\*.', '', regex=True)
df['sl'] = df.suffix.apply(len)

df = df.sort_values(by='sl', ascending=False)
df['isp'] = df.type == 'private-domains'
df = df[['suffix', 'tld', 'type', 'suffix2', 'sl', 'isp']]
df['labels'] = df.suffix2.str.count('\.')
df = df.reset_index()

df_pv = df[df.isp == True]
df_npvt = df[df.isp == False]
for tld in df.tld.drop_duplicates().values:
    masks[tld] = {
        'pvt': df[df.isp & (df.tld == tld)].copy(),
        'npvt': df[(~df.isp) & (df.tld == tld)].copy(),
    }


masks2 = {}
for tld in df.tld.drop_duplicates().values:
    masks2[tld] = df[df.tld == tld]
ok = []

ds[['suffix_npvt', 'suffix_pvt', 'type_npvt', 'type_pvt']] = ''

col_suffix2 = df.columns.to_list().index('suffix2')
col_index = df.columns.to_list().index('index')

col_suffix2, col_index

(4, 0)

In [6]:


df

Unnamed: 0,index,suffix,tld,type,suffix2,sl,isp,labels
0,864,s3.dualstack.ap-northeast-2.amazonaws.com,com,private-domains,.s3.dualstack.ap-northeast-2.amazonaws.com,41,True,5
1,866,s3.dualstack.ap-southeast-1.amazonaws.com,com,private-domains,.s3.dualstack.ap-southeast-1.amazonaws.com,41,True,5
2,867,s3.dualstack.ap-southeast-2.amazonaws.com,com,private-domains,.s3.dualstack.ap-southeast-2.amazonaws.com,41,True,5
3,863,s3.dualstack.ap-northeast-1.amazonaws.com,com,private-domains,.s3.dualstack.ap-northeast-1.amazonaws.com,41,True,5
4,884,s3-website.ap-northeast-2.amazonaws.com,com,private-domains,.s3-website.ap-northeast-2.amazonaws.com,39,True,4
...,...,...,...,...,...,...,...,...
9422,7074,st,st,country-code,.st,2,False,1
9423,7087,su,su,country-code,.su,2,False,1
9424,4319,ke,ke,country-code,.ke,2,False,1
9425,7140,sv,sv,country-code,.sv,2,False,1


In [7]:
if False:
    import numpy as np

    times = {}
    starts = {}

    time_avg = 0
    time_avg2 = 0

    sfx_pvt_len_avg = 0
    sfx_npvt_len_avg = 0


    for idx, row in ds.iterrows():
        dn = row.dn
        tld = row.tld

        if tld not in masks:
            ok.append(dn)
            continue

        dn_max_suffix_labels = dn.count('.') - 1

        for isp in [ 'pvt', 'npvt' ]:
            tstart(f'new {isp}')

            d = masks[tld][isp]

            rfind = [ dn.rfind(suffix2) != -1 for suffix2 in d.suffix2.values ]
            suff = d[rfind]

            if suff.shape[0] > 0:
                ds.at[idx, f'suffix_{isp}'] = suff.suffix.iloc[0]
                ds.at[idx, f'type_{isp}'] = suff.type.iloc[0]
            else:
                ds.at[idx, f'suffix_{isp}'] = None
                ds.at[idx, f'type_{isp}'] = None

        tstop(f'new {isp}')

        if idx > 0 and idx % 1_000 == 0:
            print(f'{idx}/{ds.shape[0]}')
            for l in times:
                print('%20s\t%f' % (l, times[l] / idx))
            print()

        pass

    ds.to_csv('/tmp/bibo/ds.csv')

1000/674898
            new npvt	0.001306

2000/674898
            new npvt	0.000831

3000/674898
            new npvt	0.000658

4000/674898
            new npvt	0.000576

5000/674898
            new npvt	0.000528

6000/674898
            new npvt	0.000486

7000/674898
            new npvt	0.000478

8000/674898
            new npvt	0.000594



KeyboardInterrupt: 

### Mask Boolean is not so fast
### Apply is slow compared with list comprehension
```

        tstart(f'get df {isp}')
        d = masks[tld][isp]
        tstop(f'get df {isp}')

        tstart(f'query {isp}')
        d.query('labels < @dn_max_suffix_labels')
        tstop(f'query {isp}')
        tstart(f'mask labels 1 {isp}')
        kk = d.labels <= dn_max_suffix_labels
        tstop(f'mask labels 1 {isp}')
        tstart(f'mask labels 2 {isp}')
        d = d[kk]
        tstop(f'mask labels 2 {isp}')
```

In [None]:

#     if idx > 0 and idx % 1000 == 0:
#         for l in times:
#             print('%10s\t%f' % (l, times[l] / idx))
#         print()   
#     tstart('all')
#     tstart('masks')
#     # remove unmatching suffixes
#     rfind = masks2[tld].suffix2.apply(lambda suffix2: dn.rfind(suffix2))
#     rfind = rfind[rfind != -1]
#     tstop('masks')
    
#     tstart('loc')
#     etlds = masks2[tld].loc[rfind.index]#.sort_values(by='sl', ascending=False) # series of etld
#     tstop('loc')
    
#     tstart('masks2.0')
    
#     if etlds.shape[0] == 1:
#         if etlds.iloc[0].isp:
#             suffix_npvt = etlds.iloc[0].suffix
#             type_npvt
    
#     tstart('masks2.1')
#     privates = etlds[etlds.isp]
#     tstop('masks2.1')
    
#     tstart('masks2.2')
#     n_privates = etlds[~etlds.isp]
#     tstop('masks2.2')
    
#     tstop('all')
        
    
#     tstart('setting')
#     start = time.time()
#     if n_privates.shape[0] > 0:
#         ds.at[idx, 'suffix_npvt'] = n_privates.suffix.iloc[0]
#         ds.at[idx, 'type_npvt'] = n_privates.type.iloc[0]
    
#     if privates.shape[0] > 0:
#         ds.at[idx, 'suffix_pvt'] = privates.suffix.iloc[0]
    
#     if rfind.shape[0] > 2:
#         print(f'Found {rfind.shape[0]} suffixes for {dn}')
#         if dn.find('blogspot') == -1:
#             etlds.to_csv(f'/tmp/bibo/{rfind.shape[0]}_{dn}.csv')
#     elif rfind.shape[0] == 0:
#         print(f'No etlds found for {dn}')
        
#     tstop('setting')

In [8]:
regexs = {} # nested dict, each level is a label, starting from the first from-the-right

sfx2 = df.suffix2.copy()
maxLabels_suffix = sfx2.str.count('\.').max()
# sfx2 = sfx2.str.split('.', expand=True)
# sfx2 = sfx2[sfx2.columns[::-1]]
# sfx2 = sfx2.apply(lambda row: )
# sfx2

sfx2 = sfx2.str[1:].apply(lambda s: ('@@.'*(maxLabels_suffix - s.count('.') - 1)) + s).str.split('.', expand=True)
sfx2 = sfx2[sfx2.columns[::-1]]
sfx2.replace('@@', '')
sfxPerLabels = sfx2.replace('@@', '').copy()
sfxPerLabels = sfxPerLabels.rename(columns={ 4:0, 3:1, 2:2, 1:3, 0:4})

def lop(sfx, l, labelsParent):
    if l not in sfx.columns:
        return None
    lSuffixes = sfx[l].drop_duplicates()
    if lSuffixes.shape[0] == 1 and lSuffixes.iloc[0] == '':
        return None
    print(labelsParent)
    labelsChild = []
    for _, lSuffix in sfx[l].drop_duplicates().iteritems():
        labelsChild.append([lSuffix])
        lop(sfx[sfx[l] == lSuffix], l+1, labelsChild[:-1])
    labelsParent.append(labelsChild)

labels2 = []
lop(sfxPerLabels, 0, labels2)

print(labels2)
# b = sfxPerLabels.groupby([0,1,2,3])[4].aggregate(list)

# display(b.loc[('com', 'amazonaws')])

# b.to_dict('index')
# for _, lSuffix0 in sfxPerLabels[0].drop_duplicates().iteritems():
#     labels[lSuffix0] = {}
#     sfxLabel0 = sfxPerLabels[sfxPerLabels[0] == lSuffix0]
#     for _, lSuffix1 in sfxLabel0[[0, 1]].drop_duplicates()[1].iteritems():
#         labels[lSuffix0][lSuffix1] = {}
#         sfxLabel01 = sfxLabel0[sfxLabel0[1] == lSuffix1]
#         for _, lSuffix2 in sfxLabel01[2].drop_duplicates().iteritems():
#             labels[lSuffix0][lSuffix1][lSuffix2] = {}
#             sfxLabel012 = sfxLabel01[sfxLabel01[2] == lSuffix2]
#             for _, lSuffix3 in sfxLabel012[3].drop_duplicates().iteritems():
#                 labels[lSuffix0][lSuffix1][lSuffix2][lSuffix3] = {}
#                 print(lSuffix3)
#             print(lSuffix2)
#         print(lSuffix1)
#     print(lSuffix0)
        
# labels

# {level: b.xs(level).to_dict(orient='index') for level in b.index.levels[0]}

# def nest(d: dict) -> dict:
#     print(d)
#     result = {}
#     for key, value in d.items():
#         target = result
#         for k in key[:-1]:  # traverse all keys but the last
#             target = target.setdefault(k, {})
#         target[key[-1]] = value
#     return result

# def df_to_nested_dict(df: pd.DataFrame) -> dict:
#     d = df.to_dict(orient='index')
#     return {k: nest(v) for k, v in d.items()}

# df_to_nested_dict(b.to_frame())

# b.to_frame().to_dict(orient='index')

[]
[]
[]
[]
[]
[['ap-northeast-2']]
[]
[['ap-northeast-2'], ['ap-southeast-1']]
[]
[['ap-northeast-2'], ['ap-southeast-1'], ['ap-southeast-2']]
[]
[['ap-northeast-2'], ['ap-southeast-1'], ['ap-southeast-2'], ['ap-northeast-1']]
[]
[['ap-northeast-2'], ['ap-southeast-1'], ['ap-southeast-2'], ['ap-northeast-1'], ['ca-central-1']]
[]
[['ap-northeast-2'], ['ap-southeast-1'], ['ap-southeast-2'], ['ap-northeast-1'], ['ca-central-1'], ['eu-central-1'], ['s3-website-ap-northeast-1'], ['s3-website-ap-southeast-2'], ['s3-website-ap-southeast-1']]
[]
[['ap-northeast-2'], ['ap-southeast-1'], ['ap-southeast-2'], ['ap-northeast-1'], ['ca-central-1'], ['eu-central-1'], ['s3-website-ap-northeast-1'], ['s3-website-ap-southeast-2'], ['s3-website-ap-southeast-1'], ['ap-south-1']]
[]
[['ap-northeast-2'], ['ap-southeast-1'], ['ap-southeast-2'], ['ap-northeast-1'], ['ca-central-1'], ['eu-central-1'], ['s3-website-ap-northeast-1'], ['s3-website-ap-southeast-2'], ['s3-website-ap-southeast-1'], ['ap-south-1'],

[['com'], ['museum']]
[['com'], ['museum'], ['fr']]
[]
[]
[]
[['eb']]
[['com'], ['museum'], ['fr'], ['cn']]
[]
[]
[]
[['com'], ['campinagrande'], ['saobernardo'], ['santoandre'], ['santamaria'], ['aparecida'], ['riobranco'], ['joinville'], ['boavista'], ['campinas'], ['contagem'], ['curitiba'], ['saogonca'], ['salvador'], ['riopreto'], ['ribeirao'], ['sorocaba'], ['londrina'], ['barueri'], ['niteroi'], ['maringa'], ['goiania'], ['floripa']]
[['com'], ['campinagrande'], ['saobernardo'], ['santoandre'], ['santamaria'], ['aparecida'], ['riobranco'], ['joinville'], ['boavista'], ['campinas'], ['contagem'], ['curitiba'], ['saogonca'], ['salvador'], ['riopreto'], ['ribeirao'], ['sorocaba'], ['londrina'], ['barueri'], ['niteroi'], ['maringa'], ['goiania'], ['floripa'], ['gov'], ['fortal'], ['cuiaba'], ['caxias'], ['9guacu']]
[['com'], ['museum'], ['fr'], ['cn'], ['br']]
[]
[['azurestaticapps'], ['privatizehealthinsurance']]
[['azurestaticapps'], ['privatizehealthinsurance'], ['saveincloud']]


[['hiroshima'], ['yamanashi'], ['miyagi'], ['saitama'], ['fukushima'], ['kagoshima'], ['kanagawa'], ['tokushima'], ['osaka'], ['wakayama'], ['okinawa'], ['hokkaido']]
[['hiroshima'], ['yamanashi'], ['miyagi'], ['saitama'], ['fukushima'], ['kagoshima'], ['kanagawa'], ['tokushima'], ['osaka'], ['wakayama'], ['okinawa'], ['hokkaido'], ['gunma']]
[['hiroshima'], ['yamanashi'], ['miyagi'], ['saitama'], ['fukushima'], ['kagoshima'], ['kanagawa'], ['tokushima'], ['osaka'], ['wakayama'], ['okinawa'], ['hokkaido'], ['gunma'], ['gifu']]
[['hiroshima'], ['yamanashi'], ['miyagi'], ['saitama'], ['fukushima'], ['kagoshima'], ['kanagawa'], ['tokushima'], ['osaka'], ['wakayama'], ['okinawa'], ['hokkaido'], ['gunma'], ['gifu'], ['yamaguchi']]
[['hiroshima'], ['yamanashi'], ['miyagi'], ['saitama'], ['fukushima'], ['kagoshima'], ['kanagawa'], ['tokushima'], ['osaka'], ['wakayama'], ['okinawa'], ['hokkaido'], ['gunma'], ['gifu'], ['yamaguchi'], ['kyoto']]
[['hiroshima'], ['yamanashi'], ['miyagi'], ['saita

[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp']]
[]
[]
[['testing']]
[['testing'], ['disrec']]
[['testing'], ['disrec'], ['dev']]
[['thingdust']]
[['thingdust'], ['banzaicloud']]
[['thingdust'], ['banzaicloud'], ['resinstaging']]
[['thingdust'], ['banzaicloud'], ['resinstaging'], ['unispace'], ['browsersafetymark']]
[['thingdust'], ['banzaicloud'], ['resinstaging'], ['unispace'], ['browsersafetymark'], ['beebyteapp'], ['azurecontainer']]
[['thingdust'], ['banzaicloud'], ['resinstaging'], ['unispace'], ['browsersafetymark'], ['beebyteapp'], ['azurecontainer'], ['dappnode']]
[['thingdust'], ['banzaicloud'], ['resinstaging'], ['unispace'], ['browsersafetymark'], ['beebyteapp'], ['azurecontainer'], ['dappnode'], ['nodeart'], ['virtualserver'], ['backplaneapp']]
[['thingdust'], ['banzaicloud'], ['resinstaging'], ['unispace'], ['browsersafetymark'], ['beebyteapp'], ['azurecontainer'], ['dappnode'], ['nodeart'], ['virtualserver'], ['backplaneapp'], ['forgerock'], ['ho

[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp'], ['io'], ['org'], ['be'], ['aero'], ['it']]
[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp'], ['io'], ['org'], ['be'], ['aero'], ['it'], ['info']]
[['independent-commission'], ['independent-inquiry']]
[['independent-commission'], ['independent-inquiry'], ['org'], ['independent-inquest'], ['independent-review']]
[]
[['retrosnub'], ['wellbeingzone']]
[['retrosnub'], ['wellbeingzone'], ['layershift'], ['myspreadshop'], ['barsyonline']]
[['independent-commission'], ['independent-inquiry'], ['org'], ['independent-inquest'], ['independent-review'], ['co'], ['independent-panel'], ['royal-commission']]
[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp'], ['io'], ['org'], ['be'], ['aero'], ['it'], ['info'], ['uk']]
[]
[['møre-og-romsdal']]
[['møre-og-romsdal'], ['more-og-romsdal'], ['stjordalshalsen'], ['stjørdalshalsen'], ['evje-og-hornnes'], ['aurskog-holand'], ['aurskog-hølan

[['møre-og-romsdal'], ['more-og-romsdal'], ['stjordalshalsen'], ['stjørdalshalsen'], ['evje-og-hornnes'], ['aurskog-holand'], ['aurskog-høland'], ['nordland'], ['midtre-gauldal'], ['naamesjevuemie'], ['nååmesjevuemie'], ['vestfold'], ['hedmark'], ['giehtavuoatna'], ['østfold'], ['ostfold'], ['divttasvuotna'], ['vestre-slidre'], ['oystre-slidre'], ['øystre-slidre'], ['skedsmokorset'], ['guovdageaidnu'], ['nore-og-uvdal'], ['matta-varjjat'], ['mátta-várjjat'], ['dielddanuorri'], ['vestre-toten'], ['myspreadshop'], ['divtasvuodna'], ['krokstadelva'], ['nesoddtangen'], ['sandnessjoen'], ['sandnessjøen'], ['bahccavuotna'], ['báhccavuotna'], ['jan-mayen'], ['akershus'], ['buskerud'], ['hordaland'], ['sor-varanger'], ['sør-varanger'], ['hammarfeasta'], ['hámmárfeasta'], ['hattfjelldal'], ['kristiansand'], ['kristiansund'], ['laakesvuemie'], ['vossevangen'], ['svalbard'], ['fredrikstad'], ['brønnøysund'], ['bronnoysund'], ['bearalvahki'], ['nord-aurdal'], ['davvenjarga'], ['davvenjárga'], ['no

[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp'], ['io'], ['org'], ['be'], ['aero'], ['it'], ['info'], ['uk'], ['no'], ['de'], ['ru'], ['digital'], ['cz'], ['to'], ['kz'], ['at'], ['au'], ['su'], ['cy'], ['fi'], ['systems'], ['me']]
[['kommunalforbund'], ['naturbruksgymn'], ['myspreadshop'], ['itcouldbewor'], ['blogspot']]
[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp'], ['io'], ['org'], ['be'], ['aero'], ['it'], ['info'], ['uk'], ['no'], ['de'], ['ru'], ['digital'], ['cz'], ['to'], ['kz'], ['at'], ['au'], ['su'], ['cy'], ['fi'], ['systems'], ['me'], ['se']]
[]
[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp'], ['io'], ['org'], ['be'], ['aero'], ['it'], ['info'], ['uk'], ['no'], ['de'], ['ru'], ['digital'], ['cz'], ['to'], ['kz'], ['at'], ['au'], ['su'], ['cy'], ['fi'], ['systems'], ['me'], ['se'], ['eu'], ['northwesternmutual']]
[['ondigitalocean'], ['northflank'], ['onflashdrive'], ['edgecompute'], ['developer'],

[['ma']]
[['ma'], ['mi'], ['stuff-4-sale']]
[['ma'], ['mi'], ['stuff-4-sale'], ['enscaled'], ['land-4-sale'], ['mircloud'], ['freeddns'], ['platterp'], ['pointto'], ['golffan'], ['graphox'], ['cloudns']]
[['ma'], ['mi'], ['stuff-4-sale'], ['enscaled'], ['land-4-sale'], ['mircloud'], ['freeddns'], ['platterp'], ['pointto'], ['golffan'], ['graphox'], ['cloudns'], ['sc']]
[['ma'], ['mi'], ['stuff-4-sale'], ['enscaled'], ['land-4-sale'], ['mircloud'], ['freeddns'], ['platterp'], ['pointto'], ['golffan'], ['graphox'], ['cloudns'], ['sc'], ['me']]
[['ma'], ['mi'], ['stuff-4-sale'], ['enscaled'], ['land-4-sale'], ['mircloud'], ['freeddns'], ['platterp'], ['pointto'], ['golffan'], ['graphox'], ['cloudns'], ['sc'], ['me'], ['mn']]
[['ma'], ['mi'], ['stuff-4-sale'], ['enscaled'], ['land-4-sale'], ['mircloud'], ['freeddns'], ['platterp'], ['pointto'], ['golffan'], ['graphox'], ['cloudns'], ['sc'], ['me'], ['mn'], ['mo']]
[['ma'], ['mi'], ['stuff-4-sale'], ['enscaled'], ['land-4-sale'], ['mircloud

[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp'], ['io'], ['org'], ['be'], ['aero'], ['it'], ['info'], ['uk'], ['no'], ['de'], ['ru'], ['digital'], ['cz'], ['to'], ['kz'], ['at'], ['au'], ['su'], ['cy'], ['fi'], ['systems'], ['me'], ['se'], ['eu'], ['northwesternmutual'], ['app'], ['goog'], ['nl'], ['pl'], ['builders'], ['today'], ['services'], ['cc'], ['nu'], ['ua'], ['travelersinsurance'], ['site'], ['one'], ['solutions'], ['online'], ['edu'], ['hosting'], ['vermögensberatung'], ['land'], ['management'], ['ch'], ['community'], ['dev'], ['earth'], ['academy'], ['us'], ['vermögensberater'], ['pro'], ['estate'], ['bo'], ['host'], ['scot'], ['il'], ['news'], ['network'], ['page'], ['co'], ['ar'], ['kerryproperties'], ['mt'], ['rs'], ['group'], ['name'], ['ng'], ['by'], ['ca']]
[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp'], ['io'], ['org'], ['be'], ['aero'], ['it'], ['info'], ['uk'], ['no'], ['de'], ['ru'], ['digital'], ['cz'], ['to'], ['

[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp'], ['io'], ['org'], ['be'], ['aero'], ['it'], ['info'], ['uk'], ['no'], ['de'], ['ru'], ['digital'], ['cz'], ['to'], ['kz'], ['at'], ['au'], ['su'], ['cy'], ['fi'], ['systems'], ['me'], ['se'], ['eu'], ['northwesternmutual'], ['app'], ['goog'], ['nl'], ['pl'], ['builders'], ['today'], ['services'], ['cc'], ['nu'], ['ua'], ['travelersinsurance'], ['site'], ['one'], ['solutions'], ['online'], ['edu'], ['hosting'], ['vermögensberatung'], ['land'], ['management'], ['ch'], ['community'], ['dev'], ['earth'], ['academy'], ['us'], ['vermögensberater'], ['pro'], ['estate'], ['bo'], ['host'], ['scot'], ['il'], ['news'], ['network'], ['page'], ['co'], ['ar'], ['kerryproperties'], ['mt'], ['rs'], ['group'], ['name'], ['ng'], ['by'], ['ca'], ['ie'], ['run'], ['americanexpress'], ['uy'], ['rocks'], ['sandvikcoromant'], ['dk'], ['store'], ['tr'], ['es'], ['eg'], ['sh'], ['ee'], ['gg'], ['tw'], ['eus'], ['bananarepublic'], ['za'], 

[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp'], ['io'], ['org'], ['be'], ['aero'], ['it'], ['info'], ['uk'], ['no'], ['de'], ['ru'], ['digital'], ['cz'], ['to'], ['kz'], ['at'], ['au'], ['su'], ['cy'], ['fi'], ['systems'], ['me'], ['se'], ['eu'], ['northwesternmutual'], ['app'], ['goog'], ['nl'], ['pl'], ['builders'], ['today'], ['services'], ['cc'], ['nu'], ['ua'], ['travelersinsurance'], ['site'], ['one'], ['solutions'], ['online'], ['edu'], ['hosting'], ['vermögensberatung'], ['land'], ['management'], ['ch'], ['community'], ['dev'], ['earth'], ['academy'], ['us'], ['vermögensberater'], ['pro'], ['estate'], ['bo'], ['host'], ['scot'], ['il'], ['news'], ['network'], ['page'], ['co'], ['ar'], ['kerryproperties'], ['mt'], ['rs'], ['group'], ['name'], ['ng'], ['by'], ['ca'], ['ie'], ['run'], ['americanexpress'], ['uy'], ['rocks'], ['sandvikcoromant'], ['dk'], ['store'], ['tr'], ['es'], ['eg'], ['sh'], ['ee'], ['gg'], ['tw'], ['eus'], ['bananarepublic'], ['za'], 

[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp'], ['io'], ['org'], ['be'], ['aero'], ['it'], ['info'], ['uk'], ['no'], ['de'], ['ru'], ['digital'], ['cz'], ['to'], ['kz'], ['at'], ['au'], ['su'], ['cy'], ['fi'], ['systems'], ['me'], ['se'], ['eu'], ['northwesternmutual'], ['app'], ['goog'], ['nl'], ['pl'], ['builders'], ['today'], ['services'], ['cc'], ['nu'], ['ua'], ['travelersinsurance'], ['site'], ['one'], ['solutions'], ['online'], ['edu'], ['hosting'], ['vermögensberatung'], ['land'], ['management'], ['ch'], ['community'], ['dev'], ['earth'], ['academy'], ['us'], ['vermögensberater'], ['pro'], ['estate'], ['bo'], ['host'], ['scot'], ['il'], ['news'], ['network'], ['page'], ['co'], ['ar'], ['kerryproperties'], ['mt'], ['rs'], ['group'], ['name'], ['ng'], ['by'], ['ca'], ['ie'], ['run'], ['americanexpress'], ['uy'], ['rocks'], ['sandvikcoromant'], ['dk'], ['store'], ['tr'], ['es'], ['eg'], ['sh'], ['ee'], ['gg'], ['tw'], ['eus'], ['bananarepublic'], ['za'], 

[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp'], ['io'], ['org'], ['be'], ['aero'], ['it'], ['info'], ['uk'], ['no'], ['de'], ['ru'], ['digital'], ['cz'], ['to'], ['kz'], ['at'], ['au'], ['su'], ['cy'], ['fi'], ['systems'], ['me'], ['se'], ['eu'], ['northwesternmutual'], ['app'], ['goog'], ['nl'], ['pl'], ['builders'], ['today'], ['services'], ['cc'], ['nu'], ['ua'], ['travelersinsurance'], ['site'], ['one'], ['solutions'], ['online'], ['edu'], ['hosting'], ['vermögensberatung'], ['land'], ['management'], ['ch'], ['community'], ['dev'], ['earth'], ['academy'], ['us'], ['vermögensberater'], ['pro'], ['estate'], ['bo'], ['host'], ['scot'], ['il'], ['news'], ['network'], ['page'], ['co'], ['ar'], ['kerryproperties'], ['mt'], ['rs'], ['group'], ['name'], ['ng'], ['by'], ['ca'], ['ie'], ['run'], ['americanexpress'], ['uy'], ['rocks'], ['sandvikcoromant'], ['dk'], ['store'], ['tr'], ['es'], ['eg'], ['sh'], ['ee'], ['gg'], ['tw'], ['eus'], ['bananarepublic'], ['za'], 

[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp'], ['io'], ['org'], ['be'], ['aero'], ['it'], ['info'], ['uk'], ['no'], ['de'], ['ru'], ['digital'], ['cz'], ['to'], ['kz'], ['at'], ['au'], ['su'], ['cy'], ['fi'], ['systems'], ['me'], ['se'], ['eu'], ['northwesternmutual'], ['app'], ['goog'], ['nl'], ['pl'], ['builders'], ['today'], ['services'], ['cc'], ['nu'], ['ua'], ['travelersinsurance'], ['site'], ['one'], ['solutions'], ['online'], ['edu'], ['hosting'], ['vermögensberatung'], ['land'], ['management'], ['ch'], ['community'], ['dev'], ['earth'], ['academy'], ['us'], ['vermögensberater'], ['pro'], ['estate'], ['bo'], ['host'], ['scot'], ['il'], ['news'], ['network'], ['page'], ['co'], ['ar'], ['kerryproperties'], ['mt'], ['rs'], ['group'], ['name'], ['ng'], ['by'], ['ca'], ['ie'], ['run'], ['americanexpress'], ['uy'], ['rocks'], ['sandvikcoromant'], ['dk'], ['store'], ['tr'], ['es'], ['eg'], ['sh'], ['ee'], ['gg'], ['tw'], ['eus'], ['bananarepublic'], ['za'], 

[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp'], ['io'], ['org'], ['be'], ['aero'], ['it'], ['info'], ['uk'], ['no'], ['de'], ['ru'], ['digital'], ['cz'], ['to'], ['kz'], ['at'], ['au'], ['su'], ['cy'], ['fi'], ['systems'], ['me'], ['se'], ['eu'], ['northwesternmutual'], ['app'], ['goog'], ['nl'], ['pl'], ['builders'], ['today'], ['services'], ['cc'], ['nu'], ['ua'], ['travelersinsurance'], ['site'], ['one'], ['solutions'], ['online'], ['edu'], ['hosting'], ['vermögensberatung'], ['land'], ['management'], ['ch'], ['community'], ['dev'], ['earth'], ['academy'], ['us'], ['vermögensberater'], ['pro'], ['estate'], ['bo'], ['host'], ['scot'], ['il'], ['news'], ['network'], ['page'], ['co'], ['ar'], ['kerryproperties'], ['mt'], ['rs'], ['group'], ['name'], ['ng'], ['by'], ['ca'], ['ie'], ['run'], ['americanexpress'], ['uy'], ['rocks'], ['sandvikcoromant'], ['dk'], ['store'], ['tr'], ['es'], ['eg'], ['sh'], ['ee'], ['gg'], ['tw'], ['eus'], ['bananarepublic'], ['za'], 

[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp'], ['io'], ['org'], ['be'], ['aero'], ['it'], ['info'], ['uk'], ['no'], ['de'], ['ru'], ['digital'], ['cz'], ['to'], ['kz'], ['at'], ['au'], ['su'], ['cy'], ['fi'], ['systems'], ['me'], ['se'], ['eu'], ['northwesternmutual'], ['app'], ['goog'], ['nl'], ['pl'], ['builders'], ['today'], ['services'], ['cc'], ['nu'], ['ua'], ['travelersinsurance'], ['site'], ['one'], ['solutions'], ['online'], ['edu'], ['hosting'], ['vermögensberatung'], ['land'], ['management'], ['ch'], ['community'], ['dev'], ['earth'], ['academy'], ['us'], ['vermögensberater'], ['pro'], ['estate'], ['bo'], ['host'], ['scot'], ['il'], ['news'], ['network'], ['page'], ['co'], ['ar'], ['kerryproperties'], ['mt'], ['rs'], ['group'], ['name'], ['ng'], ['by'], ['ca'], ['ie'], ['run'], ['americanexpress'], ['uy'], ['rocks'], ['sandvikcoromant'], ['dk'], ['store'], ['tr'], ['es'], ['eg'], ['sh'], ['ee'], ['gg'], ['tw'], ['eus'], ['bananarepublic'], ['za'], 

[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp'], ['io'], ['org'], ['be'], ['aero'], ['it'], ['info'], ['uk'], ['no'], ['de'], ['ru'], ['digital'], ['cz'], ['to'], ['kz'], ['at'], ['au'], ['su'], ['cy'], ['fi'], ['systems'], ['me'], ['se'], ['eu'], ['northwesternmutual'], ['app'], ['goog'], ['nl'], ['pl'], ['builders'], ['today'], ['services'], ['cc'], ['nu'], ['ua'], ['travelersinsurance'], ['site'], ['one'], ['solutions'], ['online'], ['edu'], ['hosting'], ['vermögensberatung'], ['land'], ['management'], ['ch'], ['community'], ['dev'], ['earth'], ['academy'], ['us'], ['vermögensberater'], ['pro'], ['estate'], ['bo'], ['host'], ['scot'], ['il'], ['news'], ['network'], ['page'], ['co'], ['ar'], ['kerryproperties'], ['mt'], ['rs'], ['group'], ['name'], ['ng'], ['by'], ['ca'], ['ie'], ['run'], ['americanexpress'], ['uy'], ['rocks'], ['sandvikcoromant'], ['dk'], ['store'], ['tr'], ['es'], ['eg'], ['sh'], ['ee'], ['gg'], ['tw'], ['eus'], ['bananarepublic'], ['za'], 

[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp'], ['io'], ['org'], ['be'], ['aero'], ['it'], ['info'], ['uk'], ['no'], ['de'], ['ru'], ['digital'], ['cz'], ['to'], ['kz'], ['at'], ['au'], ['su'], ['cy'], ['fi'], ['systems'], ['me'], ['se'], ['eu'], ['northwesternmutual'], ['app'], ['goog'], ['nl'], ['pl'], ['builders'], ['today'], ['services'], ['cc'], ['nu'], ['ua'], ['travelersinsurance'], ['site'], ['one'], ['solutions'], ['online'], ['edu'], ['hosting'], ['vermögensberatung'], ['land'], ['management'], ['ch'], ['community'], ['dev'], ['earth'], ['academy'], ['us'], ['vermögensberater'], ['pro'], ['estate'], ['bo'], ['host'], ['scot'], ['il'], ['news'], ['network'], ['page'], ['co'], ['ar'], ['kerryproperties'], ['mt'], ['rs'], ['group'], ['name'], ['ng'], ['by'], ['ca'], ['ie'], ['run'], ['americanexpress'], ['uy'], ['rocks'], ['sandvikcoromant'], ['dk'], ['store'], ['tr'], ['es'], ['eg'], ['sh'], ['ee'], ['gg'], ['tw'], ['eus'], ['bananarepublic'], ['za'], 

[[['com'], ['museum'], ['fr'], ['cn'], ['br'], ['net'], ['cloud'], ['jp'], ['io'], ['org'], ['be'], ['aero'], ['it'], ['info'], ['uk'], ['no'], ['de'], ['ru'], ['digital'], ['cz'], ['to'], ['kz'], ['at'], ['au'], ['su'], ['cy'], ['fi'], ['systems'], ['me'], ['se'], ['eu'], ['northwesternmutual'], ['app'], ['goog'], ['nl'], ['pl'], ['builders'], ['today'], ['services'], ['cc'], ['nu'], ['ua'], ['travelersinsurance'], ['site'], ['one'], ['solutions'], ['online'], ['edu'], ['hosting'], ['vermögensberatung'], ['land'], ['management'], ['ch'], ['community'], ['dev'], ['earth'], ['academy'], ['us'], ['vermögensberater'], ['pro'], ['estate'], ['bo'], ['host'], ['scot'], ['il'], ['news'], ['network'], ['page'], ['co'], ['ar'], ['kerryproperties'], ['mt'], ['rs'], ['group'], ['name'], ['ng'], ['by'], ['ca'], ['ie'], ['run'], ['americanexpress'], ['uy'], ['rocks'], ['sandvikcoromant'], ['dk'], ['store'], ['tr'], ['es'], ['eg'], ['sh'], ['ee'], ['gg'], ['tw'], ['eus'], ['bananarepublic'], ['za'],

In [9]:
sfx2 = df.suffix2.copy()
maxLabels_suffix = sfx2.str.count('\.').max()

sfx2 = sfx2.str[1:].apply(lambda s: ('@@.'*(maxLabels_suffix - s.count('.') - 1)) + s).str.split('.', expand=True)
sfx2 = sfx2[sfx2.columns[::-1]]
sfx2.replace('@@', '')
sfxPerLabels = sfx2.replace('@@', '').copy()
sfxPerLabels = sfxPerLabels.rename(columns={ 4:0, 3:1, 2:2, 1:3, 0:4})

sfxPerLabels['id'] = df['index']


# import pprint

# p = pprint.PrettyPrinter(indent=2)
# p.pprint(labels)



In [None]:
cur = [ '', '', '', '' ]
for _, suffix in sfxaws.iterrows():
    if cur[l] != sfxaws[l]:
        s += group
    

In [360]:
class Node:
    def __init__(self, label, deep=0, index=None, fl=False, parent=None, dn=None):
        self.parent = parent
        self.label = label
        self.index = index
        self.dn = dn
        self.children = []
        self.deep = deep
        self.fl = fl
        self.indent = '\n' + (self._indent() * '  ')
        pass
    
    def _indent(self):
        i = 0 if self.parent is None else self.parent._indent() + 1 + self.fl
        return i
    
    def add(self, label, index=None, fl=False, dn=None):
        childDeep = self.deep+1 if not self.fl else self.deep
        node = Node(label, index=index, deep=childDeep, fl=fl, parent=self, dn=dn)
        self.children.append(node)
        return node
    
    def leaves(self):
        _leafs = [self] if self.index is not None else []
        for child in self.children:
            _leafs += child.leaves()
        return _leafs
    
    def allLeaf(self):
        return all([child.isLeaf() for child in self.children])
    
    def isLeaf(self):
        return self.index is not None
    
    def branch(self):
        if self.parent is None:
            return [ self ]
        return [ self ] + self.parent.branch()
    
    def __getitem__(self, key):
        return self.children[key]
    
    def compact(self):
        while not self.fl and len(self.children) == 1 and not self.isLeaf():
            self.label = self.label + '.' + self[0].label
            self.index = self[0].index
            self.dn = self[0].dn
            self.children = self[0].children
    
    def _print(self):
        while not self.fl and len(self.children) == 1 and not self.isLeaf():
            self.label = self.label + '.' + self[0].label
            self.index = self[0].index
            self.dn = self[0].dn
            self.children = self[0].children
        print(self.indent + str(self))
        for child in self.children:
            child._print()
    
    def toRegex(self):
        groups = []
        
        regex = ''
        for child in self.children:
            groups.append(child.toRegex())
        
        label = self.label[1:] if self.parent.fl else self.label
        if len(groups) > 0:
            bs = "" if self.fl else "\\."
            regex = f'(?:{label}{bs}(' + '|'.join(groups) + f'{self.indent}))'
            if self.isLeaf():
                regex = f'(' + regex + f'|(?P<l{self.index}>{self.label}))'
        else:
            regex = f'(?P<l{self.index}>{label})'
        return self.indent + regex # + '.*' if self.parent is None else ''
    
    def __str__(self):
        if self.fl:
            return 'FL/' + self.label[0]
        else:
            return f'{self.label}#{len(self.children)}' + f'[{self.dn}]'
    
    def __repr__(self):
        if self.fl:
            return 'FL/' + self.label[0]
        else:
            return f'{self.label}#{len(self.children)}' + f'[{self.dn}]'
    
    pass

class Branch:
    def __init__(self, letter, root):
        self.letter = letter
        self.root = root
    
def lop(sfxOr, l, parent):
    subcols = list(range(l+1, maxLabels_suffix))
    subcols.append('id')
    currentLabelUniques = sfxOr.drop_duplicates(subset=l)[l]
    firstLetters = currentLabelUniques.str[0].drop_duplicates().fillna('')
    
    if firstLetters.shape[0] > 1:
        flNodes = { fl: parent.add(fl, fl=True) for fl in firstLetters}
    else:
        flNodes = { fl: parent for fl in firstLetters}
    
#     print(currentLabelUniques)
#     print(firstLetters)
#     print(flNodes)
#     print()
    
    for fl in firstLetters:
        sfx = sfxOr[sfxOr[l].str[0] == fl]
        
            
        if l == 4:
            s_leaves = sfx
            s_branches = pd.DataFrame([])
        else:
            s_leaves = sfx[sfx[l+1] == '']
            s_branches = sfx[sfx[l+1] != ''][l].drop_duplicates()
        
        leaves = {}
        for _, leaf in s_leaves.iterrows():
            print(leaf[l])
            leaves[leaf[l]] = flNodes[leaf[l][0]].add(leaf[l], index=leaf['id'], dn='.'.join(leaf.values[:l+1]))
            _
#         print(f'{fl} {l} leaves:\t', parent, leaves)
#         print(f'{fl} {l} branches:\t', s_branches.values)
            
        for loc, branch in s_branches.iteritems():
            if branch not in leaves:
                node = parent.add(branch)
            else:
                node = leaves[branch]
#             print(l, '//', branch, node)
            if (sfx[l+1] != '').sum() > 0:
                lop(sfx[(sfx[l] == branch) & (sfx[l+1] != '')], l+1, node)
        pass
sfx = sfxPerLabels[(sfxPerLabels[0] == 'com')]
sfx = sfx[(sfx[1] == 'amazonaws') | (sfx[1] == '')]
tree = Node('root', deep=-1)
treeFL = lop(sfx, 0, tree)

# tree[0]._print()

print(tree[0].toRegex())

sfxPerLabels[(sfxPerLabels[0] == 'com')][[0,1]].value_counts()

com
s3
s3-website
s3
s3
s3
s3
s3
s3-website
s3
compute-1
compute
s3
s3-website
s3
elb
s3
s3-website
s3
s3
s3
s3-website
s3
s3
s3-website
s3
s3-website-ap-northeast-1
s3-website-ap-southeast-2
s3-website-ap-southeast-1
s3-fips-us-gov-west-1
s3-website-us-east-1
s3-website-us-west-1
s3-website-us-west-2
s3-website-eu-west-1
s3-website-sa-east-1
s3-ap-northeast-1
s3-ap-southeast-1
s3-ap-southeast-2
s3-ap-northeast-2
s3-us-gov-west-1
s3-ca-central-1
s3-eu-central-1
s3-ap-south-1
s3-external-1
s3-eu-west-1
s3-eu-west-2
s3-eu-west-3
s3-sa-east-1
s3-us-west-1
s3-us-west-2
s3-us-east-2
s3
s3
us-east-1
s3
s3-website
s3
s3

  ((?:com\.(
    (?:amazonaws\.(
        (?P<lNone>a)|
        (?:c(
          (?P<l816>ompute-1)|
          (?P<l815>ompute)
        ))|
        (?:e(
          (?P<l836>lb)
        ))|
        (?:s(
          (?P<l879>3-website-ap-northeast-1)|
          (?P<l881>3-website-ap-southeast-2)|
          (?P<l880>3-website-ap-southeast-1)|
          (?P<l850>3-fips-us-gov-west-1

0    1               
com  amazonaws           57
     elasticbeanstalk    18
     mythic-beasts       11
     evennode             8
     customer-oci         4
                         ..
     from-nd              1
     from-nc              1
     from-mt              1
     from-ms              1
     za                   1
Length: 389, dtype: int64

In [299]:
class NodeNoFL:
    def __init__(self, label, deep=0, index=None, parent=None):
        self.parent = parent
        self.label = label
        self.index = index
        self.children = []
        self.deep = deep
        self.indent = '\n' + (self._indent() * '\t')
        pass
    
    def _indent(self):
        i = 0 if self.parent is None else self.parent._indent() + 1
        return i
    
    def add(self, label, index=None):
        node = NodeNoFL(label, index=index, deep=self.deep+1, parent=self)
        self.children.append(node)
        return node
    
    def leaves(self):
        _leafs = [self] if self.index is not None else []
        for child in self.children:
            _leafs += child.leaves()
        return _leafs
    
    def allLeaf(self):
        return all([child.isLeaf() for child in self.children])
    
    def isLeaf(self):
        return self.index is not None
    
    def branch(self):
        if self.parent is None:
            return [ self ]
        return [ self ] + self.parent.branch()
    
    def __getitem__(self, key):
        return self.children[key]
    
    
    def toRegex(self):
        groups = []
        
        
        regex = ''
        for child in self.children:
            groups.append(child.toRegex())
        
        if len(groups) > 0:
            regex = f'(?:{self.label}\\.(' + '|'.join(groups) + f'{self.indent}))'
            if self.isLeaf():
                regex = f'(' + regex + f'|(?P<l{self.index}>{self.label}))'
        else:
            regex = f'(?P<l{self.index}>{self.label})'
        return self.indent + regex
    
    def __str__(self):
        return f'{self.label}#{len(self.children)}'
    
    def __repr__(self):
        return f'{self.label}#{len(self.children)}' + f'[{self.index}]'
    
    pass

class Branch:
    def __init__(self, letter, root):
        self.letter = letter
        self.root = root
    
def lopNoFL(sfx, l, parent):
    subcols = list(range(l+1, maxLabels_suffix))
    subcols.append('id')
    currentLabelUniques = sfx.drop_duplicates(subset=l)[l]
    for _id, uniqueLabel in currentLabelUniques.iteritems():
        sfxNew = sfx[sfx[l] == uniqueLabel]
        if sfxNew.shape[0] == 1:
            row = sfxNew.iloc[0].values[l:-1]
            row = [ kk for kk in row if kk != '']
            row = '\\.'.join(row)
            if row != '':
                parent.add(row, sfxNew.iloc[0]['id'])
            else:
                parent.index = sfxNew.iloc[0]['id']
        else:
            node = parent.add(uniqueLabel)
            lopNoFL(sfxNew, l+1, node)
    return parent
# (sfxPerLabels[2] == 'ap-northeast-2') | ((sfxPerLabels[2] == 'us-east-1')) &
# (sfxPerLabels[1] == 'amazonaws') & 
amazon = sfxPerLabels[(sfxPerLabels[0] == 'com')].sort_values(by=[0,1,2,3,4])
treeNoFL = NodeNoFL('root', deep=-1)
lopNoFL(amazon, 0, treeNoFL)

root#1[None]

In [353]:
import time

sfx = sfxPerLabels[sfxPerLabels[0] == 'com'].sort_values(by=[0,1,2,3,4])

tree = Node('root', deep=-1)
lop(sfx, 0, tree)

treeNoFL = NodeNoFL('root', deep=-1)
treeFL = lopNoFL(sfx, 0, treeNoFL)

def test(tree, test_str):
    reg = re.compile(tree[0].toRegex(), re.VERBOSE)
    start = time.time()
    matches = reg.match(test_str)
    gd = matches.groupdict()
    idx = [ g for g in gd if gd[g] is not None ][0]
    t = time.time() - start
    l = sfx.set_index('id').loc[int(idx[1:])]
    print(t, l.values, idx[1:])

dn = 'com.plp.ap-northeast-2.s3'

test(treeNoFL, dn)
test(tree, dn)

sfx

8950    com
Name: 0, dtype: object
{'c': root#0[None]}
4531            001www
4541              0emm
5076             1kapp
2939        3utilities
6782                4u
             ...      
3728           wpmucdn
1225    writesthisblog
5074             xnbay
3305          yolasite
6773                za
Name: 1, Length: 388, dtype: object
{'0': FL/0, '1': FL/1, '3': FL/3, '4': FL/4, 'a': FL/a, 'b': FL/b, 'c': FL/c, 'd': FL/d, 'e': FL/e, 'f': FL/f, 'g': FL/g, 'h': FL/h, 'i': FL/i, 'j': FL/j, 'k': FL/k, 'l': FL/l, 'm': FL/m, 'n': FL/n, 'o': FL/o, 'p': FL/p, 'q': FL/q, 'r': FL/r, 's': FL/s, 't': FL/t, 'u': FL/u, 'v': FL/v, 'w': FL/w, 'x': FL/x, 'y': FL/y, 'z': FL/z}
219    dev
Name: 2, dtype: object
{'d': adobeaemcloud#0[com.adobeaemcloud]}
3                  ap-northeast-1
0                  ap-northeast-2
10                     ap-south-1
1                  ap-southeast-1
2                  ap-southeast-2
5                    ca-central-1
220                       compute
140        

error: redefinition of group name 'lNone' as group 496; was group 453 at position 14302 (line 530, column 15)