In [1]:
# File names listed in "whois_tld_ls", one per line, with the trailing newline
# removed. The context manager closes the listing file as soon as it is read.
with open("whois_tld_ls", "r") as listing_file:
    fnames = [line.replace("\n", "") for line in listing_file]

In [2]:
import pickle
from collections import Counter

# Load the pickled field-frequency table (used below via ``most_common``,
# so presumably a ``collections.Counter`` -- confirm against its producer).
# NOTE(security): ``pickle.load`` can execute arbitrary code embedded in the
# file; only load a "fields.pickle" that comes from a trusted source.
with open("fields.pickle", "rb") as fields_file:
    c = pickle.load(fields_file)

In [3]:
import re
# Regex-escaped, whitespace-trimmed names of the 500 most frequent entries in
# ``c``; these become the alternatives of the field-matching pattern.
fields = [
    re.escape(field_name.strip())
    for field_name, _count in c.most_common(500)
]

In [4]:
# Pattern for one "<field> : <value>" line. Group 1 is the field name (one of
# ``fields``), group 2 is the rest of the line after the colon.
# A raw string is used so that ``\s`` is passed to the regex engine verbatim
# instead of relying on Python's handling of unknown escape sequences in
# ordinary string literals; ``\t`` is now interpreted by ``re`` as a tab, which
# matches exactly what the literal tab character matched before.
# NOTE(review): the ``\s*`` after the colon also matches newlines, so a field
# with an empty value captures the following line as its value -- confirm that
# this is intended.
regex_str = r"^\s*({})[\t ]*:\s*(.*)".format("|".join(fields))
regex = re.compile(regex_str, re.MULTILINE)

In [5]:
def get_vector(text):
    """Extract ``{field name: value}`` pairs from ``text`` using ``regex``.

    Every match of the module-level ``regex`` contributes one entry. When the
    same field name matches more than once, the last occurrence wins.
    """
    parsed = {}
    for field_name, field_value in regex.findall(text):
        parsed[field_name] = field_value
    return parsed

In [6]:
import tqdm
import os
def get_all_features(force_calc=False):
    """Return the parsed fields of every file listed in ``fnames``.

    Parameters
    ----------
    force_calc : bool, default False
        When False and "all_features.pickle" exists, the cached result is
        loaded and returned without reading any record file. When True, the
        table is always rebuilt and the cache is overwritten.

    Returns
    -------
    dict
        Maps each file name with ".whoistld" removed to the dict produced by
        ``get_vector`` for the contents of ``"whois_tld/" + name``. Files
        that cannot be opened or decoded are left out.
    """
    cache_path = "all_features.pickle"

    if not force_calc and os.path.exists(cache_path):
        # NOTE(security): pickle.load can execute arbitrary code; only use a
        # cache file that was written by this notebook.
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)

    all_features = {}
    for name in tqdm.tqdm(fnames):
        try:
            with open("whois_tld/" + name, "r") as record_file:
                text = record_file.read()
        except (OSError, UnicodeError):
            # Best effort: skip records that are missing, unreadable or not
            # decodable as text. Unlike a bare ``except``, this lets
            # KeyboardInterrupt stop the loop instead of being swallowed.
            continue
        all_features[name.replace(".whoistld", "")] = get_vector(text)

    with open(cache_path, "wb") as cache_file:
        pickle.dump(all_features, cache_file)
    return all_features

# Use the cached table when it exists (force_calc defaults to False);
# otherwise it is built from the record files.
all_features = get_all_features()


In [7]:
# Percentage of listed files that ended up with an entry in all_features.
len(all_features) / len(fnames) * 100

99.98191089825312

In [8]:
# pickle.dump(all_features, open("all_features.pickle", "wb"))

In [9]:
# Display the parsed fields stored under the key "google.com".
all_features["google.com"]

{'Domain Name': 'GOOGLE.COM',
 'Registry Domain ID': '2138514_DOMAIN_COM-VRSN',
 'Registrar WHOIS Server': 'whois.markmonitor.com',
 'Registrar URL': 'http://www.markmonitor.com',
 'Updated Date': '2019-09-09T15:39:04Z',
 'Creation Date': '1997-09-15T04:00:00Z',
 'Registry Expiry Date': '2028-09-14T04:00:00Z',
 'Registrar': 'MarkMonitor Inc.',
 'Registrar IANA ID': '292',
 'Registrar Abuse Contact Email': 'abusecomplaints@markmonitor.com',
 'Registrar Abuse Contact Phone': '+1.2083895740',
 'Domain Status': 'serverUpdateProhibited https://icann.org/epp#serverUpdateProhibited',
 'Name Server': 'NS4.GOOGLE.COM',
 'DNSSEC': 'unsigned',
 '>>> Last update of whois database': '2020-07-07T03:02:05Z <<<',
 'NOTICE': 'The expiration date displayed in this record is the date the',
 'TERMS OF USE': 'You are not authorized to access or query our Whois',
 'by the following terms of use': 'You agree that you may use this Data only',
 'to': '(1) allow, enable, or otherwise support the transmission of

In [10]:
from fuzzywuzzy import fuzz

def diff_score(vec1, vec2):
    """Return the mean fuzzy dissimilarity of the fields two records share.

    Parameters
    ----------
    vec1, vec2 : dict
        Mappings of field name to string value, e.g. as returned by
        ``get_vector``.

    Returns
    -------
    float
        The average of ``1 - fuzz.ratio(a, b) / 100`` over the keys present
        in both records (``fuzz.ratio`` is expected to be a 0-100 similarity,
        so 0.0 means the shared values are identical). When the records share
        no key at all there is nothing to compare and 1.0 is returned, where
        the previous implementation raised ``ZeroDivisionError``.

    Notes
    -----
    Keys present in only one of the records do not influence the score.
    NOTE(review): earlier code computed, but never used, a symmetric-difference
    penalty and a union-size normaliser -- confirm whether those were meant to
    be part of the result.
    """
    common = set(vec1.keys()) & set(vec2.keys())
    if not common:
        # No shared field: treat the records as maximally dissimilar rather
        # than dividing by zero.
        return 1.0

    common_diffs = [1 - fuzz.ratio(vec1[key], vec2[key]) / 100 for key in list(common)]
    return sum(common_diffs) / len(common)

In [11]:
# Mean fuzzy dissimilarity over the fields shared by the github.com and
# google.com records.
diff_score(all_features["github.com"], all_features["google.com"])

0.16052631578947368

In [12]:
# Display the parsed fields stored under the key "facebook.com".
all_features["facebook.com"]

{'Domain Name': 'FACEBOOK.COM',
 'Registry Domain ID': '2320948_DOMAIN_COM-VRSN',
 'Registrar WHOIS Server': 'whois.registrarsafe.com',
 'Registrar URL': 'https://www.registrarsafe.com',
 'Updated Date': '2020-03-10T18:53:59Z',
 'Creation Date': '1997-03-29T05:00:00Z',
 'Registry Expiry Date': '2028-03-30T04:00:00Z',
 'Registrar': 'RegistrarSafe, LLC',
 'Registrar IANA ID': '3237',
 'Registrar Abuse Contact Email': 'abusecomplaints@registrarsafe.com',
 'Registrar Abuse Contact Phone': '+1.6503087004',
 'Domain Status': 'serverUpdateProhibited https://www.icann.org/epp#serverUpdateProhibited',
 'Name Server': 'D.NS.FACEBOOK.COM',
 'DNSSEC': 'unsigned',
 '>>> Last update of whois database': '2020-07-07T05:03:46Z <<<',
 'NOTICE': 'The expiration date displayed in this record is the date the',
 'TERMS OF USE': 'You are not authorized to access or query our Whois',
 'by the following terms of use': 'You agree that you may use this Data only',
 'to': '(1) allow, enable, or otherwise support 

In [13]:
# Display the parsed fields stored under the key "yahoo.com".
all_features["yahoo.com"]

{'Domain Name': 'yahoo.com',
 'Registry Domain ID': '3643624_DOMAIN_COM-VRSN',
 'Registrar WHOIS Server': 'whois.markmonitor.com',
 'Registrar URL': 'http://www.markmonitor.com',
 'Updated Date': '2019-12-18T05:45:43-0800',
 'Creation Date': '1995-01-18T00:00:00-0800',
 'Registry Expiry Date': '2023-01-19T05:00:00Z',
 'Registrar': 'MarkMonitor, Inc.',
 'Registrar IANA ID': '292',
 'Registrar Abuse Contact Email': 'abusecomplaints@markmonitor.com',
 'Registrar Abuse Contact Phone': '+1.2083895770',
 'Domain Status': 'serverDeleteProhibited (https://www.icann.org/epp#serverDeleteProhibited)',
 'Name Server': 'ns4.yahoo.com',
 'DNSSEC': 'unsigned',
 '>>> Last update of whois database': '2020-07-07T00:00:36Z <<<',
 'NOTICE': 'The expiration date displayed in this record is the date the',
 'TERMS OF USE': 'You are not authorized to access or query our Whois',
 'by the following terms of use': 'You agree that you may use this Data only',
 'to': '(1) allow, enable, or otherwise support the tr

In [14]:
# Display the parsed fields stored under the key "github.com".
all_features["github.com"]

{'Domain Name': 'GITHUB.COM',
 'Registry Domain ID': '1264983250_DOMAIN_COM-VRSN',
 'Registrar WHOIS Server': 'whois.markmonitor.com',
 'Registrar URL': 'http://www.markmonitor.com',
 'Updated Date': '2020-06-23T14:04:50Z',
 'Creation Date': '2007-10-09T18:20:50Z',
 'Registry Expiry Date': '2020-10-09T18:20:50Z',
 'Registrar': 'MarkMonitor Inc.',
 'Registrar IANA ID': '292',
 'Registrar Abuse Contact Email': 'abusecomplaints@markmonitor.com',
 'Registrar Abuse Contact Phone': '+1.2083895740',
 'Domain Status': 'clientUpdateProhibited https://icann.org/epp#clientUpdateProhibited',
 'Name Server': 'NS4.P16.DYNECT.NET',
 'DNSSEC': 'unsigned',
 '>>> Last update of whois database': '2020-07-07T01:43:11Z <<<',
 'NOTICE': 'The expiration date displayed in this record is the date the',
 'TERMS OF USE': 'You are not authorized to access or query our Whois',
 'by the following terms of use': 'You agree that you may use this Data only',
 'to': '(1) allow, enable, or otherwise support the transmis

In [15]:
# Display the parsed fields stored under the key "google.com" (this lookup
# also appears in an earlier cell).
all_features["google.com"]

{'Domain Name': 'GOOGLE.COM',
 'Registry Domain ID': '2138514_DOMAIN_COM-VRSN',
 'Registrar WHOIS Server': 'whois.markmonitor.com',
 'Registrar URL': 'http://www.markmonitor.com',
 'Updated Date': '2019-09-09T15:39:04Z',
 'Creation Date': '1997-09-15T04:00:00Z',
 'Registry Expiry Date': '2028-09-14T04:00:00Z',
 'Registrar': 'MarkMonitor Inc.',
 'Registrar IANA ID': '292',
 'Registrar Abuse Contact Email': 'abusecomplaints@markmonitor.com',
 'Registrar Abuse Contact Phone': '+1.2083895740',
 'Domain Status': 'serverUpdateProhibited https://icann.org/epp#serverUpdateProhibited',
 'Name Server': 'NS4.GOOGLE.COM',
 'DNSSEC': 'unsigned',
 '>>> Last update of whois database': '2020-07-07T03:02:05Z <<<',
 'NOTICE': 'The expiration date displayed in this record is the date the',
 'TERMS OF USE': 'You are not authorized to access or query our Whois',
 'by the following terms of use': 'You agree that you may use this Data only',
 'to': '(1) allow, enable, or otherwise support the transmission of

In [16]:
# Display the parsed fields stored under the key "microsoft.com".
all_features["microsoft.com"]

{'Domain Name': 'MICROSOFT.COM',
 'Registry Domain ID': '2724960_DOMAIN_COM-VRSN',
 'Registrar WHOIS Server': 'whois.markmonitor.com',
 'Registrar URL': 'http://www.markmonitor.com',
 'Updated Date': '2020-05-20T19:54:16Z',
 'Creation Date': '1991-05-02T04:00:00Z',
 'Registry Expiry Date': '2021-05-03T04:00:00Z',
 'Registrar': 'MarkMonitor Inc.',
 'Registrar IANA ID': '292',
 'Registrar Abuse Contact Email': 'abusecomplaints@markmonitor.com',
 'Registrar Abuse Contact Phone': '+1.2083895740',
 'Domain Status': 'serverUpdateProhibited https://icann.org/epp#serverUpdateProhibited',
 'Name Server': 'NS4-205.AZURE-DNS.INFO',
 'DNSSEC': 'unsigned',
 '>>> Last update of whois database': '2020-07-07T05:36:57Z <<<',
 'NOTICE': 'The expiration date displayed in this record is the date the',
 'TERMS OF USE': 'You are not authorized to access or query our Whois',
 'by the following terms of use': 'You agree that you may use this Data only',
 'to': '(1) allow, enable, or otherwise support the tran

In [None]:
# TODO: Classifier for 90% - 10% 

# TODO: For the 67k nodes that have a reg email, count the pairs of san relations where both nodes have a
# registrant email. For those pairs, check how many do not have the same 3rd party hosting provider
# (Alan's BGLL code)