In [1]:
# Install the third-party dependencies into the running kernel's environment
# (%pip targets the kernel; !pip may hit a different interpreter).
# Versions are pinned to the releases shown in this cell's recorded output.
%pip install tldextract==3.3.1 dnspython==2.2.1 python-whois==0.8.0 cymruwhois==1.6

Collecting tldextract
  Downloading tldextract-3.3.1-py3-none-any.whl (93 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.6/93.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dnspython
  Downloading dnspython-2.2.1-py3-none-any.whl (269 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m269.1/269.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting python-whois
  Downloading python-whois-0.8.0.tar.gz (109 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.6/109.6 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting cymruwhois
  Downloading cymruwhois-1.6-py3-none-any.whl (6.2 kB)
Collecting requests-file>=1.4
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Collecting filelock>=3.0.8
  Downloading filelock-3.8.0-py3-none-any.whl (10 kB)
Collecting future
  Downloading future-0.18.2.tar.gz (829 kB)
[2K   

In [2]:
from cymruwhois import Client
import dns.resolver
import datetime
import ipaddress
import os
import pandas as pd
import re
import requests
import string
import tldextract
from urllib.parse import urlparse, parse_qs
import whois

In [3]:
def count_chars(s):
    """Tally how often each printable ASCII character occurs in ``s``.

    Returns a dict with one entry per character of ``string.printable``
    (in that order); characters that never occur in ``s`` map to 0.
    """
    tally = {}
    for ch in string.printable:
        tally[ch] = s.count(ch)
    return tally

In [4]:
def build_counts(char_dict, t):
    """Pick the tracked special-character counts out of ``char_dict``.

    ``char_dict`` maps single characters to their occurrence count and must
    contain every tracked symbol (a missing one raises ``KeyError``).
    ``t`` is the suffix naming the URL part, e.g. ``'url'`` or ``'domain'``.

    Returns a new dict with keys ``qty_<label>_<t>`` in a fixed order.
    """
    # (feature label, character) pairs; the order defines the output key order.
    tracked = (
        ('dot', '.'),
        ('hyphen', '-'),
        ('underline', '_'),
        ('slash', '/'),
        ('questionmark', '?'),
        ('equal', '='),
        ('at', '@'),
        ('and', '&'),
        ('exclamation', '!'),
        ('space', ' '),
        ('tilde', '~'),
        ('comma', ','),
        ('plus', '+'),
        ('asterisk', '*'),
        ('hashtag', '#'),
        ('dollar', '$'),
        ('percent', '%'),
    )
    return {'qty_' + label + '_' + t: char_dict[symbol] for label, symbol in tracked}

In [5]:
def table_1_features(full_url, counts):
    """Add whole-URL features to ``counts`` and return it.

    Parameters
    ----------
    full_url : str
        The complete URL being analysed.
    counts : dict
        Feature dict to extend; it is modified in place.

    Returns
    -------
    dict
        The same ``counts`` object with three extra keys:
        ``qty_tld_url`` (character length of the public suffix reported by
        tldextract), ``length_url`` (character length of the URL) and
        ``email_in_url`` (True when an ``@``-style address occurs in the URL).
    """
    # https://pypi.org/project/tldextract/
    counts['qty_tld_url'] = len(tldextract.extract(full_url).suffix)
    counts['length_url'] = len(full_url)
    # Require a literal '@'. The earlier pattern also accepted a bare "at"
    # as separator, which flagged hosts such as "update.example.com/", and it
    # needed a trailing non-word character, so an address at the very end of
    # the URL was missed. TLDs longer than three letters are accepted too.
    exp = r"[\w\-+#~!$&'.]+(?<!\.)@\w+[\w\-.]*\.[a-zA-Z]{2,}"
    counts['email_in_url'] = bool(re.search(exp, full_url))

    return counts

In [6]:
def table_2_features(full_url, counts):
    """Add domain-level features to ``counts`` and return it.

    Parameters
    ----------
    full_url : str
        The complete URL being analysed.
    counts : dict
        Feature dict to extend; it is modified in place.

    Returns
    -------
    dict
        The same ``counts`` object with ``qty_vowels_domain``,
        ``domain_length``, ``domain_in_ip`` and ``server_client_domain`` added.
    """
    # Use the URL passed in; previously this read the notebook-global
    # ``test_url`` and so ignored the argument.
    d = tldextract.extract(full_url).domain
    d_lower = d.lower()
    counts['qty_vowels_domain'] = sum(1 for ch in d_lower if ch in 'aeiou')
    counts['domain_length'] = len(d)
    try:
        # https://docs.python.org/3/library/ipaddress.html
        # ip_address() raises ValueError unless d is a valid IPv4/IPv6 literal.
        ipaddress.ip_address(d)
        counts['domain_in_ip'] = True
    except ValueError:
        counts['domain_in_ip'] = False
    # Host names are case-insensitive, so compare in lower case.
    counts['server_client_domain'] = ('server' in d_lower) or ('client' in d_lower)
    return counts

In [97]:
def table_3_features(full_url, counts):
    """Add ``directory_length`` for the directory part of the URL path.

    The directory is everything before the last ``/`` of the path, with
    leading slashes stripped. ``counts`` is updated in place and returned
    when a directory exists; when it is empty, a new dict with every key
    set to -1 is returned instead.
    """
    # https://docs.python.org/3/library/urllib.parse.html
    url_path = urlparse(full_url).path
    directory, _ = os.path.split(url_path)
    directory = directory.lstrip('/')
    counts['directory_length'] = len(directory)
    if directory:
        return counts
    # No directory component: mark every directory feature as absent.
    return dict.fromkeys(counts, -1)

In [85]:
def table_4_features(full_url, counts):
    """Add ``file_length`` for the last segment of the URL path.

    ``counts`` is updated in place and returned when the path ends in a
    file name; when that segment is empty, a new dict with every key set
    to -1 is returned instead.
    """
    url_path = urlparse(full_url).path
    _, filename = os.path.split(url_path)
    counts['file_length'] = len(filename)
    if filename:
        return counts
    # No file component: mark every file feature as absent.
    return dict.fromkeys(counts, -1)

In [78]:
def table_5_features(full_url, counts):
    """Add query-string features to ``counts``.

    Parameters
    ----------
    full_url : str
        The complete URL being analysed.
    counts : dict
        Feature dict to extend; it is modified in place.

    Returns
    -------
    dict
        ``counts`` with ``params_length`` (length of the query string),
        ``tld_present_params`` (True when tldextract finds a public suffix in
        the query string) and ``qty_params`` (number of parameters) added.
        When the URL has no query string, a new dict with every key set to
        -1 is returned instead.
    """
    q = urlparse(full_url).query
    counts['params_length'] = len(q)
    counts['tld_present_params'] = bool(tldextract.extract(q).suffix)
    # Count every parameter occurrence: keep blank values ("a=") and count
    # repeated keys once per occurrence; plain len(parse_qs(q)) dropped the
    # former and collapsed the latter.
    parsed = parse_qs(q, keep_blank_values=True)
    counts['qty_params'] = sum(len(values) for values in parsed.values())
    if len(q) == 0:
        # No query string: mark every params feature as absent.
        counts = {a: -1 for a in counts}
    return counts

In [79]:
def _resolve_or_none(domain, rdtype):
    """Resolve ``domain`` for record type ``rdtype``; return None if the lookup fails."""
    try:
        return dns.resolver.resolve(domain, rdtype)
    except dns.exception.DNSException:
        # Base class of dnspython's NXDOMAIN / NoAnswer / Timeout / NoNameservers.
        return None


def _first_datetime(value):
    """Return ``value`` (or its first item when it is a list) if it is a datetime, else None."""
    if isinstance(value, list):
        value = value[0] if value else None
    return value if isinstance(value, datetime.datetime) else None


def table_6_features(full_url, timeout=10):
    """Collect the network-backed features (DNS, WHOIS, ASN, TLS, redirects) of a URL.

    Every lookup is best effort: a feature whose lookup fails is reported
    as -1 instead of aborting the whole extraction.

    Parameters
    ----------
    full_url : str
        The complete URL being analysed.
    timeout : float, optional
        Timeout in seconds passed to each ``requests.get`` call.

    Returns
    -------
    dict
        Feature name -> value, in a fixed key order.
    """
    # https://www.dnspython.org/examples.html
    u = tldextract.extract(full_url)
    # Skip empty parts so an IP-address URL (no suffix) does not end in '.'.
    d = '.'.join(part for part in (u.domain, u.suffix) if part)
    now = datetime.datetime.now()
    features = dict()

    # Resolve the A record once and reuse it for timing, ASN, count and TTL.
    a_answer = _resolve_or_none(d, 'A')
    txt_answer = _resolve_or_none(d, 'TXT')
    ns_answer = _resolve_or_none(d, 'NS')
    mx_answer = _resolve_or_none(d, 'MX')

    # dnspython's reported query time scaled by 1000 (presumably s -> ms).
    features['time_response'] = (a_answer.response.time * 1000) if a_answer is not None else -1
    # https://support.mailessentials.gfi.com/hc/en-us/articles/360015116520-How-to-check-and-read-a-Sender-Policy-Framework-record-for-a-domain
    features['domain_spf'] = ('spf' in str(txt_answer.rrset)) if txt_answer is not None else -1

    # https://github.com/JustinAzoff/python-cymruwhois
    try:
        features['asn_ip'] = Client().lookup(a_answer[0].to_text()).asn if a_answer is not None else -1
    except OSError:
        # Could not reach the ASN lookup service.
        features['asn_ip'] = -1

    try:
        who = whois.whois(d)
    except (whois.parser.PywhoisError, OSError):
        who = {}
    # python-whois returns a single datetime, a list of datetimes, or None.
    created = _first_datetime(who.get('creation_date'))
    expires = _first_datetime(who.get('expiration_date'))
    features['time_domain_activation'] = (now - created).days if created is not None else -1
    features['time_domain_expiration'] = (expires - now).days if expires is not None else -1

    features['qty_ip_resolved'] = len(a_answer) if a_answer is not None else -1
    features['qty_nameservers'] = len(ns_answer) if ns_answer is not None else -1
    features['qty_mx_servers'] = len(mx_answer) if mx_answer is not None else -1
    features['ttl_hostname'] = a_answer.rrset.ttl if a_answer is not None else -1

    # https://www.geeksforgeeks.org/ssl-certificate-verification-python-requests/
    try:
        requests.get('https://' + d, timeout=timeout)
        features['tls_ssl_certificate'] = True
    except requests.exceptions.SSLError:
        # requests wraps certificate failures in its own SSLError.
        features['tls_ssl_certificate'] = False
    except requests.exceptions.RequestException:
        features['tls_ssl_certificate'] = -1

    try:
        # Follow the URL under analysis (not a fixed site); each entry in
        # ``history`` is one redirect response that was followed.
        features['qty_redirects'] = len(requests.get(full_url, timeout=timeout).history)
    except requests.exceptions.RequestException:
        features['qty_redirects'] = -1

    features['url_google_index'] = 0 # TODO
    features['domain_google_index'] = 0 # TODO
    features['url_shortened'] = d.lower() in ['tinyurl.com', 'bit.ly', 't.co'] # TODO: add more URL shortening services
    return features

In [87]:
# Sample URL used by the demonstration cells below.
test_url = "https://www.google.com/search?q=test"

In [98]:
# Special-character counts over the whole sample URL.
t1_counts = build_counts(count_chars(test_url), 'url')

In [99]:
# Extend the URL-level counts with the TLD length, URL length and email flag.
table_1 = table_1_features(test_url, t1_counts)

In [100]:
table_1

{'qty_dot_url': 2,
 'qty_hyphen_url': 0,
 'qty_underline_url': 0,
 'qty_slash_url': 3,
 'qty_questionmark_url': 1,
 'qty_equal_url': 1,
 'qty_at_url': 0,
 'qty_and_url': 0,
 'qty_exclamation_url': 0,
 'qty_space_url': 0,
 'qty_tilde_url': 0,
 'qty_comma_url': 0,
 'qty_plus_url': 0,
 'qty_asterisk_url': 0,
 'qty_hashtag_url': 0,
 'qty_dollar_url': 0,
 'qty_percent_url': 0,
 'qty_tld_url': 3,
 'length_url': 36,
 'email_in_url': False}

In [101]:
# Special-character counts over the domain part reported by tldextract.
t2_counts = build_counts(count_chars(tldextract.extract(test_url).domain), 'domain')

In [102]:
# Extend the domain counts with vowel count, length, IP and server/client flags.
table_2 = table_2_features(test_url, t2_counts)

In [103]:
table_2

{'qty_dot_domain': 0,
 'qty_hyphen_domain': 0,
 'qty_underline_domain': 0,
 'qty_slash_domain': 0,
 'qty_questionmark_domain': 0,
 'qty_equal_domain': 0,
 'qty_at_domain': 0,
 'qty_and_domain': 0,
 'qty_exclamation_domain': 0,
 'qty_space_domain': 0,
 'qty_tilde_domain': 0,
 'qty_comma_domain': 0,
 'qty_plus_domain': 0,
 'qty_asterisk_domain': 0,
 'qty_hashtag_domain': 0,
 'qty_dollar_domain': 0,
 'qty_percent_domain': 0,
 'qty_vowels_domain': 3,
 'domain_length': 6,
 'domain_in_ip': False,
 'server_client_domain': False}

In [104]:
# Special-character counts over the directory part of the URL path
# (everything before the last '/', leading slashes stripped).
t3_counts = build_counts(count_chars(os.path.split(urlparse(test_url).path)[0].lstrip('/')), 'directory')

In [105]:
# Add directory_length; every value becomes -1 when the URL has no directory.
table_3 = table_3_features(test_url, t3_counts)

In [106]:
table_3

{'qty_dot_directory': -1,
 'qty_hyphen_directory': -1,
 'qty_underline_directory': -1,
 'qty_slash_directory': -1,
 'qty_questionmark_directory': -1,
 'qty_equal_directory': -1,
 'qty_at_directory': -1,
 'qty_and_directory': -1,
 'qty_exclamation_directory': -1,
 'qty_space_directory': -1,
 'qty_tilde_directory': -1,
 'qty_comma_directory': -1,
 'qty_plus_directory': -1,
 'qty_asterisk_directory': -1,
 'qty_hashtag_directory': -1,
 'qty_dollar_directory': -1,
 'qty_percent_directory': -1,
 'directory_length': -1}

In [107]:
# Special-character counts over the last segment of the URL path.
t4_counts = build_counts(count_chars(os.path.split(urlparse(test_url).path)[1]), 'file')

In [108]:
# Add file_length; every value becomes -1 when the path has no file segment.
table_4 = table_4_features(test_url, t4_counts)

In [109]:
table_4

{'qty_dot_file': 0,
 'qty_hyphen_file': 0,
 'qty_underline_file': 0,
 'qty_slash_file': 0,
 'qty_questionmark_file': 0,
 'qty_equal_file': 0,
 'qty_at_file': 0,
 'qty_and_file': 0,
 'qty_exclamation_file': 0,
 'qty_space_file': 0,
 'qty_tilde_file': 0,
 'qty_comma_file': 0,
 'qty_plus_file': 0,
 'qty_asterisk_file': 0,
 'qty_hashtag_file': 0,
 'qty_dollar_file': 0,
 'qty_percent_file': 0,
 'file_length': 6}

In [110]:
# Special-character counts over the query string of the sample URL.
t5_counts = build_counts(count_chars(urlparse(test_url).query), 'params')

In [111]:
# Add the query-string features; every value becomes -1 when there is no query.
table_5 = table_5_features(test_url, t5_counts)

In [112]:
table_5

{'qty_dot_params': 0,
 'qty_hyphen_params': 0,
 'qty_underline_params': 0,
 'qty_slash_params': 0,
 'qty_questionmark_params': 0,
 'qty_equal_params': 1,
 'qty_at_params': 0,
 'qty_and_params': 0,
 'qty_exclamation_params': 0,
 'qty_space_params': 0,
 'qty_tilde_params': 0,
 'qty_comma_params': 0,
 'qty_plus_params': 0,
 'qty_asterisk_params': 0,
 'qty_hashtag_params': 0,
 'qty_dollar_params': 0,
 'qty_percent_params': 0,
 'params_length': 6,
 'tld_present_params': False,
 'qty_params': 1}

In [27]:
# External-lookup features for the sample URL (this call performs network requests).
table_6 = table_6_features(test_url)

In [28]:
table_6

{'time_response': 3.605365753173828,
 'domain_spf': True,
 'asn_ip': '15169',
 'time_domain_activation': 9121,
 'time_domain_expiration': 2200,
 'qty_ip_resolved': 1,
 'qty_nameservers': 4,
 'qty_mx_servers': 1,
 'ttl_hostname': 217,
 'tls_ssl_certificate': True,
 'qty_redirects': 1,
 'url_google_index': 0,
 'domain_google_index': 0,
 'url_shortened': False}

In [42]:
# Merge the six feature dicts with the dict union operator (Python 3.9+)
# and display them as a single Series.
pd.Series(table_1 | table_2 | table_3 | table_4 | table_5 | table_6)

qty_dot_url                 2
qty_hyphen_url              0
qty_underline_url           0
qty_slash_url               3
qty_questionmark_url        1
                        ...  
tls_ssl_certificate      True
qty_redirects               1
url_google_index            0
domain_google_index         0
url_shortened           False
Length: 111, dtype: object

In [118]:
def build_inference(url):
    """Build the full feature vector for ``url``.

    Runs the six feature-table builders on the URL, its domain, directory,
    file name and query string, and merges their results.

    Parameters
    ----------
    url : str
        The URL to extract features from.

    Returns
    -------
    pandas.Series
        All features of the six tables, indexed by feature name.
    """
    # Every step works on the ``url`` argument; previously the body read the
    # notebook-global ``test_url`` and ignored the parameter.
    parsed = urlparse(url)
    directory, filename = os.path.split(parsed.path)

    t1_counts = build_counts(count_chars(url), 'url')
    table_1 = table_1_features(url, t1_counts)
    t2_counts = build_counts(count_chars(tldextract.extract(url).domain), 'domain')
    table_2 = table_2_features(url, t2_counts)
    t3_counts = build_counts(count_chars(directory.lstrip('/')), 'directory')
    table_3 = table_3_features(url, t3_counts)
    t4_counts = build_counts(count_chars(filename), 'file')
    table_4 = table_4_features(url, t4_counts)
    t5_counts = build_counts(count_chars(parsed.query), 'params')
    table_5 = table_5_features(url, t5_counts)
    table_6 = table_6_features(url)
    return pd.Series(table_1 | table_2 | table_3 | table_4 | table_5 | table_6)

In [116]:
# Run the end-to-end feature builder on the sample URL.
u = build_inference(test_url)

In [117]:
u

qty_dot_url                 2
qty_hyphen_url              0
qty_underline_url           0
qty_slash_url               3
qty_questionmark_url        1
                        ...  
tls_ssl_certificate      True
qty_redirects               1
url_google_index            0
domain_google_index         0
url_shortened           False
Length: 111, dtype: object