# Exploring Alexa Top 1 Million Dataset

Download the data from https://data.openintel.nl/data/alexa1m/. You can use the script `src/data/download.py` to download and unpack the files into the `data/raw/` directory. To run the script directly from this notebook, uncomment the following line:

In [1]:
#%run -i '../src/data/download.py'

We use the [fastavro](https://github.com/tebeka/fastavro) library to read Avro files. It can be installed by running `conda env update` followed by `source activate mako`.

In [2]:
import fastavro as avro

# Path to one AVRO file under data/raw/, relative to the notebook's working directory.
file_path = '../data/raw/CO_4FF1624808913952EAC0DDF1851DA930.avro'
# NOTE(review): the file handle is never closed explicitly. The reader is
# consumed by a later cell, so the handle has to stay open at least until then.
reader = avro.reader(open(file_path, 'rb'))
# Schema exposed by the reader for this file.
schema = reader.schema

In [3]:
# Use the Public Suffix List: https://publicsuffix.org/
import os,sys,inspect,importlib
# Locate the project root from the notebook's working directory.
# inspect.getfile(inspect.currentframe()) is not reliable inside a notebook:
# the current frame belongs to a synthetic cell "file" (a placeholder name or
# a temporary path, depending on the kernel version), not to the notebook.
# The working directory is what the relative '../data/raw/...' path used for
# the AVRO file already relies on, so use it here as well.
current_dir = os.getcwd()
project_dir = os.path.dirname(current_dir)
src_dir = os.path.join(project_dir, 'src')
# Guard the insert so re-running this cell does not stack duplicate entries.
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)
# Project-local module living under src/publicsuffix/.
ps = importlib.import_module('publicsuffix.publicsuffix')
public_suffix_file_path = os.path.join(project_dir, 'data', 'raw', 'public_suffix_list.dat')
# Initialise the module from the downloaded Public Suffix List file.
ps.init_suffix_tree(public_suffix_file_path)

In [4]:
def to_debug_str(domain, r):
    """Format one DNS response record as a short, human-readable line.

    Args:
        domain: Root domain the record was grouped under. Currently unused;
            kept so existing callers keep working.
        r: Mapping for a single record. Must contain 'response_type' and
            'response_name', plus the data field for the record type
            ('ip4_address' for A, 'ip6_address' for AAAA, 'cname_name'
            for CNAME).

    Returns:
        'Unknown response type' when the record has no response type,
        otherwise '<response_name> <response_type> <data>'. For record types
        without a dedicated data field, the response name is repeated as the
        data column.

    Raises:
        KeyError: if a required field is missing from the record.
    """
    rtype = r['response_type']
    response_name = r['response_name']
    if not rtype:
        return 'Unknown response type'

    # Field that carries the record data for each type we know how to show.
    # The IPv6 address record type is 'AAAA' (four A's), not 'AAA'.
    data_fields = {
        'A': 'ip4_address',
        'AAAA': 'ip6_address',
        'CNAME': 'cname_name',
    }
    data_field = data_fields.get(rtype, 'response_name')
    return '%s %s %s' % (response_name, rtype, r[data_field])

In [5]:
from itertools import islice

# Group debug strings for up to the next 1000 records by their root domain.
# NOTE: `reader` is a stateful iterator, so re-running this cell continues
# from where the previous run stopped rather than from the start of the file.
obj_hash = {}
# islice ends the loop cleanly when fewer than 1000 records remain, instead of
# letting an explicit next() call raise StopIteration part-way through.
for r in islice(reader, 1000):
    if not r:
        continue
    response_name = r['response_name']
    domain = ps.get_root_domain(response_name)

    # Records for which no root domain is returned are skipped.
    if domain is not None:
        obj_hash.setdefault(domain, []).append(to_debug_str(domain, r))

In [6]:
# Show the root domains collected by the grouping loop.
obj_hash.keys()

dict_keys(['canalplusgroupe.com', 'balancebeamsituation.com', 'emissary.io', 'lgi.de', 'confessionsofatrolleydolly.com', 'abilet.pl', 'eliquid-shop.com', 'jcstaff.com', 'googleusercontent.com', 'epopeyasvirales.org', 'gofingerstyle.com', 'absolutdelicios.blogspot.ro', 'iwatchonline.ph', 'yandex.ru', 'doyugames.com', 'bundpol.de', 'jdand.co', 'best-free-wallpapers.xyz', 'dizigold.org', 'iosandroidapp.com', 'microsoftonline.com', 'davidjb.com', 'danskeforsikring.dk', 'agoenghanyokrokusumo.blogspot.co.id', 'arcus-mc.com', 'yandex.net', 'daz-lernwerkstatt.de', 'cccd.dk', 'la-couronne.org', 'gambarabgbugil.xyz', 'bogner-team.de', 'ffi.no', 'abc.nl', 'nsatc.net', 'cdn13.com', 'droneanalyst.com', 'goodbye2016.com', 'librosgratisparaeluniversitario.blogspot.com', 'ovh.net', 'bigcartel.map.fastly.net', 'blogaudiopremiere.blogspot.mx', 'ladanivabelgium.be', 'karelshungit.com', 'lifeafterhysterectomy.com', 'cmicho.blogspot.com', 'essbett.com', 'facildeconsertar.blogspot.com.br', 'chapka.fr', 'hot

In [9]:
# Print every grouped record, one section per root domain.
for domain, entries in obj_hash.items():
    print('Domain: %s' % domain)
    for line in entries:
        print('  %s' % line)
    # Blank separator between domain sections.
    print('\n')

Domain: canalplusgroupe.com
  canalplusgroupe.com. SOA canalplusgroupe.com.
  canalplusgroupe.com. A 194.4.244.108
  www.canalplusgroupe.com. A 194.4.244.108
  canalplusgroupe.com. NS canalplusgroupe.com.
  canalplusgroupe.com. NS canalplusgroupe.com.
  canalplusgroupe.com. NS canalplusgroupe.com.


Domain: balancebeamsituation.com
  balancebeamsituation.com. SOA balancebeamsituation.com.
  balancebeamsituation.com. A 192.0.78.24
  balancebeamsituation.com. A 192.0.78.25
  balancebeamsituation.com. A 192.0.78.24
  balancebeamsituation.com. A 192.0.78.25
  www.balancebeamsituation.com. CNAME balancebeamsituation.com.
  balancebeamsituation.com. A 192.0.78.24
  balancebeamsituation.com. A 192.0.78.25
  mail.balancebeamsituation.com. CNAME balancebeamsituation.com.
  www.balancebeamsituation.com. CNAME balancebeamsituation.com.
  mail.balancebeamsituation.com. CNAME balancebeamsituation.com.
  balancebeamsituation.com. NS balancebeamsituation.com.
  balancebeamsituation.com. NS balancebea