In [16]:
import json
import re
from collections import OrderedDict as od
from operator import itemgetter

from pprint import pprint as pp

# Global constants
# When True, the counting functions stop early so they can be tried on a small sample.
DEBUG = False
# Regex with groups (scheme, host, optional ":port", rest); group 2 is the host.
# Written as a raw string so "\/" and "\d" reach the regex engine untouched
# instead of relying on Python passing unknown escape sequences through.
HOSTNAME_PATTERN = r"""(https?:\/\/)([^:^\/]*)(:\d*)?(.*)?"""

# Crawl index file read by the cells below
filename = 'cdx-00000'


In [None]:
def count_host_distribution(filename):
    """Count representation of host/domain names in a crawl index.

    Every non-blank line is split on its first two spaces; the remainder is
    parsed as a JSON object whose 'url' value is lowercased and matched
    against HOSTNAME_PATTERN to obtain the host.

    Args:
        filename: path of the index file to read.

    Returns:
        OrderedDict() of {host: num_occurrences}, highest count first.
        URLs from which no host can be extracted are reported on stdout
        and left out of the tally.

    Raises:
        IndexError: a non-blank line has fewer than three space-separated fields.
        ValueError: the JSON part of a line cannot be decoded.
        KeyError: the JSON object has no 'url' key.
    """
    hosts = {}

    # The context manager closes the file even when a line fails to parse.
    with open(filename, 'r') as f:
        for c, line in enumerate(f):
            # Blank lines carry no record; skip them instead of failing on the split.
            if not line.strip():
                continue

            json_string = line.split(' ', 2)[2]
            json_data = json.loads(json_string)
            url = json_data['url'].lower()

            # Parse hostname from url; only tally when a host was actually found,
            # so a failed extraction can never re-count the previous line's host.
            match = re.search(HOSTNAME_PATTERN, url)
            if match is None:
                print("Error extracting hostname from:", url)
            else:
                hostname = match.group(2)
                hosts[hostname] = hosts.get(hostname, 0) + 1

            # Debug limiter to test on smaller scope
            if c > 100000 and DEBUG:
                break

    # Order items by descending count (ties keep first-seen order)
    ordered_hosts = od(sorted(hosts.items(), key=itemgetter(1), reverse=True))

    return ordered_hosts

# Pass the index path, not a leftover file handle: count_host_distribution
# opens the file itself, so this cell also works on a freshly started kernel.
hosts_json = json.dumps(count_host_distribution(filename))
with open('hosts_distr.json', 'w') as json_file:
    json_file.write(hosts_json)

In [12]:
from bs4 import BeautifulSoup as bs
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
import requests

# Suppress urllib3's InsecureRequestWarning (the "https error" warning).
# Relevant to the optional live-fetch approach sketched inside count_langs,
# which calls requests.get(url, verify=False).
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def count_langs(filename):
    """Count representation of languages in a crawl index.

    Every non-blank line is split on its first two spaces; the remainder is
    parsed as a JSON object. Records whose URL contains '/robots.txt' and
    records without a 'languages' key are ignored. The 'languages' value is
    lowercased and split on commas, and every non-empty code is tallied.

    Args:
        filename: path of the index file to read.

    Returns:
        OrderedDict() of {lang: num_occurrences}, highest count first.

    Raises:
        IndexError: a non-blank line has fewer than three space-separated fields.
        ValueError: the JSON part of a line cannot be decoded.
        KeyError: the JSON object has no 'url' key.
    """
    c = 0
    langs = {}

    # The context manager closes the file even when a line fails to parse.
    with open(filename, 'r') as f:
        for line in f:
            # Blank lines carry no record; skip them instead of failing on the split.
            if not line.strip():
                continue

            json_string = line.split(' ', 2)[2]
            json_data = json.loads(json_string)

            # The URL is only needed to filter out robots.txt records
            url = json_data['url'].lower()

            # Ignore robots.txt
            if '/robots.txt' in url:
                continue

            try:
                lang_list = json_data['languages'].lower().split(',')
            except KeyError:
                continue

            # Alternatively, languages can be detected directly
            # by requesting a page and running a detection library
            # on the page's text, but this has proven to be slower
            # and less accurate.
            #
            # But it proves the concept that pages can be visited
            # and analyzed individually:
            #
            #     lang_list = []
            #     try:
            #         r = requests.get(url, verify=False)
            #     except:
            #         print("HTTP Request Error:", url)
            #         continue
            #
            #     soup = bs(r.text, 'html.parser')
            #
            #     try:
            #         lang = detect(soup.get_text())
            #         lang_list.append(lang)
            #     except LangDetectException:
            #         print("Lang detect error:", url)
            #         continue

            # Tally languages; empty codes (e.g. from an empty 'languages'
            # value or a stray comma) are not languages and are skipped.
            for lang in lang_list:
                lang = lang.strip()
                if not lang:
                    continue
                langs[lang] = langs.get(lang, 0) + 1

            # Debug limiter: c counts only records that reached the tally
            if c > 1000 and DEBUG:
                break
            c += 1

    # Order items by descending count (ties keep first-seen order)
    ordered_langs = od(sorted(langs.items(), key=itemgetter(1), reverse=True))

    return ordered_langs

# Pass the index path, not a leftover file handle: count_langs opens the
# file itself, so this cell also works on a freshly started kernel.
langs_json = json.dumps(count_langs(filename))
pp(langs_json)

with open('langs_distr.json', 'w') as json_file:
    json_file.write(langs_json)

('{"spa": 5183261, "eng": 3639422, "hye": 533099, "rus": 367110, "sqi": '
 '261662, "ara": 219028, "zho": 157002, "deu": 131672, "por": 97808, "jpn": '
 '89145, "fra": 81097, "ind": 65002, "cat": 63185, "ita": 50238, "tur": 44430, '
 '"fas": 39098, "lat": 38556, "nld": 37310, "dan": 36754, "tha": 36742, "srp": '
 '32959, "grn": 27273, "ron": 21110, "kor": 20500, "pus": 18778, "ukr": 16855, '
 '"pol": 13823, "tat": 11032, "ces": 7618, "uzb": 7068, "vie": 7038, "ina": '
 '6576, "oci": 6490, "hin": 5840, "bul": 5329, "war": 5293, "aar": 5095, '
 '"ell": 4941, "swe": 4805, "msa": 4797, "hun": 4219, "fin": 4136, "nor": '
 '3969, "kin": 3864, "nno": 3639, "aze": 3173, "mal": 2794, "hrv": 2790, '
 '"kat": 2719, "slk": 2631, "kal": 2563, "cos": 2430, "ile": 2357, "slv": '
 '1974, "lit": 1732, "bel": 1730, "heb": 1643, "vol": 1550, "est": 1536, '
 '"lin": 1278, "lav": 1218, "tam": 1157, "bos": 1130, "san": 1089, "mon": '
 '1085, "mkd": 980, "urd": 930, "eus": 892, "ben": 891, "sco": 830, "glg":

In [18]:
# This is an exploratory cell to study entry structure.
# Print the first three raw index entries; the `with` block closes the
# file handle once the cell finishes instead of leaving it open.
with open(filename, 'r') as f:
    for _ in range(3):
        print(f.readline())

0,0,0,1)/ 20190222170014 {"url": "http://1.000.000.000/", "mime": "text/html", "mime-detected": "text/html", "status": "403", "digest": "MRQTVY26B5MGVL2UCLQW6QWN7VMLMYCJ", "length": "1897", "offset": "21996", "filename": "crawl-data/CC-MAIN-2019-09/segments/1550247518497.90/crawldiagnostics/CC-MAIN-20190222155556-20190222181556-00457.warc.gz"}

0,0,0,1)/robots.txt 20190222170014 {"url": "http://1.000.000.000/robots.txt", "mime": "text/html", "mime-detected": "text/html", "status": "403", "digest": "BTYSRH4M542AOCYIHRFCQ5CR7TBPGK2Y", "length": "1903", "offset": "2068", "filename": "crawl-data/CC-MAIN-2019-09/segments/1550247518497.90/robotstxt/CC-MAIN-20190222155556-20190222181556-00212.warc.gz"}

0,0,135,5)/ 20190218010049 {"url": "http://5.135.0.0/", "mime": "text/html", "mime-detected": "text/html", "status": "200", "digest": "AH72W5G37N5O6ZJDCHMSIBVLFHZ4GQJR", "length": "18813", "offset": "2516608", "filename": "crawl-data/CC-MAIN-2019-09/segments/1550247483873.51/warc/CC-MAIN-20190