### Common Crawl statistics

##### References
* List of index files: https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-09/cc-index.paths.gz
* Index: https://commoncrawl.s3.amazonaws.com/cc-index/collections/CC-MAIN-2019-09/indexes/cdx-00100.gz (0.8Gb)
* WARC file: https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-09/segments/1550247482347.44/warc/CC-MAIN-20190217172628-20190217194628-00620.warc.gz (1.0Gb)
* [The WARC Format 1.1](http://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/)

In [1]:
import gzip
import json
import os
import re
from collections import Counter

In [2]:
# Host (authority) part of a URL: everything after the first '://' up to the
# next '/', '?', '#' or whitespace.  Stopping at whitespace keeps the trailing
# CRLF of a header line out of the host when the URL has no path.
re_host = re.compile(r'(?<=://)[^/\s?#]+')

# Last label of a host name (its top-level domain).  The lookahead tolerates,
# but does not capture, a trailing root dot, a ':port' suffix and trailing
# whitespace; '-' is allowed so punycode TLDs such as 'xn--p1ai' stay intact.
re_top = re.compile(r'[\w-]+(?=\.?(?::\d+)?\s*$)')

# Value of a 'charset=' attribute, with or without a leading quote
# (case-insensitive).  The full match (group 0) is the charset name.
re_charset = re.compile(r'''((?<=charset=["'])|(?<=charset=))[\w-]+''', re.IGNORECASE)

# Any markup tag: '<' followed by everything up to the next '>'.
re_notags = re.compile(r'<[^>]*>')

In [3]:
# Tallies filled in while the WARC file is scanned.
hosts = Counter()        # host name -> number of occurrences
tops = Counter()         # top-level domain -> number of occurrences
languages = Counter()    # language name -> number of occurrences

# Parser state describing the record currently being read.
warc_type = ''           # value of the record's WARC-Type header
resp_type = ''           # value of its WARC-Identified-Payload-Type header
encoding = 'utf-8'       # codec used to decode each line
cont_length = 0          # value of its Content-Length header, in bytes

# Location of the downloaded WARC file.  Built from the current user's home
# directory instead of a hard-coded '/Users/<name>/...' path so the notebook
# also runs on another account or machine; edit WARC_PATH if the file lives
# somewhere else.
WARC_PATH = os.path.join(
    os.path.expanduser('~'),
    'Downloads',
    'CC-MAIN-20190217172628-20190217194628-00620.warc.gz',
)

# Binary mode: lines come back as bytes and are decoded explicitly.
gz = gzip.open(WARC_PATH, 'rb')

# Read the first line up front; `b` is the raw line, `l` its decoded text.
b = gz.readline()
l = b.decode(encoding)

In [4]:
# Scan the WARC file line by line and collect statistics:
#   * responses / total_length - number of response records and the sum of
#     their Content-Length values;
#   * hosts / tops             - host and top-level domain of every response's
#     WARC-Target-URI;
#   * languages                - first language listed in each
#     'languages-cld2' metadata line.
# Expects `gz`, `b`, `l`, the counters, the parser state variables and the
# compiled regexes to exist already.
responses = 0       # number of response records seen
total_length = 0    # sum of their Content-Length values, in bytes

try:
    while b:
        if l.startswith('WARC/1.'):
            # Version line: a new record begins, forget the previous one.
            warc_type = ''
            resp_type = ''
            encoding = 'utf-8'
            cont_length = 0

        elif l.startswith('WARC-Type:'):
            warc_type = l.split(':', 1)[1].strip()

        if warc_type == 'response':
            if l.startswith('Content-Length:'):
                cont_length = int(l.split(':', 1)[1].strip())
                responses += 1
                total_length += cont_length

            elif l.startswith('WARC-Identified-Payload-Type:'):
                resp_type = l.split(':', 1)[1].strip()

            elif l.startswith('WARC-Target-URI:'):
                match = re_host.search(l)
                if match:
                    host_url = match.group(0)
                    hosts[host_url] += 1
                    # Hosts with no recognisable last label (e.g. an IPv6
                    # literal) are counted as hosts but add no TLD.
                    top_match = re_top.search(host_url)
                    if top_match:
                        tops[top_match.group(0)] += 1

            elif not l.strip():
                # Blank line = end of the WARC header.  Jump over the whole
                # block (HTTP headers + body) so that its lines are never
                # mistaken for WARC headers, whichever optional headers the
                # record carried and in whatever order.
                gz.seek(cont_length, 1)
                # Leave "response" mode so the blank lines that terminate
                # the record do not trigger another skip.
                warc_type = ''

        elif warc_type == 'metadata':
            if l.startswith('languages-cld2:'):
                j = json.loads(l[len('languages-cld2:'):])
                detected = j.get('languages')
                if detected:
                    # Only the first listed language is counted.
                    lang = detected[0]['name']
                    languages[lang] += 1

        b = gz.readline()
        # Response payloads are skipped, so `encoding` stays 'utf-8'; undecodable
        # bytes are replaced instead of aborting the scan.
        l = b.decode(encoding, errors='replace')
finally:
    # Release the file handle even if parsing stops on an error.
    gz.close()

In [5]:
# Summary of the scan.  The average is computed with exact integer division
# and is reported as 'n/a' when no response record was counted, instead of
# failing with ZeroDivisionError.
avg_length = total_length // responses if responses else 'n/a'

print('Total pages stored =', responses)
print('Average content length =', avg_length)
print('Top languages: ', languages.most_common(20))
print('Top-level domains: ', tops.most_common(20))
print('Top hosts: ', hosts.most_common(10))

Total pages stored = 45025
Average content length = 84311
Top languages:  [('ENGLISH', 19341), ('RUSSIAN', 3803), ('GERMAN', 2539), ('Japanese', 2390), ('FRENCH', 2071), ('Chinese', 2006), ('SPANISH', 1839), ('ITALIAN', 1062), ('PORTUGUESE', 966), ('DUTCH', 746), ('POLISH', 728), ('CZECH', 465), ('ChineseT', 411), ('TURKISH', 379), ('INDONESIAN', 362), ('VIETNAMESE', 354), ('SWEDISH', 334), ('Korean', 322), ('PERSIAN', 291), ('ARABIC', 285)]
Top-level domains:  [('com', 20679), ('ru', 2557), ('org', 2403), ('net', 1900), ('de', 1791), ('uk', 1087), ('jp', 955), ('it', 745), ('fr', 713), ('info', 637), ('pl', 628), ('nl', 606), ('br', 527), ('cz', 455), ('ua', 444), ('au', 443), ('cn', 427), ('es', 377), ('se', 316), ('eu', 313)]
Top hosts:  [('stat.ripe.net', 12), ('www.tumblr.com', 12), ('alwafd.news', 11), ('www.galaxus.ch', 11), ('www.deviantart.com', 10), ('500px.com', 9), ('www.aljazeera.com', 9), ('www.urdupoint.com', 9), ('forums.ubi.com', 8), ('www.elperiodico.com', 8)]
