# Task

Download and process an arbitrary file from Common Crawl (https://index.commoncrawl.org/), extract individual items, perform basic statistical analysis (distribution of hosts, words, languages, domains etc.) and visualization (optional).

# Solution

Please, run jupyter notebook in AcquisitionAndAnalysis directory to give access to prjnlp_utils package.

First, get the link to warc file manually (we should process one warc only, so we don't need to automate this mechanism), using following steps:

0. Open https://index.commoncrawl.org/ page.
0. Download cc-index.paths.gz for February 2019
0. Unpack it, get the first line 'cc-index/collections/CC-MAIN-2019-09/indexes/cdx-00000.gz'. Add to this filename prefix 'https://commoncrawl.s3.amazonaws.com'
0. Download and unpack https://commoncrawl.s3.amazonaws.com/cc-index/collections/CC-MAIN-2019-09/indexes/cdx-00000.gz.
0. Find in unpacked cdx-00000 any warc file name you want. I randomly selected crawl-data/CC-MAIN-2019-09/segments/1550247479627.17/warc/CC-MAIN-20190215224408-20190216010408-00052.warc.gz
0. Add the same prefix https://commoncrawl.s3.amazonaws.com and get the link to the warc file which we will to process further.

https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-09/segments/1550247479627.17/warc/CC-MAIN-20190215224408-20190216010408-00052.warc.gz

In [3]:
import sys
sys.path.append('..')

import os
from prjnlp_utils import download_with_progress

In [4]:
warc_gz_url: str = \
    'https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-09/segments/1550247479627.17/warc/CC-MAIN-20190215224408-20190216010408-00052.warc.gz'
archive_name: str = os.path.join(os.getcwd(),
                                 'data/CC-MAIN-20190215224408-20190216010408-00052.warc.gz')


In [5]:
# download wark:

download_with_progress(warc_gz_url, archive_name)

File 'D:\git-nlp\ss-prj-nlp-2019\students\SergeSotnyk\03-data\AcquisitionAndAnalysis\04-Common-Crawl\data\CC-MAIN-20190215224408-20190216010408-00052.warc.gz' is already existed, downloading was skipped.


In [10]:
import gzip
from tqdm.auto import tqdm
from collections import namedtuple, defaultdict
import html2text
from selectolax.parser import HTMLParser
from textblob import TextBlob
import langdetect
from bs4 import BeautifulSoup

SinglePageInfo = namedtuple('SinglePageInfo', ['target_uri', 'server', 'language'])

tokens = defaultdict(int)
langs = defaultdict(int)
uris = set()

h = html2text.HTML2Text()
h.ignore_links = True
h.ignore_images = True
h.ignore_tables = True
h.ignore_emphasis = True

def get_text_selectolax(html):
    tree = HTMLParser(html)

    if tree.body is None:
        return None

    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()

    text = tree.body.text(separator='\n')
    return text

def get_text_bs(html):
    tree = BeautifulSoup(html, 'lxml')

    body = tree.body
    if body is None:
        return None

    for tag in body.select('script'):
        tag.decompose()
    for tag in body.select('style'):
        tag.decompose()

    text = body.get_text(separator='\n')
    return text

def process_part(part):
    TARGET_URI_START = 'WARC-Target-URI:'
    if len(part) > 1 and part[1].startswith('WARC-Type: response'):
        html = ''.join(part)
        htmlstart = '<html>'
        pos = html.find(htmlstart)
        if pos >= 0:
            html = html[pos:]
            # txt = h.handle(html)
            txt = get_text_bs(html)
            if not txt:
                return
            blob = TextBlob(txt)
            # blob_lang = blob if len(txt)<2048 else TextBlob(txt[:2048])
            try:
                lang = langdetect.detect(txt if len(txt)<2048 else txt[:2048])
            except:
                lang = 'unknown'
            langs[lang] += 1
            for t in blob.words:
                tokens[t] += 1
            url = next((l for l in part[:20] if l.startswith(TARGET_URI_START)), None)
            if url:
                url = url[len(TARGET_URI_START):].strip()
                uris.add(url)
            

def process_warc_gz(filename: str):
    lines_counter = 0
    res = []
    with gzip.open(filename, 'rt', encoding='utf-8', errors='replace') as f:
        # 52362938 - it is hardcoded value, I've counted it in the first experiment
        part = []

        for line in tqdm(f, total=52362938, unit='lines'):
            lines_counter += 1
            if line.startswith('WARC/1.0'):  # new part started
                if part:
                    process_part(part)
                    # res.append(part)
                part = []
                # if lines_counter>5000000:
                #    break
            part.append(line)

        if part:  # last part
            process_part(part)
            # res.append(part)

    print(f"Total lines: {lines_counter}")
    return res


In [11]:
# %load_ext line_profiler
# %lprun -f process_part process_warc_gz(archive_name)
process_warc_gz(archive_name)

HBox(children=(IntProgress(value=0, max=52362938), HTML(value='')))

Total lines: 52362938


[]

In [14]:
langs_list = sorted([(k, v) for k, v in langs.items() if k.isalnum()], 
                     key=lambda x: -x[1])
print(langs_list)

[('en', 2413), ('ru', 611), ('bn', 565), ('de', 307), ('fr', 192), ('ja', 153), ('es', 145), ('unknown', 132), ('ko', 115), ('nl', 87), ('it', 81), ('pl', 79), ('pt', 70), ('cs', 67), ('tr', 56), ('fa', 45), ('sv', 40), ('ca', 32), ('id', 31), ('vi', 30), ('uk', 30), ('ro', 29), ('hu', 27), ('da', 24), ('no', 24), ('fi', 22), ('sk', 20), ('el', 19), ('et', 14), ('bg', 12), ('th', 12), ('sl', 11), ('cy', 11), ('ar', 11), ('tl', 11), ('lt', 10), ('he', 9), ('hr', 9), ('af', 5), ('so', 3), ('lv', 3), ('hi', 2), ('ta', 2)]


In [15]:
print(f"Total tokens {len(tokens)}")
tokens_list = sorted([(k, v) for k, v in tokens.items() if k.isalnum()], 
                     key=lambda x: -x[1])
print(f"Cleared tokens {len(tokens_list)}")
print("The most frequent tokens:")
print(tokens_list[:20])

Total tokens 630222
Cleared tokens 476255
The most frequent tokens:
[('a', 968935), ('e', 835190), ('t', 777933), ('i', 677225), ('s', 639936), ('r', 577643), ('o', 559186), ('n', 539051), ('l', 502788), ('d', 471864), ('p', 402999), ('h', 351213), ('c', 341700), ('m', 279178), ('f', 278348), ('1', 270833), ('0', 231168), ('g', 212273), ('2', 202892), ('b', 201289)]


In [16]:
print(f"Total uries {len(uris)}")

Total uries 5889
