Quick implementation without tricky statistics

In [1]:
import gzip
from warcio.archiveiterator import ArchiveIterator
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse
from langdetect import detect
from collections import Counter
import string
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Path of the gzip-compressed WARC archive that parse_warc() opens.
dataFile = "D:/Projects/NLP/data.warc.gz"

In [5]:
def get_content(html):
    """Extract the text of an HTML document.

    Parses ``html`` with the lxml parser, removes every ``<script>`` and
    ``<style>`` element, and returns the remaining text with each run of
    newline, carriage-return and tab characters collapsed to one newline
    and leading/trailing whitespace stripped.
    """
    document = BeautifulSoup(html, "lxml")
    # Script and style bodies are code, not page text, so drop them first.
    for tag in document.find_all(["script", "style"]):
        tag.decompose()
    text = document.get_text()
    collapsed = re.sub("[\n\r\t]{1,}", "\n", text)
    return collapsed.strip()

In [9]:
def parse_warc(path=None):
    """Yield the URI and extracted text of each response record in a WARC file.

    Args:
        path: Location of the WARC archive to read. When omitted, the
            module-level ``dataFile`` is used, so existing ``parse_warc()``
            calls behave exactly as before.

    Yields:
        dict: ``{'uri': ..., 'content': ...}`` where ``uri`` is the record's
        ``WARC-Target-URI`` header and ``content`` is ``get_content`` applied
        to the bytes read from the record's raw stream.
    """
    if path is None:
        # Looked up when iteration starts, so a reassigned dataFile is honoured.
        path = dataFile
    with open(path, 'rb') as stream:
        for record in ArchiveIterator(stream):
            # Skip every record type other than 'response'.
            if record.rec_type != 'response':
                continue
            yield {
                'uri': record.rec_headers.get_header('WARC-Target-URI'),
                'content': get_content(record.raw_stream.read()),
            }

In [10]:
def parse_host(url):
    """Return the ``netloc`` component of ``url`` as parsed by ``urlparse``.

    The result is an empty string when the URL has no network location
    (for example when it has no ``//`` after the scheme).
    """
    parts = urlparse(url)
    return parts.netloc

def detect_lang(text):
    """Return the language code detected for ``text``, or ``None`` on failure.

    Detection is best-effort: any ordinary error raised by ``detect`` is
    swallowed and reported as ``None`` so that one problematic page does not
    abort processing of the whole archive.
    """
    try:
        return detect(text)
    except Exception:
        # Deliberately broad, but not a bare ``except``: KeyboardInterrupt and
        # SystemExit must still propagate so a long run can be interrupted.
        return None

def parse_features(text):
    """Return the distinct tokens that ``CountVectorizer`` extracts from ``text``.

    The vectorizer is fitted on ``text`` alone with ``max_features=10000``.
    Each token appears once in the result regardless of how often it occurs
    in ``text``.

    Args:
        text: The document to tokenize.

    Returns:
        list: The fitted vocabulary's feature names, or an empty list when
        the vectorizer finds no tokens at all.
    """
    cv = CountVectorizer(max_features=10000)
    try:
        cv.fit([text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when the text
        # yields no tokens; treat that as "no words" rather than crashing.
        return []
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on old releases that lack it.
    if hasattr(cv, "get_feature_names_out"):
        return list(cv.get_feature_names_out())
    return cv.get_feature_names()
    

In [12]:
# Tallies built in one pass over the archive:
#   host_counter - pages per host
#   lang_counter - pages per detected language
#   word_counter - per word, number of English-detected pages containing it
host_counter = Counter()
lang_counter = Counter()
word_counter = Counter()
for page in parse_warc():
    content = page['content']
    lang = detect_lang(content)
    if lang is not None:
        lang_counter[lang] += 1
    # Word statistics are gathered for English pages only.
    if lang == 'en':
        words = parse_features(content)
        word_counter.update(words)
    host_counter[parse_host(page['uri'])] += 1

  ' Beautiful Soup.' % markup)


In [13]:
# Show the 20 hosts that occur most often among the parsed pages.
host_counter.most_common(20)

[('m.mlb.com', 72),
 ('www.meetup.com', 48),
 ('www.urbandictionary.com', 48),
 ('www.popsugar.com', 41),
 ('www.engadget.com', 39),
 ('en.wikipedia.org', 36),
 ('www.beeradvocate.com', 34),
 ('www.gamefaqs.com', 34),
 ('www.cnet.com', 32),
 ('www.heraldnet.com', 32),
 ('www.agoda.com', 31),
 ('idahoptv.org', 30),
 ('www.boxofficemojo.com', 30),
 ('www.dpreview.com', 30),
 ('www.upi.com', 30),
 ('www.worldcat.org', 30),
 ('stackoverflow.com', 29),
 ('www.hockeyfights.com', 28),
 ('www.nytimes.com', 28),
 ('www.appbrain.com', 27)]

In [14]:
# Show the 5 most frequently detected languages.
lang_counter.most_common(5)

[('en', 44072), ('es', 784), ('de', 608), ('fr', 551), ('pt', 257)]

In [15]:
# Show the 50 most common entries of word_counter (words from pages detected as English).
word_counter.most_common(50)

[('to', 40085),
 ('the', 39867),
 ('of', 39093),
 ('and', 38537),
 ('in', 37798),
 ('for', 36342),
 ('by', 33894),
 ('all', 33561),
 ('on', 33484),
 ('is', 31234),
 ('your', 31031),
 ('this', 30899),
 ('about', 30784),
 ('us', 30770),
 ('with', 30743),
 ('you', 30270),
 ('contact', 28477),
 ('or', 27968),
 ('more', 27704),
 ('home', 27534),
 ('are', 26673),
 ('privacy', 26367),
 ('at', 26164),
 ('new', 26063),
 ('from', 25981),
 ('2016', 25873),
 ('be', 24228),
 ('it', 23639),
 ('search', 23569),
 ('an', 22701),
 ('not', 22579),
 ('up', 22543),
 ('that', 22515),
 ('have', 22498),
 ('our', 22216),
 ('com', 21757),
 ('policy', 21485),
 ('terms', 20648),
 ('as', 20263),
 ('site', 20178),
 ('use', 19793),
 ('help', 19663),
 ('10', 19519),
 ('we', 19485),
 ('no', 18899),
 ('can', 18895),
 ('my', 18773),
 ('rights', 18406),
 ('view', 18280),
 ('news', 18259)]