Same as v1, but with stricter filtering by language

In [1]:
from urllib.parse import urlparse, urljoin
import trafilatura
from tqdm.auto import tqdm, trange
import requests
from bs4 import BeautifulSoup
from courlan import clean_url
import json

In [2]:
import fasttext
from collections import Counter

class LanguageDetector:
    def __init__(self, path="../langid/lid.323.bin"):
        self.model = fasttext.load_model(path)

    def predict_lang(self, text, k=10):
        text = text.replace('\n', '  ')
        langs, proba = self.model.predict(text, k=k)
        res = Counter(dict(zip([lang[9:] for lang in langs], proba)))
        for key in ['ru', 'myv']:
            if key not in res:
                res[key] = 0
        return res
    
LD = LanguageDetector()




In [3]:
seeds = '''
	https://erziana.my1.ru/publ/tatjana_matorkina_mizolkst/1-1-0-712
		https://erziana.my1.ru/index/proza/0-10
	https://erzinform.ucoz.ru
	http://erzan.ru
		http://erzan.ru/news/vozdvizhenie-goroda-perevod-s-jerzjanskogo-oshon-putoma-mastorava-erzjanskij-jepos
		http://erzan.ru/news/sonety-shekspira-na-jerzjanskom-jazyke-perevod-kuzmy-abramova
	http://erzianraske.forum24.ru
	http://lazalyk.narod.ru/
	http://erzianj.borda.ru/
    http://goloserzi.ru
'''.split()

In [4]:
white_domains = {urlparse(url).netloc for url in seeds}
white_domains

{'erzan.ru',
 'erziana.my1.ru',
 'erzianj.borda.ru',
 'erzianraske.forum24.ru',
 'erzinform.ucoz.ru',
 'goloserzi.ru',
 'lazalyk.narod.ru'}

In [5]:
downloaded_pages = dict()
downloaded_myv = dict()
urls_queue = set(seeds)
tot = 0
all_myv = set()

In [6]:
bad_domains = {
    'wikipedia.org',
    'wikidata.org',
    'wikisource.org',
    'openstreetmap.org',
    'wikimedia.org',
    'google.com',
    'wiktionary.org',
    'wikibooks.org',
    'util.unicode.org'
}
bad_extensions = {'pdf', 'jpg', 'mp3', 'sig', 'rtf', 'doc', 'docx'}

In [7]:
html = requests.get('https://erziana.my1.ru/publ/stikhi/dmitrij_taganov_monen_marjavs_vese_uli_mejle/2-1-0-164').text

In [8]:
soup = BeautifulSoup(html)

In [9]:
def text_with_newlines(elem):
    text = ''
    for e in elem.children:
        if isinstance(e, str):
            text += e.strip()
            continue
        elif e.name in {'script', 'style'}:
            continue
        else:
            if e.name in {'br', 'p', 'div'}:
                text += '\n'
            text += text_with_newlines(e)
    return text

tq = trange(10)
for i in tq:
    if not urls_queue:
        break
    url = urls_queue.pop()
    
    if 'javascript' in url or any(d in url for d in bad_domains):
        continue
    if url.split('.')[-1].lower() in bad_extensions:
        continue
    
    print(url)
    try:
        html = requests.get(url, timeout=5).text
    except Exception as e:
        print(e)
        continue
    downloaded_pages[url] = html
    
    soup = BeautifulSoup(html)
    content = trafilatura.extract(html, favor_recall=True)
    has_new_myv = False
    if not content:
        langs = []
    else:
        lines = content.split('\n')
        langs = [LD.predict_lang(t).most_common(1)[0][0] for t in lines]
        for line, lang in zip(lines, langs):
            if lang == 'myv':
                if line not in all_myv:
                    has_new_myv = True
                    all_myv.add(line)
                    
    print(has_new_myv, set(langs))
    if has_new_myv:
        tot += 1
        tq.set_description(str(tot) + ' ' + str(len(all_myv)))

    if urlparse(url).netloc not in white_domains and not has_new_myv:
        continue

    
    for a in soup.findAll('a'):
        if not a.has_attr('href'):
            # skip anchors
            continue
        href = a['href']
        if href.startswith('#'):
            continue
        if 'javascript' in href:
            continue
        if any(d in href for d in bad_domains):
            continue # because we parse Wikipedia separately
        if href.split('.')[-1].lower() in bad_extensions:
            continue
        new_url = urljoin(url, href).split('#')[0]
        if new_url not in downloaded_pages:
            urls_queue.add(new_url)
        

In [10]:
import urllib3
http = urllib3.PoolManager(num_pools=100)

In [73]:
import httpx

In [76]:
# httpx.get('https://github.com/', timeout=1).text

'\n\n\n\n\n\n<!DOCTYPE html>\n<html lang="en"  data-a11y-animated-images="system">\n  <head>\n    <meta charset="utf-8">\n  <link rel="dns-prefetch" href="https://github.githubassets.com">\n  <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">\n  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">\n  <link rel="preconnect" href="https://github.githubassets.com" crossorigin>\n  <link rel="preconnect" href="https://avatars.githubusercontent.com">\n\n\n\n  <link crossorigin="anonymous" media="all" integrity="sha512-ksfTgQOOnE+FFXf+yNfVjKSlEckJAdufFIYGK7ZjRhWcZgzAGcmZqqArTgMLpu90FwthqcCX4ldDgKXbmVMeuQ==" rel="stylesheet" href="https://github.githubassets.com/assets/light-92c7d381038e.css" /><link crossorigin="anonymous" media="all" integrity="sha512-1KkMNn8M/al/dtzBLupRwkIOgnA9MWkm8oxS+solP87jByEvY/g4BmoxLihRogKcX1obPnf4Yp7dI0ZTWO+ljg==" rel="stylesheet" href="https://github

In [101]:
import eventlet
from eventlet.green.urllib.request import urlopen

In [120]:
resp = urlopen('http://erzianj.borda.ru/')

In [121]:
resp.status == 200

True

In [122]:
res = resp.read()
res

b'<!DOCTYPE html><HTML>\r<HEAD>\r<TITLE>\xcc\xe5\xeb\xfc\xea\xf3\xe6\xee \xdd\xf0\xe7\xff\xed\xfc \xcc\xe0\xf1\xf2\xee\xf0 - \xd4\xee\xf0\xf3\xec \xd1\xf2\xf0\xe0\xed\xe0 \xdd\xf0\xe7\xff\xed, \xd4\xee\xf0\xf3\xec \xdd\xf0\xe7\xff \xed\xe0\xf0\xee\xe4\xe0</TITLE>\r<STYLE type=text/css>\rBODY{margin:10px;margin-top:15px;margin-bottom:10px;}\rtd{font-family:Verdana;}\r\r.font1{font-size:12px;}\r.font2{font-size:11px;font-weight:400;}\r.font3{font-size:11px;font-weight:700;}\r.font4{font-size:15px;}\r.font5{font-size:11px;font-weight:700;COLOR:#ffa450;}\r.font6{font-size:11px;font-weight:700;}\r\ra:link{color:#006699;text-decoration:none;}\ra:visited{color:#5493b4;text-decoration:none;}\ra:hover{color: #dd6900;text-decoration:underline;}\r\rtr.font3 {HEIGHT: 28px;BACKGROUND-IMAGE: url(//forum24.ru/gif/skin/fon3.gif);}\rtr.font5 {HEIGHT: 30px;BACKGROUND-IMAGE: url(//forum24.ru/gif/skin/fon4.gif);FONT-WEIGHT:700;}\rtr.font6 {HEIGHT: 28px;BACKGROUND-IMAGE: url(//forum24.ru/gif/skin/fon3.gif)

In [126]:
urlopen('http://erzianj.borda.ru/').read().decode()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xcc in position 36: invalid continuation byte

In [127]:
requests.get('http://erzianj.borda.ru/').text

'<!DOCTYPE html><HTML>\r<HEAD>\r<TITLE>Мелькужо Эрзянь Мастор - Форум Страна Эрзян, Форум Эрзя народа</TITLE>\r<STYLE type=text/css>\rBODY{margin:10px;margin-top:15px;margin-bottom:10px;}\rtd{font-family:Verdana;}\r\r.font1{font-size:12px;}\r.font2{font-size:11px;font-weight:400;}\r.font3{font-size:11px;font-weight:700;}\r.font4{font-size:15px;}\r.font5{font-size:11px;font-weight:700;COLOR:#ffa450;}\r.font6{font-size:11px;font-weight:700;}\r\ra:link{color:#006699;text-decoration:none;}\ra:visited{color:#5493b4;text-decoration:none;}\ra:hover{color: #dd6900;text-decoration:underline;}\r\rtr.font3 {HEIGHT: 28px;BACKGROUND-IMAGE: url(//forum24.ru/gif/skin/fon3.gif);}\rtr.font5 {HEIGHT: 30px;BACKGROUND-IMAGE: url(//forum24.ru/gif/skin/fon4.gif);FONT-WEIGHT:700;}\rtr.font6 {HEIGHT: 28px;BACKGROUND-IMAGE: url(//forum24.ru/gif/skin/fon3.gif);}\r\rinput,select,textarea{font-size:11px;font-family:Verdana;}\rtextarea{width:100%;height:120;}\rinput.sender{background-color:#ffffff;font-weight:700;

In [124]:
type(res)

bytes

In [125]:
res.decode('utf-8')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xcc in position 36: invalid continuation byte

In [77]:
def process_url(url, verbose=False, max_myv_threshold=0.99):
    """ return url, html and list of new urls """
    result = {'url': url}
    
    if url in downloaded_pages:
        return result
    if 'javascript' in url or any(d in url for d in bad_domains):
        return result
    if url.split('.')[-1].lower() in bad_extensions:
        return result

    try:
        # html = requests.get(url, timeout=5).text
        # html = http.request('GET', url, timeout=5).data.decode('utf-8')
        # html = httpx.get(url, timeout=5).text
        resp = urlopen(task)
        if resp.status != 200:
            return result
        h = resp.headers.as_string()
        
    except Exception as e:
        if verbose:
            print(url, e)
        return result
    
    result['html'] = html
    
    soup = BeautifulSoup(html)
    content = (trafilatura.extract(html, favor_recall=True) or '') + '\n\n' + text_with_newlines(soup)
    new_myv = set()
    max_myv_score = 0
    if not content:
        langs = []
    else:
        lines = content.split('\n')
        lang_preds = [LD.predict_lang(t) for t in lines]
        for line, lang_pred in zip(lines, lang_preds):
            if lang_pred.most_common(1)[0][0] == 'myv':
                if line not in all_myv:
                    new_myv.add(line)
                    max_myv_score = max(max_myv_score, lang_pred['myv'])
                    
    result['new_myv'] = new_myv
    result['max_myv_score'] = max_myv_score

    if urlparse(url).netloc not in white_domains and max_myv_score < max_myv_threshold:
        return result

    new_urls = []
    soup = BeautifulSoup(html)
    for a in soup.findAll('a'):
        if not a.has_attr('href'):
            # skip anchors
            continue
        href = a['href']
        if href.startswith('#'):
            continue
        if 'javascript' in href:
            continue
        if any(d in href for d in bad_domains):
            continue # because we parse Wikipedia separately
        if href.split('.')[-1].lower() in bad_extensions:
            continue
        new_url = urljoin(url, href).split('#')[0]
        if new_url not in downloaded_pages:
            new_urls.append(new_url)
    
    result['new_urls'] = new_urls
            
    return result

In [12]:
from collections import Counter, defaultdict

In [13]:
downloaded_pages = dict()
downloaded_myv = dict()
urls_queue = set(seeds)
domain_attempts = Counter()
domain_successes = Counter()
all_myv = set()

In [14]:
def get_state():
    state = {
        'downloaded_pages': downloaded_pages, 
        'downloaded_myv': {k: list(v) for k, v in downloaded_myv.items()},
        'urls_queue': list(urls_queue),
        'domain_attempts': domain_attempts,
        'domain_successes': domain_successes,
        'all_myv': list(all_myv)
    }
    return state
get_state().keys()

dict_keys(['downloaded_pages', 'downloaded_myv', 'urls_queue', 'domain_attempts', 'domain_successes', 'all_myv'])

In [15]:
# try to increase success rate by looking at more promising domains first: from 2% to 10-30%

def get_new_tasks(n=1000, max_fraction=0.2):
    domain2tasks = defaultdict(list)
    for task in urls_queue:
        domain2tasks[urlparse(task).netloc].append(task)
    success_rates = sorted([((domain_successes[d] + 1) / (domain_attempts[d] + 2), d) for d in domain2tasks], reverse=True)
    # print(success_rates[:20])
    tasks = []
    max_k = int(n * max_fraction)
    for _, d in success_rates:
        tasks.extend(domain2tasks[d][:max_k])
        if len(tasks) > n:
            break
    return tasks[:n]

In [16]:
# from concurrent.futures import ThreadPoolExecutor, TimeoutError
from multiprocessing.pool import ThreadPool
import gc

In [17]:
for epoch in trange(0):
    if not urls_queue:
        print('QUEUE EMPTY')
        break
    
    # tasks = [urls_queue.pop() for _ in range(min(100, len(urls_queue)))]
    # tasks = [u for i, u in zip(range(1000), urls_queue)]  # without removal
    tasks = get_new_tasks()
    success_count = 0
    # todo: try the most FERTILE domain names first. 
    executor = ThreadPool(100)

    for result in executor.map(process_url, tasks):
        url = result['url']
        urls_queue.remove(url)
        if result.get('html'):
            #downloaded_pages[url] = result['html']
            pass # save memory, and save html only of pages with myv content
        domain = urlparse(url).netloc
        domain_attempts[domain] += 1
        if result.get('max_myv_score', 0) > 0.99:
            domain_successes[domain] += 1
            success_count += 1
        if result.get('new_myv'):
            all_myv.update(result['new_myv'])
            downloaded_pages[url] = result['html']
            downloaded_myv[url] = result['new_myv']
        if result.get('new_urls'):
            # print(url, 'got new')
            for new_url in result['new_urls']:
                if new_url not in downloaded_pages:
                    urls_queue.add(new_url)
    print(epoch, len(downloaded_myv), len(all_myv), len(urls_queue), success_count / len(tasks))
    gc.collect()
    
    with open('other_sources/misс_websites_raw_texts_v2.json', 'w') as f:
        json.dump({k: list(v) for k, v in downloaded_myv.items()}, f, ensure_ascii=False, indent=2)
    with open('other_sources/misс_websites_state_v2.json', 'w') as f:
        json.dump(get_state(), f, ensure_ascii=False, indent=2)

0it [00:00, ?it/s]

In [81]:
for epoch in trange(1000):
    if not urls_queue:
        print('QUEUE EMPTY')
        break
    
    # tasks = [urls_queue.pop() for _ in range(min(100, len(urls_queue)))]
    # tasks = [u for i, u in zip(range(1000), urls_queue)]  # without removal
    tasks = get_new_tasks()
    success_count = 0
    # todo: try the most FERTILE domain names first. 
    #executor = ThreadPool(100)

    #for result in executor.map(process_url, tasks):
    for task in tqdm(tasks):
        result = process_url(task)
        url = result['url']
        urls_queue.remove(url)
        if result.get('html'):
            #downloaded_pages[url] = result['html']
            pass # save memory, and save html only of pages with myv content
        domain = urlparse(url).netloc
        domain_attempts[domain] += 1
        if result.get('max_myv_score', 0) > 0.99:
            domain_successes[domain] += 1
            success_count += 1
        if result.get('new_myv'):
            all_myv.update(result['new_myv'])
            downloaded_pages[url] = result['html']
            downloaded_myv[url] = result['new_myv']
        if result.get('new_urls'):
            # print(url, 'got new')
            for new_url in result['new_urls']:
                if new_url not in downloaded_pages:
                    urls_queue.add(new_url)
    print(epoch, len(downloaded_myv), len(all_myv), len(urls_queue), success_count / len(tasks))
    gc.collect()
    
    with open('other_sources/misс_websites_raw_texts_v2.json', 'w') as f:
        json.dump({k: list(v) for k, v in downloaded_myv.items()}, f, ensure_ascii=False, indent=2)
    with open('other_sources/misс_websites_state_v2.json', 'w') as f:
        json.dump(get_state(), f, ensure_ascii=False, indent=2)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [85]:
print(task)

http://s0.radioheart.ru:8000/RH40018


In [94]:
resp = urlopen(task)
resp

<eventlet.green.http.client.HTTPResponse at 0x1d3e3f853d0>

In [100]:
resp.headers.as_string()

'Content-Type: audio/mpeg\nDate: Mon, 27 Jun 2022 16:11:17 GMT\nicy-br: 128\nicy-pub: 0\nServer: Icecast 2.4.0-kh15\nCache-Control: no-cache, no-store\nExpires: Mon, 26 Jul 1997 05:00:00 GMT\nConnection: Close\nAccess-Control-Allow-Origin: *\nAccess-Control-Allow-Headers: Origin, Accept, X-Requested-With, Content-Type, Icy-MetaData\nAccess-Control-Allow-Methods: GET, OPTIONS, SOURCE, PUT, HEAD, STATS\n\n'

In [97]:
??resp

In [87]:
httpx.get(task, timeout=5)

KeyboardInterrupt: 

In [86]:
process_url(task)

KeyboardInterrupt: 

In [80]:
print(epoch, len(downloaded_myv), len(all_myv), len(urls_queue), success_count / len(tasks))

0 1442 23641 6044 0.003


In [69]:
import random
random.choice(list(all_myv))

'На память сделала два снимка:'

```
0   8  452  403 0.7
1 305 5278 2972 0.4317617866004963
2 872 11079 4797 0.191
```

```
2353 27570 12839 0.105
2686 34365 14455 0.128
3059 37420 17798 0.194
3739 44449 22486 0.529
4370 50168 23760 0.416
4818 53938 25097 0.324
5433 58765 28692 0.283
5857 61667 29971 0.294
6227 63838 29734 0.242
6629 65697 30020 0.251
7102 68802 30689 0.295
7482 71169 30943 0.275
7884 73405 30985 0.285
8179 75322 30822 0.241
```

In [None]:
gc.collect()

In [None]:
1

In [71]:
print(len(downloaded_pages), len(downloaded_myv), len(urls_queue), len(all_myv))

15683 15683 687731 55667


In [65]:
max(len(v) for v in downloaded_pages.values())

146048030

In [66]:
ks = {
    k for k, v in downloaded_pages.items() 
    if k not in downloaded_myv 
    # and k.endswith('.rtf')
}
for k in ks:
    del downloaded_pages[k]

In [67]:
ks

set()

In [68]:
for k, v in downloaded_pages.items():
    if k not in downloaded_myv and len(v) >= 1_048_030:
        print(k, len(v))

In [52]:
for k, v in downloaded_pages.items():
    if len(v) >= 146048030:
        print(k, len(v))

http://www.e-mordovia.ru/upload/iblock/63b/akt-avariynye-derevya-polnostyu_compressed.pdf.sig 146048030


myv_lines = sorted(all_myv)

In [390]:
import random
random.choice(myv_lines)

'Лекарь! Медик! Врач!\r'

In [498]:
with open('other_sources/mis_websites_raw_texts_v2.json', 'w') as f:
    json.dump({k: list(v) for k, v in downloaded_myv.items()}, f, ensure_ascii=False, indent=2)

In [401]:
for _ in range(10):
    print(random.choice(list(urls_queue)))

https://my.mail.ru/music/songs/starecase-see-timo-maas-mix-e30fd0ca0ed78371248aea6985c96be8
https://new.znanium.com/catalog/authors/books?ref=d7acdf96-d910-11e4-9a4d-00237dd2fde4
https://my.mail.ru/music/songs/кристина-орбакайте-каждый-день-с-тобой-11941be36d42777a298486125501247d
https://my.mail.ru/mail/nadezhda.paramonova.54/photo/_cover/9.html
https://my.mail.ru/mail/miss.stanovenkova/
https://my.mail.ru/music/songs/евгений-дога-евгений-дога-черная-вуаль-5fd9c8b6e9f4e3bd9f2a17a8e16986eb
http://merjamaa.ru/news/zakhoronenija_finno_ugrov/2017-07-08-1263
https://r.mail.ru/n368728622?&rnd=948999074
https://tavda.bezformata.com/realty/
http://ok.ru/dk?cmd=lang&st.ignoreRedirect=on&st.lang=tt&st.cmd=anonymMusicArtist&st.id=13728229155470&st._aid=Toolbar_ChangeLang


In [402]:
from collections import Counter

In [403]:
Counter([urlparse(u).netloc for u in urls_queue if not any(bd in u for bd in bad_domains)]).most_common(30)

[('my.mail.ru', 381276),
 ('ok.ru', 90034),
 ('r.mail.ru', 24302),
 ('www.sports.ru', 10650),
 ('www.mirea.ru', 9048),
 ('new.znanium.com', 6756),
 ('click.my.mail.ru', 5354),
 ('www.e-mordovia.ru', 4965),
 ('www.blogger.com', 3990),
 ('kino.mail.ru', 3562),
 ('erzan.ru', 3327),
 ('www.culture.ru', 3118),
 ('rostland.blogspot.com', 2946),
 ('vk.com', 2565),
 ('dobro.ru', 2233),
 ('tm.spbstu.ru', 2220),
 ('forum.ucoz.ru', 2210),
 ('www.spmi.ru', 1902),
 ('stroi.mos.ru', 1768),
 ('ss69100.livejournal.com', 1722),
 ('pedsovet.su', 1582),
 ('www.mordgpi.ru', 1564),
 ('merjamaa.ru', 1121),
 ('u.to', 1097),
 ('', 1094),
 ('evolution-march.livejournal.com', 950),
 ('podmoskovye.bezformata.com', 941),
 ('krasnodar.bezformata.com', 930),
 ('us.reasonable.shop', 885),
 ('cheboksari.bezformata.com', 861)]

In [405]:
Counter([urlparse(u).netloc for u in downloaded_myv if not any(bd in u for bd in bad_domains)]).most_common(30)

[('my.mail.ru', 7591),
 ('ok.ru', 1799),
 ('erziana.my1.ru', 462),
 ('erzinform.ucoz.ru', 343),
 ('new.znanium.com', 324),
 ('erzan.ru', 274),
 ('forum.ucoz.ru', 266),
 ('www.sports.ru', 253),
 ('merjamaa.ru', 170),
 ('www.culture.ru', 167),
 ('rostland.blogspot.com', 121),
 ('pedsovet.su', 112),
 ('tm.spbstu.ru', 93),
 ('www.mirea.ru', 88),
 ('www.e-mordovia.ru', 85),
 ('goloserzi.ru', 75),
 ('www.mordgpi.ru', 68),
 ('kino.mail.ru', 68),
 ('vk.com', 67),
 ('old.goloserzi.ru', 65),
 ('www.spmi.ru', 57),
 ('dobro.ru', 53),
 ('ispu.ru', 42),
 ('stroi.mos.ru', 36),
 ('istu.ru', 33),
 ('kostromka.ru', 32),
 ('www.merjamaa.ru', 29),
 ('uguide.ru', 28),
 ('rutube.ru', 28),
 ('merjamaa.ucoz.ru', 27)]

# Extract texts

Confidence: 20% is false positives, 35-37 is as well, 50% as well. Good Erzyan starts typically with 90+ confidence.

- 0.5-0.6 total FP
- 0.6-0.7 FP with very rare exceptions
- 0.7-0.8 same
- 0.8-0.9 same
- 0.9-0.95 same
- 0.95-0.96 same
- 0.96-0.97 same
- 0.97-0.98 many TP, but precision < 50%
- 0.98-0.99 precision somewhat less than 50%, but with better scraping it is going to improve. 

In [818]:
len(downloaded_myv)

15683

In [869]:
import pandas as pd

In [875]:
pd.options.display.max_colwidth = 300

In [870]:
scored = []

for url, lines in downloaded_myv.items():
    preds = [LD.predict_lang(text) for text in lines]
    max_myv = max(p['myv'] for p in preds)
    scored.append({'url': url, 'lines': lines, 'preds': preds, 'max_myv': max_myv})

scored = pd.DataFrame(scored)

In [890]:
f = (scored.max_myv > 0.99) & (scored.max_myv < 0.995)
print(sum(f))
scored[f].sample(1)

128


Unnamed: 0,url,lines,preds,max_myv
6282,http://ok.ru/music/album/13412948300621,{Осень 1},"[{'myv': 0.9916972517967224, 'udm': 0.005492622032761574, 'mdf': 0.002623192500323057, 'kk': 5.3013747674413025e-05, 'kv': 4.9871723604155704e-05, 'cv': 4.85976088384632e-05, 'uk': 4.161096876487136e-05, 'mn': 3.528898014337756e-05, 'koi': 2.4576333089498803e-05, 'ce': 1.732316923153121e-05, 'ru...",0.991697


In [891]:
scored['domain'] = scored.url.apply(lambda u: urlparse(u).netloc)

In [900]:
import numpy as np
def gq(val, q=0.99):
    return np.sum(val>q)

In [902]:
scored.groupby('domain').max_myv.aggregate([len, np.mean, sum, gq]).sort_values('gq', ascending=False).head(20)

Unnamed: 0_level_0,len,mean,sum,gq
domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
erziana.my1.ru,462,0.902519,416.963973,303
my.mail.ru,7591,0.505735,3839.031592,97
erzinform.ucoz.ru,343,0.466978,160.17356,86
erzan.ru,274,0.70051,191.939794,81
ok.ru,1799,0.55469,997.886861,67
goloserzi.ru,75,0.803992,60.299437,27
old.goloserzi.ru,65,0.51752,33.638801,11
lazalyk.narod.ru,18,0.881573,15.868318,9
uralistica.com,13,0.811112,10.544452,8
vk.com,67,0.592437,39.69326,7
