**Подсчет статитстики по выгруженным документам. Смотрим на ревизии подряд. Смотрим только добавленные документы**

In [1]:
from wikiparser_utils import WikiXMLDump, WikiPage
import os
import nltk
import json
import numpy as np
from tqdm.notebook import tqdm
from difflib import Differ 
from utils.difflibparser import DifflibParser, DiffCode
import html2text
import os
import time
from os import listdir
from os.path import isfile, join

In [2]:
dump = WikiXMLDump('data/history.xml')

In [3]:
DOCS_DIR = 'data/documents'
DOCS_MAPPER_DIR = 'data/documents_mapper'

if not os.path.exists(DOCS_MAPPER_DIR):
    os.makedirs(DOCS_MAPPER_DIR)
    
if not os.path.exists(DOCS_DIR):
    os.makedirs(DOCS_DIR)
    

In [4]:
def links_diff(links_old, links_new):
    '''
    Возращает добавленные ссылки
    '''
    set_old = set(links_old)
    set_new = set(links_new)
    set_diff = set_new - set_old
    set_diff2 = set_old - set_new
    return list(set_diff)

def text_diff(old_rev, new_rev, sentence_tokenizer=nltk.sent_tokenize, epsilon=2, beta=2*2):
    '''
    Бьет текст страницы на предложения (с помощью sentence_tokenizer)
    Берем абзацы из epsilon * 2 + 1 предложений
    Смотрим различия в абзацах. Возможна ситуация, что diff-абзацы пересекаются, на этот случай есть параметр beta
    Следующий diff абзац не раньше чем через beta абзацей
    '''
    d = Differ()
    old_text = old_rev.get_plain_text()
    new_text = new_rev.get_plain_text()
    
    sent_text_old = sentence_tokenizer(old_text)
    sent_text_new = sentence_tokenizer(new_text)
    
    old_sents = []
    for i in range(epsilon, len(sent_text_old) - epsilon):
        old_sents.append(' '.join(sent_text_old[(i-epsilon):(i+epsilon+1)]))
    if not old_sents:
        old_sents = [' '.join(sent_text_old)]
        
    new_sents = []
    for i in range(epsilon, len(sent_text_new) - epsilon):
        new_sents.append(' '.join(sent_text_new[(i-epsilon):(i+epsilon+1)]))
    if not new_sents:
        new_sents = [' '.join(sent_text_new)]
        
    
    result = []
    dif_result = list(DifflibParser(old_sents, new_sents))
    old_text, new_text, last_diff_id = [], [], -1000
    for dif_id, dif_line in enumerate(dif_result):
        if dif_line['code'] != DiffCode.SIMILAR:
            if np.abs(dif_id - last_diff_id) > beta:
                result.append(dif_line)  
                last_diff_id = dif_id
    return result

def get_changes(diffs):
    """
    Извлекаем текст различия абзацей
    """
    all_changes = {}
    for diff_id, diff_obj in enumerate(diffs):
        if diff_obj['code'] == DiffCode.RIGHTONLY:
            all_changes[diff_id] = ([diff_obj['line']], 'r')
        elif diff_obj['code'] == DiffCode.LEFTONLY:
            all_changes[diff_id] = ([diff_obj['line']], 'l')
        elif diff_obj['code'] == DiffCode.CHANGED:
            r_change = diff_obj['rightchanges']
            cur_ch = -10
            prev_ch = -10
            all_r_changes = []
            for ch in r_change:
                if prev_ch < 0:
                    prev_ch = ch
                    cur_ch = ch
                if np.abs(ch - cur_ch) > 1:
                    new_change = diff_obj['newline'][prev_ch:cur_ch+1]
                    all_r_changes.append(new_change)
                    prev_ch = ch
                cur_ch = ch
            new_change = diff_obj['newline'][prev_ch:cur_ch+1]
            all_r_changes.append(new_change)
            all_changes[diff_id] = (all_r_changes, 'c')
    return all_changes

In [5]:
dirs = sorted(os.listdir('data/pages'))

In [None]:
STATISTICS = {}
pbar = tqdm(dirs, total=len(dirs), desc='Statistics processing', leave=True, position=0)
for page_name_idx, page_name in enumerate(pbar):

    page_revisions = WikiPage().load_revisions(f"data/pages/{page_name}")
    rev_list = sorted(list(map(int, page_revisions)))
    
    STATISTICS[page_name] = {}    
    STATISTICS[page_name]['num of revisions'] = len(rev_list)
    
    page_docs_mapper_id2link = json.load(open(f'{DOCS_MAPPER_DIR}/{page_name}/id2link.json', "r"))
    page_docs_mapper_link2id = json.load(open(f'{DOCS_MAPPER_DIR}/{page_name}/link2id.json', "r"))
    STATISTICS[page_name]['num of document links'] = len(page_docs_mapper_link2id)
    
    STATISTICS[page_name]['diff info'] = {}
    
    for idx, (prev_rev_num, new_rev_num) in enumerate(zip(rev_list, rev_list[1:])):
        new_rev = page_revisions[new_rev_num]
        prev_rev = page_revisions[prev_rev_num]
        comment = new_rev.comment
            
        new_links, _ = new_rev.get_links()
        prev_links, _ = prev_rev.get_links()
        diff_links = links_diff(prev_links, new_links)
        
        STATISTICS[page_name]['diff info'][idx] = {}
        comment_exists = 1 if comment else 0
        STATISTICS[page_name]['diff info'][idx]['comment exists'] = comment_exists
        
        num_of_docs = [1 for name_link, url_link in diff_links if url_link in page_docs_mapper_link2id]
        STATISTICS[page_name]['diff info'][idx]['num of docs'] = len(num_of_docs)
        
        
        rev_diff = text_diff(prev_rev, new_rev)
        STATISTICS[page_name]['diff info'][idx]['num of diff abstracts'] = len(rev_diff)
        text_changes = get_changes(rev_diff)
        
        docs_has_changes_flags = [0 for i in range(len(diff_links))]
        change_in_doc_flags = [0 for i in range(len(text_changes))]
        
        any_docs_has_changes_flags = [0 for i in range(len(diff_links))]
        any_change_in_doc_flags = [0 for i in range(len(text_changes))]
        
        for diff_doc_id, (_, url_link) in enumerate(diff_links):
            if url_link not in page_docs_mapper_link2id:
                continue
            link_id = int(page_docs_mapper_link2id[url_link])
            file_text = ''
            if os.path.exists(f'{DOCS_DIR}/{page_name}/{link_id}.txt'):
                with open(f'{DOCS_DIR}/{page_name}/{link_id}.txt', 'r', encoding='utf-8') as f:
                    file_text = f.read()
                                
            for ch_num, (_, change_info) in enumerate(text_changes.items()):
                ALL_CHANGES_FOUND = 1
                ANY_CHANGES_FOUND = 0
                change_arr_text, change_status = change_info
                for change_text in change_arr_text:
                    if change_text not in file_text:
                        ALL_CHANGES_FOUND = 0
                    else:
                        ANY_CHANGES_FOUND = 1
                        
                docs_has_changes_flags[diff_doc_id] = ALL_CHANGES_FOUND
                change_in_doc_flags[ch_num] = ALL_CHANGES_FOUND
                
                any_docs_has_changes_flags[diff_doc_id] = ANY_CHANGES_FOUND
                any_change_in_doc_flags[ch_num] = ANY_CHANGES_FOUND
                    
        STATISTICS[page_name]['diff info'][idx]['ALL num of indep good diffs'] = sum(change_in_doc_flags)
        STATISTICS[page_name]['diff info'][idx]['ALL num of indep good docs'] = sum(docs_has_changes_flags)
        STATISTICS[page_name]['diff info'][idx]['ALL is perfect'] = 0
        if sum(change_in_doc_flags) == len(change_in_doc_flags):
            STATISTICS[page_name]['diff info'][idx]['ALL is perfect'] = 1
            
        STATISTICS[page_name]['diff info'][idx]['ANY num of indep good diffs'] = sum(any_change_in_doc_flags)
        STATISTICS[page_name]['diff info'][idx]['ANY num of indep good docs'] = sum(any_docs_has_changes_flags)
        STATISTICS[page_name]['diff info'][idx]['ANY is perfect'] = 0
        if sum(any_change_in_doc_flags) == len(any_change_in_doc_flags):
            STATISTICS[page_name]['diff info'][idx]['ANY is perfect'] = 1
        

Statistics processing:   0%|          | 0/925 [00:00<?, ?it/s]

In [7]:
len(STATISTICS)

925

In [8]:
docs2page = []
num_rev2page = []
docs2diff = []
diffs2dif = []
comment_exists = []
all_diff = []

all_perf2diff = []
any_perf2diff = []
normal = []
super_good = []

for pn in STATISTICS:
    docs2page.append(STATISTICS[pn]['num of document links'])
    num_rev2page.append(STATISTICS[pn]['num of revisions'])
    
    
    COMMENT_EXISTS = 0
    NUM_OF_DOCS = 0
    NUM_OF_DIFF_ABSTRACT = 0
    ALL_GOOD = 0
    ANY_GOOD = 0
    NORMAL = 0
    SUPER = 0
    for _, val in STATISTICS[pn]['diff info'].items():
        COMMENT_EXISTS += val['comment exists']
        NUM_OF_DOCS += val['num of docs']
        NUM_OF_DIFF_ABSTRACT += val['num of diff abstracts']
        ALL_GOOD += val['ALL is perfect']
        ANY_GOOD += val['ANY is perfect']
        if val['comment exists'] and val['ANY is perfect']:
            NORMAL += 1
        if val['comment exists'] and val['ALL is perfect']:
            SUPER += 1
            
        all_diff.append(1)
    
    normal.append(NORMAL)
    super_good.append(SUPER)
    docs2diff.append(NUM_OF_DOCS)
    diffs2dif.append(NUM_OF_DIFF_ABSTRACT)
    comment_exists.append(COMMENT_EXISTS)
    all_perf2diff.append(ALL_GOOD)
    any_perf2diff.append(ANY_GOOD)

In [9]:
print('DOCS PER PAGE: ', sum(docs2page), '/', len(STATISTICS), '=', sum(docs2page) / len(STATISTICS))
print('REVS PER PAGE: ', sum(num_rev2page), '/', len(STATISTICS), '=', sum(num_rev2page) / len(STATISTICS))
print()
print('DOCS PER DIFF: ', sum(docs2diff), '/', sum(all_diff), '=', sum(docs2diff) / sum(all_diff))
print('DIFFS PER DIFF: ', sum(diffs2dif), '/', sum(all_diff), '=', sum(diffs2dif) / sum(all_diff))
print('COMMS PER DIFF: ', sum(comment_exists), '/', sum(all_diff), '=', sum(comment_exists) / sum(all_diff))
print('ALL PER DIFF: ', sum(all_perf2diff), '/', sum(all_diff), '=', sum(all_perf2diff) / sum(all_diff))
print('ANY PER DIFF: ', sum(any_perf2diff), '/', sum(all_diff), '=', sum(any_perf2diff) / sum(all_diff))
print('NORMAL DIFF: ', sum(normal), '/', sum(all_diff), '=', sum(normal) / sum(all_diff))
print('SUPER DIFF: ', sum(super_good), '/', sum(all_diff), '=', sum(super_good) / sum(all_diff))

DOCS PER PAGE:  1787 / 925 = 1.931891891891892
REVS PER PAGE:  69286 / 925 = 74.90378378378378

DOCS PER DIFF:  2458 / 68361 = 0.035956173841810386
DIFFS PER DIFF:  88661 / 68361 = 1.2969529410043739
COMMS PER DIFF:  53065 / 68361 = 0.7762466903643891
ALL PER DIFF:  15282 / 68361 = 0.2235485145038838
ANY PER DIFF:  15350 / 68361 = 0.22454323371512996
NORMAL DIFF:  11117 / 68361 = 0.16262196281505537
SUPER DIFF:  11057 / 68361 = 0.16174426939336756
