In [4]:
import json, glob
from textblob import TextBlob
from tqdm import tqdm
from collections import Counter
import numpy as np
from pathlib import Path

Document dict contatins a list of dicts:
* content, retrieved_by, id
* meta: {text, doc_title, split_size, split_id, SPaR_labels, filtered_SPaR_labels, cluster_filtered, cluster_neighbours}

* Do I want to count tokens?

In [27]:
def count_labels(counter, len_counter, labels):
    for label in labels:
        counter[label] += 1
        len_counter[len(TextBlob(label).words)] += 1
    return counter, len_counter

def process_doc(doc_path, doc_titles_seen, doc_count, doc_pages_for_title, doc_title_count, passage_cnt, u_passage_cnt, passage_len_cnt,\
            sent_cnt, sent_len_cnt, word_cnt, word_len_cnt, spar_cnt, spar_word_len_cnt, f_spar_cnt, \
            f_spar_word_len_cnt, nn_cnt, nn_word_len_cnt, f_nn_cnt, f_nn_word_len_cnt):
    with open(doc_path, 'r') as f:
        passages = [json.loads(l) for l in f.readlines()]
        page_nrs = []
        for passage in passages:
            doc_title = passage['meta']['doc_title']
            if doc_title in doc_titles_seen:
                raise ValueError(f"Document title already seen: {doc_title}\n Document: {doc_path}\n") 
                
            page_nr = int(passage['id'].rsplit('##', 2)[1])
            doc_title = passage['meta']['doc_title']
            text = passage['content']
            spar = [l for l in passage['meta']['NER_labels'][1:-1].split(", ") if l]
            f_spar = [l for l in passage['meta']['filtered_NER_labels'][1:-1].split(", ") if l]
            cluster = [l for l in passage['meta']['filtered_NER_labels_domains'][1:-1].split(", ") if l]
            f_cluster = [l for l in passage['meta']['neighbours'][1:-1].split(", ") if l]

            blob = TextBlob(text)
            sents = blob.sentences
            # during indexing passages are cut off to be 100 words max
            sent_word_lens = []
            for sent in sents:
                sent_word_lens.append(len(sent.words)) if len(sent.words)  < 100 else sent_word_lens.append(100)
            words = [str(w) for w in blob.words]

            # counting
            doc_title_count[doc_title] += 1
            passage_cnt += 1
            u_passage_cnt[text] += 1
            passage_len_cnt[len(text)] += 1


            for sent in sents:
                sent_cnt[sent] += 1

            for num_words in sent_word_lens:
                sent_len_cnt[num_words] += 1

            for w in words:
                word_cnt[w] += 1
                word_len_cnt[len(w)] += 1

            spar_cnt, spar_word_len_cnt = count_labels(spar_cnt, spar_word_len_cnt, spar)
            f_spar_cnt, f_spar_word_len_cnt = count_labels(f_spar_cnt, f_spar_word_len_cnt, f_spar)
            nn_cnt, nn_word_len_cnt = count_labels(nn_cnt, nn_word_len_cnt, cluster)
            f_nn_cnt, f_nn_word_len_cnt = count_labels(f_nn_cnt, f_nn_word_len_cnt, f_cluster)
                
        page_nrs.append(page_nr)
        doc_pages_for_title[max(page_nrs)] += 1
        doc_titles_seen.append(doc_title)

    return doc_count, doc_pages_for_title, doc_title_count, passage_cnt, u_passage_cnt, passage_len_cnt,\
            sent_cnt, sent_len_cnt, word_cnt, word_len_cnt, spar_cnt, spar_word_len_cnt, f_spar_cnt, \
            f_spar_word_len_cnt, nn_cnt, nn_word_len_cnt, f_nn_cnt, f_nn_word_len_cnt, doc_titles_seen


def statistics_per_split(converted_files_directory):
    # grab processed .json files

    documents = [x for x in converted_files_directory.glob("*.json")]

    # init counters
    doc_count = 0
    duplicates = []
    doc_title_count = Counter()
    doc_pages_for_title = Counter()
    
    passage_cnt = 0 
    u_passage_cnt = Counter() 
    passage_len_cnt = Counter()
    sent_cnt = Counter()
    sent_len_cnt = Counter()
    word_cnt = Counter()
    word_len_cnt = Counter()
    
    spar_cnt = Counter()
    spar_word_len_cnt = Counter()
    f_spar_cnt = Counter()
    f_spar_word_len_cnt = Counter()
    nn_cnt = Counter()
    nn_word_len_cnt = Counter()
    f_nn_cnt = Counter()
    f_nn_word_len_cnt = Counter()
    

    
    doc_titles_seen = []
    for doc_path in tqdm(documents):
        doc_count += 1
        try:
            doc_count, doc_pages_for_title, doc_title_count, passage_cnt, u_passage_cnt, passage_len_cnt,\
            sent_cnt, sent_len_cnt, word_cnt, word_len_cnt, spar_cnt, spar_word_len_cnt, f_spar_cnt, \
            f_spar_word_len_cnt, nn_cnt, nn_word_len_cnt, f_nn_cnt, f_nn_word_len_cnt, doc_titles_seen = process_doc(doc_path, doc_titles_seen, doc_count, doc_pages_for_title, doc_title_count, passage_cnt, u_passage_cnt, passage_len_cnt,
                                        sent_cnt, sent_len_cnt, word_cnt, word_len_cnt, spar_cnt, spar_word_len_cnt, f_spar_cnt, 
                                        f_spar_word_len_cnt, nn_cnt, nn_word_len_cnt, f_nn_cnt, f_nn_word_len_cnt)
        except ValueError:
            # doc_title already seen
            duplicates.append(doc_path)
            continue
    
    print(f"Skipped {len(duplicates)} duplicates")
    return doc_count, doc_pages_for_title, doc_title_count, passage_cnt, u_passage_cnt, passage_len_cnt,\
            sent_cnt, sent_len_cnt, word_cnt, word_len_cnt, spar_cnt, spar_word_len_cnt, f_spar_cnt, \
            f_spar_word_len_cnt, nn_cnt, nn_word_len_cnt, f_nn_cnt, f_nn_word_len_cnt, duplicates

In [28]:
directory = Path("datavolume/ir_data/foreground_pdf_converted/")

doc_count, doc_pages_for_title, doc_title_count, passage_cnt, u_passage_cnt, passage_len_cnt, \
            sent_cnt, sent_len_cnt, word_cnt, word_len_cnt, spar_cnt, spar_word_len_cnt, f_spar_cnt, \
            f_spar_word_len_cnt, nn_cnt, nn_word_len_cnt, f_nn_cnt, f_nn_word_len_cnt, duplicates = statistics_per_split(directory)

100%|██████████████████████████████████████████████████████████████| 420/420 [44:14<00:00,  6.32s/it]


Skipped 0 duplicates


In [29]:
doc_count

420

In [30]:
page_lengths = [v for page_lens, c in doc_pages_for_title.items() for v in [page_lens]*c]
print("nr of documents: {}".format(sum(doc_pages_for_title.values())))
print("% of documents ≥ 100 pages: {:.2f}%".format(
    (len([v for v in page_lengths if v > 100]) / len(page_lengths))*100))
print("Mean doc length (pages): {:.2f}".format(np.mean(page_lengths)))
print("Standard deviation length (pages): {:.2f}".format(np.std(page_lengths)))
print("Shortest doc (pages): {}".format(np.min(page_lengths)))
print("Longest doc (pages): {}".format(np.max(page_lengths)))

nr of documents: 420
% of documents ≥ 100 pages: 14.29%
Mean doc length (pages): 63.71
Standard deviation length (pages): 64.57
Shortest doc (pages): 3
Longest doc (pages): 548


In [31]:
print(f"Passages: {passage_cnt} ({sum(u_passage_cnt.values())} unique) passages found in {doc_count} documents")
print(len(doc_title_count.keys()), "unique doc titles and total: ", sum(doc_title_count.values()))

print(f"Expected nr of words (100 * passages): {passage_cnt * 100}")


Passages: 287876 (287876 unique) passages found in 420 documents
420 unique doc titles and total:  287876
Expected nr of words (100 * passages): 28787600


In [36]:
# sentences
sent_lenghts = [v for sent_lens, c in sent_len_cnt.items() for v in [sent_lens]*c if v > 0]
print("# Sentences: {}".format(len(sent_lenghts)))
print("Unique sentences: {}".format(len(sent_cnt.keys())))
cut_off_len = 80
print("% of sents ≥ {} words: {:.2f}%".format(cut_off_len,
    (len([v for v in sent_lenghts if v > cut_off_len]) / len(sent_lenghts)) *100))

print("Mean sentence length (words): {:.2f}".format(np.mean(sent_lenghts)))
print("Standard deviation length (words): {:.2f}".format(np.std(sent_lenghts)))
print("Shortest sentence: {}".format(np.min(sent_lenghts)))
print("Longest sentence: {}".format(np.max(sent_lenghts)))

# Sentences: 741180
Unique sentences: 213703
% of sents ≥ 80 words: 1.44%
Mean sentence length (words): 25.24
Standard deviation length (words): 17.52
Shortest sentence: 1
Longest sentence: 100


In [33]:
# words
print(f"Vocabulary size: {len(word_cnt.keys())}, total nr of `words`: {sum(word_cnt.values())}")
word_lenghts = [v for word_lens, c in word_len_cnt.items() for v in [word_lens]*c]
cut_off_len = 10
print("% of words ≥ {} characters: {:.2f}%".format(cut_off_len,
    (sum([v for v in word_lenghts if v > cut_off_len]) / sum(word_lenghts))* 100))
print("Mean word length (chars): {:.2f}".format(np.mean(word_lenghts)))
print("Standard deviation length (chars): {:.2f}".format(np.std(word_lenghts)))
print("Shortest word (chars): {}".format(np.min(word_lenghts)))
print("Longest word (chars): {}".format(np.max(word_lenghts)))

Vocabulary size: 121161, total nr of `words`: 18712290
% of words ≥ 10 characters: 15.68%
Mean word length (chars): 5.14
Standard deviation length (chars): 3.38
Shortest word (chars): 1
Longest word (chars): 293


In [38]:
def print_counts(label_name:str, cnt:Counter, word_len_cnt:Counter):
    counts = [v for label_len, c in word_len_cnt.items() for v in [label_len]*c if v > 0]
    
    print(f"{label_name} vocab size: {len(cnt.keys())}, total nr of labels: {sum(cnt.values())}")
    print("Mean length (words): {:.2f}".format(np.mean(counts)))
    print("Standard deviation length (words): {:.2f}".format(np.std(counts)))
    print("Shortest label (words): {}".format(np.min(counts)))
    print("Longest label (words: {}\n".format(np.max(counts)))
    

In [39]:
# label lengths
print_counts("NER", spar_cnt, spar_word_len_cnt)
print_counts("filtered NER", f_spar_cnt, f_spar_word_len_cnt)
print_counts("domain NER", nn_cnt, nn_word_len_cnt)
print_counts("NNs", f_nn_cnt, f_nn_word_len_cnt)

NER vocab size: 568926, total nr of labels: 5840598
Mean length (words): 2.58
Standard deviation length (words): 3.52
Shortest label (words): 1
Longest label (words: 340

filtered NER vocab size: 42258, total nr of labels: 1629792
Mean length (words): 1.27
Standard deviation length (words): 0.53
Shortest label (words): 1
Longest label (words: 9

domain NER vocab size: 45894, total nr of labels: 1278640
Mean length (words): 1.27
Standard deviation length (words): 0.53
Shortest label (words): 1
Longest label (words: 9

NNs vocab size: 46287, total nr of labels: 3307190
Mean length (words): 1.48
Standard deviation length (words): 0.62
Shortest label (words): 1
Longest label (words: 9



In [45]:
len([k for k, v in nn_cnt.items() if k.strip()])

45893

In [46]:
len([k for k, v in f_spar_cnt.items() if k.strip()])

42257

In [35]:
print(len([d for d in duplicates]))
[print(d.stem) for d in duplicates]    # these are all the vocabularies... did I remove those?

0


[]