In [1]:
import json, glob
from textblob import TextBlob
from tqdm import tqdm
from collections import Counter
import numpy as np

Document dict contatins a list of dicts:
* content, retrieved_by, id
* meta: {text, doc_title, split_size, split_id, SPaR_labels, filtered_SPaR_labels, cluster_filtered, cluster_neighbours}

* Do I want to count tokens?

In [6]:
def count_labels(counter, len_counter, labels):
    for label in labels:
        counter[label] += 1
        len_counter[len(TextBlob(label).words)] += 1
    return counter, len_counter

def process_doc(doc_path, doc_titles_seen, doc_count, doc_pages_for_title, doc_title_count, passage_cnt, u_passage_cnt, passage_len_cnt,\
            sent_cnt, sent_len_cnt, word_cnt, word_len_cnt, spar_cnt, spar_word_len_cnt, f_spar_cnt, \
            f_spar_word_len_cnt, nn_cnt, nn_word_len_cnt, f_nn_cnt, f_nn_word_len_cnt):
    with open(doc_path, 'r') as f:
        passages = [json.loads(l) for l in f.readlines()]
        page_nrs = []
        for passage in passages:
            doc_title = passage['meta']['doc_title']
            if doc_title in doc_titles_seen:
                raise ValueError(f"Document title already seen: {doc_title}\n Document: {doc_path}\n") 
                
            if "ocab" in doc_title:
                raise ValueError(f"Document is a vocabulary: {doc_title}\n Document: {doc_path}\n") 

            page_nr = int(passage['id'].rsplit('##', 2)[1])
            doc_title = passage['meta']['doc_title']
            text = passage['content']
            spar = [l for l in passage['meta']['SPaR_labels'][1:-1].split(", ") if l]
            f_spar = [l for l in passage['meta']['filtered_SPaR_labels'][1:-1].split(", ") if l]
            cluster = [l for l in passage['meta']['cluster_neighbours'][1:-1].split(", ") if l]
            f_cluster = [l for l in passage['meta']['cluster_filtered'][1:-1].split(", ") if l]

            blob = TextBlob(text)
            sents = blob.sentences
            # during indexing passages are cut off to be 100 words max
            sent_word_lens = []
            for sent in sents:
                sent_word_lens.append(len(sent.words)) if len(sent.words)  < 100 else sent_word_lens.append(100)
            words = [str(w) for w in blob.words]

            # counting
            doc_title_count[doc_title] += 1
            passage_cnt += 1
            u_passage_cnt[text] += 1
            passage_len_cnt[len(text)] += 1


            for sent in sents:
                sent_cnt[sent] += 1

            for num_words in sent_word_lens:
                sent_len_cnt[num_words] += 1

            for w in words:
                word_cnt[w] += 1
                word_len_cnt[len(w)] += 1

            spar_cnt, spar_word_len_cnt = count_labels(spar_cnt, spar_word_len_cnt, spar)
            f_spar_cnt, f_spar_word_len_cnt = count_labels(f_spar_cnt, f_spar_word_len_cnt, f_spar)
            nn_cnt, nn_word_len_cnt = count_labels(nn_cnt, nn_word_len_cnt, cluster)
            f_nn_cnt, f_nn_word_len_cnt = count_labels(f_nn_cnt, f_nn_word_len_cnt, f_cluster)
                
        page_nrs.append(page_nr)
        doc_pages_for_title[max(page_nrs)] += 1
        doc_titles_seen.append(doc_title)

    return doc_count, doc_pages_for_title, doc_title_count, passage_cnt, u_passage_cnt, passage_len_cnt,\
            sent_cnt, sent_len_cnt, word_cnt, word_len_cnt, spar_cnt, spar_word_len_cnt, f_spar_cnt, \
            f_spar_word_len_cnt, nn_cnt, nn_word_len_cnt, f_nn_cnt, f_nn_word_len_cnt, doc_titles_seen


def statistics_per_split(converted_files_directory):
    # grab processed .json files
    if not converted_files_directory.endswith("/"):
        converted_files_directory = converted_files_directory + "/"
    documents = glob.glob(converted_files_directory + "**/*.json", recursive=True)

    # init counters
    doc_count = 0
    duplicates = []
    doc_title_count = Counter()
    doc_pages_for_title = Counter()
    
    passage_cnt = 0 
    u_passage_cnt = Counter() 
    passage_len_cnt = Counter()
    sent_cnt = Counter()
    sent_len_cnt = Counter()
    word_cnt = Counter()
    word_len_cnt = Counter()
    
    spar_cnt = Counter()
    spar_word_len_cnt = Counter()
    f_spar_cnt = Counter()
    f_spar_word_len_cnt = Counter()
    nn_cnt = Counter()
    nn_word_len_cnt = Counter()
    f_nn_cnt = Counter()
    f_nn_word_len_cnt = Counter()
    

    
    doc_titles_seen = []
    for doc_path in tqdm(documents):
        doc_count += 1
        try:
            doc_count, doc_pages_for_title, doc_title_count, passage_cnt, u_passage_cnt, passage_len_cnt,\
            sent_cnt, sent_len_cnt, word_cnt, word_len_cnt, spar_cnt, spar_word_len_cnt, f_spar_cnt, \
            f_spar_word_len_cnt, nn_cnt, nn_word_len_cnt, f_nn_cnt, f_nn_word_len_cnt, doc_titles_seen = process_doc(doc_path, doc_titles_seen, doc_count, doc_pages_for_title, doc_title_count, passage_cnt, u_passage_cnt, passage_len_cnt,
                                        sent_cnt, sent_len_cnt, word_cnt, word_len_cnt, spar_cnt, spar_word_len_cnt, f_spar_cnt, 
                                        f_spar_word_len_cnt, nn_cnt, nn_word_len_cnt, f_nn_cnt, f_nn_word_len_cnt)
        except ValueError:
            # doc_title already seen
            duplicates.append(doc_path)
            continue
    
    print(f"Skipped {len(duplicates)} duplicates")
    return doc_count, doc_pages_for_title, doc_title_count, passage_cnt, u_passage_cnt, passage_len_cnt,\
            sent_cnt, sent_len_cnt, word_cnt, word_len_cnt, spar_cnt, spar_word_len_cnt, f_spar_cnt, \
            f_spar_word_len_cnt, nn_cnt, nn_word_len_cnt, f_nn_cnt, f_nn_word_len_cnt, duplicates

In [7]:
directory = "datavolume/ir_data/pdf_converted"

doc_count, doc_pages_for_title, doc_title_count, passage_cnt, u_passage_cnt, passage_len_cnt, \
            sent_cnt, sent_len_cnt, word_cnt, word_len_cnt, spar_cnt, spar_word_len_cnt, f_spar_cnt, \
            f_spar_word_len_cnt, nn_cnt, nn_word_len_cnt, f_nn_cnt, f_nn_word_len_cnt, duplicates = statistics_per_split(directory)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 471/471 [17:29<00:00,  2.23s/it]

Skipped 70 duplicates





In [10]:
page_lengths = [v for page_lens, c in doc_pages_for_title.items() for v in [page_lens]*c]
print("nr of documents: {}".format(sum(doc_pages_for_title.values())))
print("% of documents ≥ 100 pages: {:.2f}%".format(
    (len([v for v in page_lengths if v > 100]) / len(page_lengths))*100))
print("Mean doc length (pages): {:.2f}".format(np.mean(page_lengths)))
print("Standard deviation length (pages): {:.2f}".format(np.std(page_lengths)))
print("Shortest doc (pages): {}".format(np.min(page_lengths)))
print("Longest doc (pages): {}".format(np.max(page_lengths)))

nr of documents: 407
% of documents ≥ 100 pages: 13.27%
Mean doc length (pages): 62.85
Standard deviation length (pages): 62.07
Shortest doc (pages): 3
Longest doc (pages): 549


In [5]:
print(f"Passages: {passage_cnt} ({sum(u_passage_cnt.values())} unique) passages found in {doc_count} documents")
print(len(doc_title_count.keys()), "unique doc titles and total: ", sum(doc_title_count.values()))

print(f"Expected nr of words (100 * passages): {passage_cnt * 100}")


Passages: 87998 (87998 unique) passages found in 471 documents
407 unique doc titles and total:  87998
Expected nr of words (100 * passages): 8799800


In [6]:
# sentences
sent_lenghts = [v for sent_lens, c in sent_len_cnt.items() for v in [sent_lens]*c if v > 0]
print("# Sentences: {}".format(len(sent_lenghts)))
print("Unique sentences: {}".format(len(sent_cnt.keys())))
cut_off_len = 80
print("% of sents ≥ {} words: {:.2f}%".format(cut_off_len,
    (len([v for v in sent_lenghts if v > cut_off_len]) / len(sent_lenghts)) *100))

print("Mean sentence length (words): {:.2f}".format(np.mean(sent_lenghts)))
print("Standard deviation length (words): {:.2f}".format(np.std(sent_lenghts)))
print("Shortest sentence: {}".format(np.min(sent_lenghts)))
print("Longest sentence: {}".format(np.max(sent_lenghts)))

# Sentences: 296701
Unique sentences: 247766
% of sents ≥ 80 words: 3.81%
Mean sentence length (words): 23.93
Standard deviation length (words): 20.10
Shortest sentence: 1
Longest sentence: 100


In [7]:
# words
print(f"Vocabulary size: {len(word_cnt.keys())}, total nr of `words`: {sum(word_cnt.values())}")
word_lenghts = [v for word_lens, c in word_len_cnt.items() for v in [word_lens]*c]
cut_off_len = 10
print("% of words ≥ {} characters: {:.2f}%".format(cut_off_len,
    (sum([v for v in word_lenghts if v > cut_off_len]) / sum(word_lenghts))* 100))
print("Mean word length (chars): {:.2f}".format(np.mean(word_lenghts)))
print("Standard deviation length (chars): {:.2f}".format(np.std(word_lenghts)))
print("Shortest word (chars): {}".format(np.min(word_lenghts)))
print("Longest word (chars): {}".format(np.max(word_lenghts)))

Vocabulary size: 126359, total nr of `words`: 7105215
% of words ≥ 10 characters: 15.46%
Mean word length (chars): 5.11
Standard deviation length (chars): 3.40
Shortest word (chars): 1
Longest word (chars): 387


In [8]:
# label lengths
spar = [v for label_len, c in spar_word_len_cnt.items() for v in [label_len]*c if v > 0]
print(f"SPaR label vocab size: {len(spar_cnt.keys())}, total nr of labels: {sum(spar_cnt.values())}")
print("Mean length (words): {:.2f}".format(np.mean(spar)))
print("Standard deviation length (words): {:.2f}".format(np.std(spar)))
print("Shortest label (words): {}".format(np.min(spar)))
print("Longest label (words: {}\n".format(np.max(spar)))

f_spar = [v for label_len, c in f_spar_word_len_cnt.items() for v in [label_len]*c if v > 0]
print(f"Filtered SPaR label vocab size: {len(f_spar_cnt.keys())}, total nr of labels: {sum(f_spar_cnt.values())}")
print("Mean length (words): {:.2f}".format(np.mean(f_spar)))
print("Standard deviation length (words): {:.2f}".format(np.std(f_spar)))
print("Shortest label (words): {}".format(np.min(f_spar)))
print("Longest label (words): {}\n".format(np.max(f_spar)))

nn = [v for label_len, c in nn_word_len_cnt.items() for v in [label_len]*c if v > 0]
print(f"Cluster label vocab size: {len(nn_cnt.keys())}, total nr of labels: {sum(nn_cnt.values())}")
print("Mean length (words): {:.2f}".format(np.mean(nn)))
print("Standard deviation length (words): {:.2f}".format(np.std(nn)))
print("Shortest label (words): {}".format(np.min(nn)))
print("Longest label (words): {}\n".format(np.max(nn)))

f_nn = [v for label_len, c in f_nn_word_len_cnt.items() for v in [label_len]*c if v > 0]
print(f"Filtered cluster label vocab size: {len(f_nn_cnt.keys())}, total nr of labels: {sum(f_nn_cnt.values())}")
print("Mean length (words): {:.2f}".format(np.mean(f_nn)))
print("Standard deviation length (words): {:.2f}".format(np.std(f_nn)))
print("Shortest label (words): {}".format(np.min(f_nn)))
print("Longest label (words): {}\n".format(np.max(f_nn)))


SPaR label vocab size: 561405, total nr of labels: 2221966
Mean length (words): 2.21
Standard deviation length (words): 1.73
Shortest label (words): 1
Longest label (words: 68

Filtered SPaR label vocab size: 440430, total nr of labels: 1684454
Mean length (words): 2.14
Standard deviation length (words): 1.51
Shortest label (words): 1
Longest label (words): 68

Cluster label vocab size: 342265, total nr of labels: 5017893
Mean length (words): 2.64
Standard deviation length (words): 1.49
Shortest label (words): 1
Longest label (words): 65

Filtered cluster label vocab size: 278262, total nr of labels: 2793678
Mean length (words): 2.94
Standard deviation length (words): 1.81
Shortest label (words): 1
Longest label (words): 65



In [11]:
[print(d.rsplit("/",1)[1]) for d in duplicates]

BS 4422 (2005).json
BS EN ISO 1182-2010--[Reaction to fire tests for products - no Combustibility].json
BS 6100-6-2008.json
BS EN 81-58 (2018).json
BS 476-7-1997--[2019-08-27--02-01-39 PM].json
BS 7974 (2019).json
BS EN 13501-5-2016.json
BS EN 594-2011.json
BS EN 1993-1-2 (2005).json
BS 8313 (1997).json
BS EN 1365-3-2000--Beams.json
Approved Document B - 2010.json
BS EN 520-2004+A1-2009--[2019-09-09--05-29-22 PM].json
BS EN 1365-5-2004--[2019-07-25--04-50-45 PM].json
BS 6100-0-2010.json
BS EN 1365-3-2000--[2019-07-25--04-47-42 PM].json
BS EN 12114-2000--[Air permeability of builing elements. Laboratory tests].json
BS EN ISO 9346-2007--[2019-09-16--10-40-16 AM].json
BS EN 1993-1-2 (2005)-UK National Annex to Eurocode 3.json
BS EN 1634-3-2004--[2019-09-09--05-23-56 PM].json
BS EN 1634-1 (2014) + A1 2018.json
BS 8218-1998.json
DD ENV 1363-3-2000--Verification of furnace.json
BS EN 13501-1-2018.json
BS 6100-3-2007.json
BS EN 1366-2 (2015).json
BS 476-3 (2004).json
BS EN 832-2000--[Thermal 

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]