In [7]:
from tqdm import tqdm

import gzip
import os

import random


d = "Corpus" 

In [8]:
import spacy
import spacy
from spacy import displacy
nlp = spacy.load("nl_core_news_lg")

In [9]:
import random

# Define NER Pipeline and Run on Texts

In [10]:
def load(file):
    with open(os.path.join(d, file), "r") as handle:
        return handle.read()
    
def preprocess(text):
    return text


relevant_labels = ["PERSON", "GPE", "ORG"]
def postprocess(doc_obj):
    return [e for e in doc_obj.ents if (e.label_ in relevant_labels) and len(str(e)) > 2]

In [11]:
files = random.sample(os.listdir(d), 1)

entity_lists = {}
texts = []
for f in tqdm(files):
    text = load(f)
    texts.append(text)
    entities = postprocess(
                    nlp(
                        preprocess(text)
                        )
                    )   
    
    entity_lists[f] = entities

100%|██████████| 1/1 [00:00<00:00,  4.02it/s]


In [12]:
all_ents = [e for e_ls in entity_lists.values() for e in e_ls]

# Inspect List of Entities and Distribution over it

In [26]:
for e in all_ents:
    print(e, "|", e.label_, "\n")

bepaald:/ | ORG 

testatrije delangstleevende | PERSON 

actien Obligatien | PERSON 

Eerffecten | PERSON 

siaminie | PERSON 

onzen gemeeneen | PERSON 

den ge | GPE 

beneeden | GPE 

voortloopen also den Eerst | PERSON 

deesen | PERSON 

overzulks den | PERSON 

jangst leevende | PERSON 

weerkaner | PERSON 

Heeren weesmee | PERSON 

gewiijtige kerdenen | PERSON 

Eerwaardens | PERSON 

Cas | PERSON 

deesen | PERSON 



In [27]:
from collections import Counter
import matplotlib.pyplot as plt

In [28]:
entity_counts = Counter(all_ents)

ranks, counts = list(zip(*enumerate(c for e, c in entity_counts.most_common())))


for ent, count in entity_counts.most_common(10):
    print(count, ent)

1 bepaald:/
1 testatrije delangstleevende
1 actien Obligatien
1 Eerffecten
1 siaminie
1 onzen gemeeneen
1 den ge
1 beneeden
1 voortloopen also den Eerst
1 deesen


# Create Index of Entities 
## i.e. Link Entities into Texts in which they appear

In [29]:
import pandas as pd

records = []

for (filename, entity_list), t in zip(entity_lists.items(), texts):
    for entity in entity_list:
        start, end = max(entity.start_char-10, 0), min(entity.end_char+10, len(t))
        rec = (str(entity), filename, entity.label_, t[start:end]) # , entity.start, entity.end)
        records.append(rec)
        
        
df = pd.DataFrame.from_records(records, columns=["entity_name", "filename", "entity_type", "snippet"])
    
df.to_csv("some_csv.csv")

In [34]:
df[df.entity_type == "PERSON"]

Unnamed: 0,entity_name,filename,entity_type,snippet
1,testatrije delangstleevende,txt_NL-HaNA_1.04.02_6847_1185.txt,PERSON,aald:/\nde testatrije delangstleevende zijnde va
2,actien Obligatien,txt_NL-HaNA_1.04.02_6847_1185.txt,PERSON,mits alle actien Obligatien vaste Eer
3,Eerffecten,txt_NL-HaNA_1.04.02_6847_1185.txt,PERSON,ien vaste Eerffecten en andere
4,siaminie,txt_NL-HaNA_1.04.02_6847_1185.txt,PERSON,siaven en siaminie juwelen\nv
5,onzen gemeeneen,txt_NL-HaNA_1.04.02_6847_1185.txt,PERSON,naamd\ntot onzen gemeeneen boedel ge
8,voortloopen also den Eerst,txt_NL-HaNA_1.04.02_6847_1185.txt,PERSON,er waarde voortloopen also den Eerst stergeend
9,deesen,txt_NL-HaNA_1.04.02_6847_1185.txt,PERSON,teerd bij deesen en dat ov
10,overzulks den,txt_NL-HaNA_1.04.02_6847_1185.txt,PERSON,en en dat overzulks den jangst le
11,jangst leevende,txt_NL-HaNA_1.04.02_6847_1185.txt,PERSON,zulks den jangst leevende onzer\neen
12,weerkaner,txt_NL-HaNA_1.04.02_6847_1185.txt,PERSON,rd om ter weerkaner alhier\nte


In [30]:
df[df.entity_type == "PERSON"].to_csv("person_csv.csv")

# Things to DO

 - get distribution over entities
 - create actual index as CSV table
 - create basic knowledge graph 

---
## Use Jaccard Distance (Intersection over Union)

Compute overlap of all pairs of texts (in terms of Jaccard distance) and then establish the distribution over these overlaps. Could use that to identify texts that appear because the same physical page was scanned twice.

In [33]:
def jaccard(words1, words2):
    s1, s2 = set(words1), set(words2)
    return len(s1 & s2)/len(s1 | s2)


texts = {f: load(f) for f in files}


all_distances = []
for f1, t1 in texts.items():
    for f2, t2 in texts.items():
        d = jaccard(t1, t2)
        all_distances.append(d)
        print(f1, f2, d)


txt_NL-HaNA_1.04.02_6847_0013.txt txt_NL-HaNA_1.04.02_6847_0013.txt 1.0
txt_NL-HaNA_1.04.02_6847_0013.txt txt_NL-HaNA_1.04.02_6847_0012.txt 0.11475409836065574
txt_NL-HaNA_1.04.02_6847_0013.txt txt_NL-HaNA_1.04.02_6847_0011.txt 0.967741935483871
txt_NL-HaNA_1.04.02_6847_0012.txt txt_NL-HaNA_1.04.02_6847_0013.txt 0.11475409836065574
txt_NL-HaNA_1.04.02_6847_0012.txt txt_NL-HaNA_1.04.02_6847_0012.txt 1.0
txt_NL-HaNA_1.04.02_6847_0012.txt txt_NL-HaNA_1.04.02_6847_0011.txt 0.11475409836065574
txt_NL-HaNA_1.04.02_6847_0011.txt txt_NL-HaNA_1.04.02_6847_0013.txt 0.967741935483871
txt_NL-HaNA_1.04.02_6847_0011.txt txt_NL-HaNA_1.04.02_6847_0012.txt 0.11475409836065574
txt_NL-HaNA_1.04.02_6847_0011.txt txt_NL-HaNA_1.04.02_6847_0011.txt 1.0
