### Importing spaCy and loading our texts and annotations

In [115]:
import spacy

nlp = spacy.load("nl_core_news_lg")

In [116]:
from tqdm import tqdm

import gzip
import os

import random

import pandas

corpus = "./Corpus/"
anns = "./Annotations/"

def load(file, corpus, lines=False):
    with open(os.path.join(corpus, file), "r") as handle:
        if lines:
            return list(handle) # handle.readlines()
        else:
            return handle.read()

### Using csv to read BRAT annotations



In [117]:

def read_csv(file, corpus=anns):
    file = load(file, corpus, lines=True)
    for line in file:
        tab_cols = line.split("\t")
        
        if tab_cols[0].startswith("T"):
            last = tab_cols[-1].strip()
            first = tab_cols[0]
            
            middle = tab_cols[1].split()
#             print(middle, middle[0], middle[-1])
            middle = [middle[0], middle[1], middle[-1]]
            
            yield (first, *middle, last)
            
def filter_rows(rows):
    for r in rows:
        if r[1].upper() in ["WOMEN", "INDIGENOUS", "MEN"]:
        #if r[1].upper() in ["PERSON", "ORG", "GPE", "WOMEN", "INDIGENOUS", "MEN"]:
            yield r
        
def change_rows(rows):
    for r in rows:
        entity_nr, entity_type, start, end, label = r
        yield int(start), int(end), entity_type.upper(), label
        


In [118]:
data = {}
listallpeoples = []

for ann_f in os.listdir(anns):
    if len(load(ann_f, anns)) > 0:
        cur_name = ann_f.strip(".ann")
    
        txt_f = cur_name + ".txt"
        if os.path.isfile(os.path.join(corpus, txt_f)):
        
            #raw_text = load(txt_f, corpus)
            brat_entity_list = list(change_rows(filter_rows(read_csv(ann_f, anns))))
            
            data[txt_f] = (brat_entity_list)
        else:
            print("no file", txt_f)
    else:
        print("annotation file empty")

no file gitkeep.txt
annotation file empty


In [119]:
data

{'NL-HaNA_1.04.02_6847_0758.txt': [(1005, 1009, 'MEN', 'Heer'),
  (1207, 1217, 'WOMEN', 'huifsvrouw'),
  (1218, 1223, 'WOMEN', 'mejuf'),
  (1182, 1186, 'MEN', 'Heer')],
 'NL-HaNA_1.04.02_6847_0016.txt': [(830, 833, 'MEN', 'm:r'),
  (1791, 1800, 'WOMEN', 'Erfgename'),
  (1824, 1830, 'INDIGENOUS', 'slaaff'),
  (1844, 1852, 'INDIGENOUS', 'Slavinne'),
  (1844, 1852, 'WOMEN', 'Slavinne'),
  (1924, 1931, 'INDIGENOUS', 'slaeven'),
  (1936, 1940, 'WOMEN', 'haar'),
  (1973, 1977, 'WOMEN', 'haar')],
 'NL-HaNA_1.04.02_6847_0017.txt': [(122, 128, 'WOMEN', 'dogter'),
  (262, 265, 'MEN', 'hij'),
  (421, 423, 'WOMEN', 'zy'),
  (429, 434, 'WOMEN', 'haker'),
  (573, 579, 'WOMEN', 'Jnsf=m'),
  (606, 610, 'WOMEN', 'wed:'),
  (727, 730, 'MEN', 'zyn'),
  (917, 921, 'WOMEN', 'haar'),
  (1442, 1447, 'MEN', 'mons:')],
 'NL-HaNA_1.04.02_6847_0759.txt': [],
 'NL-HaNA_1.04.02_6847_0029.txt': [(590, 596, 'WOMEN', 'moeden'),
  (597, 604, 'WOMEN', 'Juff=mo'),
  (628, 633, 'WOMEN', 'wed:e'),
  (1054, 1072, 'WOMEN', 

In [120]:
a = data.values()
l = list(a)

In [121]:
l

[[(1005, 1009, 'MEN', 'Heer'),
  (1207, 1217, 'WOMEN', 'huifsvrouw'),
  (1218, 1223, 'WOMEN', 'mejuf'),
  (1182, 1186, 'MEN', 'Heer')],
 [(830, 833, 'MEN', 'm:r'),
  (1791, 1800, 'WOMEN', 'Erfgename'),
  (1824, 1830, 'INDIGENOUS', 'slaaff'),
  (1844, 1852, 'INDIGENOUS', 'Slavinne'),
  (1844, 1852, 'WOMEN', 'Slavinne'),
  (1924, 1931, 'INDIGENOUS', 'slaeven'),
  (1936, 1940, 'WOMEN', 'haar'),
  (1973, 1977, 'WOMEN', 'haar')],
 [(122, 128, 'WOMEN', 'dogter'),
  (262, 265, 'MEN', 'hij'),
  (421, 423, 'WOMEN', 'zy'),
  (429, 434, 'WOMEN', 'haker'),
  (573, 579, 'WOMEN', 'Jnsf=m'),
  (606, 610, 'WOMEN', 'wed:'),
  (727, 730, 'MEN', 'zyn'),
  (917, 921, 'WOMEN', 'haar'),
  (1442, 1447, 'MEN', 'mons:')],
 [],
 [(590, 596, 'WOMEN', 'moeden'),
  (597, 604, 'WOMEN', 'Juff=mo'),
  (628, 633, 'WOMEN', 'wed:e'),
  (1054, 1072, 'WOMEN', 'natuurlijke dogter')],
 [(139, 145, 'WOMEN', 'moeder'),
  (298, 301, 'WOMEN', 'zij'),
  (416, 420, 'WOMEN', 'haar'),
  (421, 427, 'WOMEN', 'moeder'),
  (428, 436, '

### Terms for men, women and indigenous

In [122]:
terms_men = []
for x in l:
    for y in x:
        if y[2] == 'MEN':
            terms_men.append(y[3])
terms_men = set(terms_men)

In [123]:
print(terms_men)

{'zoon', 'heere', 'sijn', 'Zoon', 'm:', 'Pieter Hendrik Popijwoud', 'dE', 'schoon Zoon', 'heer', 'haer', 'broeders', 'Iongman', 'weesm:', 'man', 'wager', 'heer m:r', 'meester', 'neeven', 'zoonen', 'Heeren', 'Heere', 'mons:', 'sijnen', 'M:r', 'zyn', 'Eerw: heer', 'mons', 'zoons', 'De Heer', 'zijner', 'agt: heeren', 'Heer', 'oom', 'Eerw: heeren', 'broeder', 'soon', 'hij', 'hem', 'zijn', 'm:r', 'VE:', 'voogd', 'vader', 'heeren', 'Mons:r', 'booeder', 'mons:r', 'zoontje', 'opperm:', 'dE:', 'Ionge'}


In [124]:
terms_women=[]
for x in l:
    for y in x:
        if y[2] == 'WOMEN':
            terms_women.append(y[3])
terms_women = set(terms_women)

In [125]:
print(terms_women)

{'haar moeder', 'Clara Geertruijda schoender', 'zy', 'moeder', 'moeden', 'Pijs=m', 'mesust=m', 'doopdogter', 'Jntt=e', 'Iup=m', 'pebronella Iohanna Brouwer', 'Just=', 'Sister', 'Jnsf=m', 'testatrice', 'Cuthanna man Popijn', 'haker', 'Juff=r', 'mesust=r', 'wese', 'Anna Elisabeth', 'wilhelmina Elisabeth', 'wed=e', 'zuiders', 'vrouwe', 'groot moeder', 'wed:e', 'huijsvrouw', 'wij', 'Cuthanna man Copijn', 'dogter', 'huijs„ vrouw', 'haer', 'mejnst=m', 'puysvrouw', 'haarer', 'prysvrou', 'vrije fristen vrouw', 'Ins=m', 'mejuf', 'nigt', 'Slavinne', 'Just=m', 'machome', 'Wilhelmina Elrabesh', 'ngt', 'JnW=m', 'juff=en', 'Mejufrouw', 'Juff„', 'Susters', 'huijsv:r', 'natuurlijke dogter', 'wed:', 'zuster', 'Jongste dogter', 'Juff=mo', 'harer', 'Iuffrouw', 'suster', 'Cnthanna mara Topijn', 'me jnst=m', 'Hluste', 'Dist=m', 'huijs vrouw', 'huifsvrouw', 'geertruijda Catharina', 'hare', 'susters', 'Jnisvrouw', 'vrije Cristen vion', 'Johanna menia rotgers', 'Sara maria', 'Jn p=m', 'zij', 'vrouw', 'Elisabe

In [126]:
terms_indigenous=[]
for x in l:
    for y in x:
        if y[2] == 'INDIGENOUS':
            terms_indigenous.append(y[3])
terms_indigenous=set(terms_indigenous)

In [128]:
print(terms_indigenous)

{'slaaven', 'Pavinnen', 'sla „kinnen', 'vrije Christen vrouw', 'lijf eigenen', 'vrije Chrisle vrouw', 'slaaff', 'De moor', 'slaven, nen', 'lijf Eigenen', 'vrije Cristen vion', 'sla„ vernije', 'Enlandsche Crietene', 'slavinne', 'vrije fristen vrouw', 'lijfeigenen', 'inlanderen', 'Slavinne', 'slaaf', 'lijff Eigenen', 'slaven', 'vrije Ionge', 'Njai Rentje', 'maleijdse vrouw', 'slavinnen', 'vrije Cristene vrouw', 'Stariynen', 'mandadoor de vrije', 'lijf eijdenen', 'Vai', 'Lijfflijgenen', 'sla„ verrige', 'slaeven', 'sla„ verrije', 'Saroenie', 'lijf eijgenen', 'Leo', 'lijf eidenen'}


### Creating an index for women, indigenous (make CSV)
### Counting instances of indigenous as compared with number of pages


In [19]:
test = "./TestCorpus"


In [43]:
import pandas as pd
records = []


### other stats
