In [12]:
import spacy
from sklearn.metrics import confusion_matrix, accuracy_score,f1_score, precision_score, recall_score, classification_report
import os
from tqdm import tqdm

In [13]:
import json

class ParlamentaryCorpus():
    """Reader for one normalized-token JSON file of the parliamentary corpus.

    The file must contain a top-level ``"sentences"`` list. Every sentence
    needs a ``"sentence_id"`` and a ``"tokens"`` list, and every token needs
    the keys ``"token_text"`` and ``"special_status"``.
    """

    def __init__(self, path): # Takes input normalized_token_file
        """Store the file location; nothing is read until ``load_data`` runs."""
        self.path = path
        # Filled by load_data(); maps sentence id -> list of token strings.
        self.sentence_dict = {}

    def load_data(self, lower=False):
        """Read the JSON file and return ``{sentence_id: [token, ...]}``.

        Tokens whose ``special_status`` is not ``None`` are dropped. A token
        text that contains a space is split on whitespace into several tokens.

        Args:
            lower: when truthy, every token text is lowercased before it is
                stored; otherwise the original casing is kept.

        Returns:
            ``self.sentence_dict`` (the same object, updated in place).
            A sentence only gets an entry once at least one of its tokens has
            ``special_status`` equal to ``None``, so sentences made up solely
            of special tokens (or with no tokens at all) are absent.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if the file is not valid JSON.
            KeyError: if an expected key is missing from the JSON structure.
        """
        # Open the file once, with an explicit encoding, so the result does
        # not depend on the platform's default codec.
        with open(self.path, encoding="UTF-8") as infile:
            json_input = json.load(infile)

        sentence_dict = self.sentence_dict
        for sentence in json_input["sentences"]:
            token_list = []
            for token in sentence["tokens"]:
                if token["special_status"] is not None:
                    continue

                text = token["token_text"]
                if lower:
                    text = text.lower()

                if " " in text:
                    token_list.extend(text.split())
                else:
                    token_list.append(text)

                # Registered inside the token loop on purpose: this keeps
                # sentences without any usable token out of the mapping.
                sentence_dict[sentence["sentence_id"]] = token_list

        return sentence_dict

In [14]:
def load_parl_corpus(rootdir_parl_corpus, lower=False):
    """Load every normalized-token file found below a directory tree.

    The tree is walked with ``os.walk`` and each file whose name contains
    ``"normalized_token_data.json"`` is parsed with ``ParlamentaryCorpus``.

    Args:
        rootdir_parl_corpus: directory to search recursively.
        lower: when truthy, each file is parsed a second time with
            lowercased tokens.

    Returns:
        A tuple ``(corpora_normal_cap, corpora_lower)``. Both are lists with
        one ``{sentence_id: [token, ...]}`` dict per file, in discovery order.
        ``corpora_lower`` is empty unless ``lower`` is truthy.

    Side effects:
        For every sentence whose token list is empty, the sentence id and the
        file path are printed.
    """
    paths = [
        os.path.join(subdir, file)
        for subdir, _dirs, files in os.walk(rootdir_parl_corpus)
        for file in files
        if "normalized_token_data.json" in file
    ]

    corpora_normal_cap = []
    for path in paths:
        sentences = ParlamentaryCorpus(path).load_data()
        corpora_normal_cap.append(sentences)
        # Report sentences that ended up without tokens so they can be inspected.
        for sentence_id, tokens in sentences.items():
            if not tokens:
                print(sentence_id)
                print(path)

    corpora_lower = []
    if lower:
        for path in paths:
            corpora_lower.append(ParlamentaryCorpus(path).load_data(lower=True))

    return corpora_normal_cap, corpora_lower

In [15]:
""" Parliamentary corpus data loading stuff"""
# Norwegian Bokmål spaCy pipeline used for tokenisation and entity tagging below.
nlp = spacy.load("nb_core_news_lg")
# Corpus location. Set the PARL_CORPUS_ROOT environment variable to run the
# notebook on another machine; when it is unset or empty the original path is used.
rootdir = os.environ.get("PARL_CORPUS_ROOT") or r"C:\Users\Aarne\Desktop\Ferdig_code_folder\parl_corpus_full"
# lower=True so that both the cased and the lowercased variants are returned.
corpora_normal_cap, corpora_lower = load_parl_corpus(rootdir, lower=True)

In [16]:
# Rebuild casing for the lowercased corpora from spaCy's entity tags, including
# the hyphen ("bindestrek") fix for names such as "Sør-Trøndelag".

def _capitalize_entity_token(text):
    """Return ``text`` cased the way this notebook cases an entity token.

    Without a hyphen the whole token goes through ``str.capitalize``. With a
    hyphen, only the first character and every character directly after a
    "-" are capitalised; all other characters are kept unchanged.
    """
    if "-" not in text:
        return text.capitalize()
    return "".join(
        ch.capitalize() if idx == 0 or text[idx - 1] == "-" else ch
        for idx, ch in enumerate(text)
    )


pred_sentences = []
for corpus_sentences in tqdm(corpora_lower):
    for words in corpus_sentences.values():
        parsed = nlp(" ".join(words))
        # Only tokens that carry an entity type are re-cased.
        rebuilt = [
            _capitalize_entity_token(tok.text) if tok.ent_type_ != "" else tok.text
            for tok in parsed
        ]
        pred_sentences.append(" ".join(rebuilt))

100%|██████████| 39/39 [03:20<00:00,  5.13s/it]


In [17]:
# One label list per predicted sentence: 1 when the whitespace-separated token
# starts with an uppercase character, 0 otherwise.
pred_annotations = [
    [1 if word[0].isupper() else 0 for word in predicted.split()]
    for predicted in pred_sentences
]
            

In [18]:
# Gold labels come from the original-cased corpora: each sentence is run
# through the spaCy pipeline and every resulting token is labelled 1 when its
# first character is uppercase, 0 otherwise. The parsed docs are kept in `docs`.
gold_annotations = []
docs = []
for corpus_sentences in tqdm(corpora_normal_cap):
    for words in corpus_sentences.values():
        parsed = nlp(" ".join(words))
        docs.append(parsed)
        gold_annotations.append([int(tok.text[0].isupper()) for tok in parsed])

100%|██████████| 39/39 [03:19<00:00,  5.13s/it]


In [19]:
# Same entity re-capitalisation as in the earlier cell, restricted to the
# first lowercased corpus (corpora_lower[:1]).
# NOTE(review): this rebinds `pred_sentences` to a one-corpus subset, while
# `pred_annotations` was built from the earlier full list and is not
# recomputed here. Confirm this cell is only meant for inspection.
pred_sentences = []
for sentence_map in corpora_lower[:1]:
    for _sentence_id, words in sentence_map.items():
        analysed = nlp(" ".join(words))
        rebuilt = []
        for tok in analysed:
            if tok.ent_type_ == "":
                # Not part of an entity: leave untouched.
                rebuilt.append(tok.text)
            elif "-" not in tok.text:
                rebuilt.append(tok.text.capitalize())
            else:
                # Hyphenated entity such as "Sør-Trøndelag": capitalise the
                # first character of every hyphen-separated part.
                parts = tok.text.split("-")
                rebuilt.append(
                    "-".join(part[:1].capitalize() + part[1:] for part in parts)
                )
        pred_sentences.append(" ".join(rebuilt))

In [20]:
# Predicted labels come from a whitespace split of the rebuilt sentences while
# gold labels come from spaCy tokens of the cased text, so the two sides are
# not guaranteed to line up. Fail loudly instead of scoring shifted labels.
if len(pred_annotations) != len(gold_annotations):
    raise ValueError(
        "Sentence count mismatch: %d predicted vs %d gold"
        % (len(pred_annotations), len(gold_annotations))
    )
_first_mismatch = next(
    (
        idx
        for idx, (pred, gold) in enumerate(zip(pred_annotations, gold_annotations))
        if len(pred) != len(gold)
    ),
    None,
)
if _first_mismatch is not None:
    raise ValueError(
        "Token count mismatch in sentence %d: %d predicted vs %d gold"
        % (
            _first_mismatch,
            len(pred_annotations[_first_mismatch]),
            len(gold_annotations[_first_mismatch]),
        )
    )

# Flatten the per-sentence label lists into the 1-D sequences used for scoring.
y_pred = [item for sublist in pred_annotations for item in sublist]
y_true = [item for sublist in gold_annotations for item in sublist]

In [None]:
# Token-level evaluation of the predicted capitalisation labels.
# In scikit-learn's confusion matrix, rows are true labels and columns are
# predicted labels.
capitalisation_confusion = confusion_matrix(y_true, y_pred)
print(capitalisation_confusion)

# Per-class precision, recall and F1, shown with four decimals.
capitalisation_report = classification_report(y_true, y_pred, digits=4)
print(capitalisation_report)