In [68]:
import conllu
import os
import pandas as pd


In [69]:
## Conllu stuff
def filter_tags(x):
    return x        

def convert_to_list_dict(path, file):
    path = path.format(file)
    with open(path, encoding="UTF-8") as infile:
        lst = []
        tokens = list(conllu.parse_incr(infile))
        for sent in tokens:
                dic = {
                "idx": sent.metadata["sent_id"],
                "text": sent.metadata["text"].lower(),
                "tokens": [token["form"].lower() for token in sent],
                "lemmas": [token["lemma"] for token in sent],
                "pos_tags": [token["upos"] for token in sent],
                "ner_tags": [filter_tags(token["misc"].get("name", "O")) for token in sent],
            }
                lst.append(dic) 
        print("Converting {} to list of dictionaries\n     {} elements converted..".format(file, len(lst)))
    return lst

In [70]:
path = "/Users/aarnes/Documents/GitHub/MA_Peter-R-ysland-Aarnes/NorBERT2/all_conllu/{0}.conllu"
file_list = ["no_bokmaal-ud-dev", "no_bokmaal-ud-test", "no_bokmaal-ud-train", "no_nynorsk-ud-dev", "no_nynorsk-ud-test", "no_nynorsk-ud-train"]

dev_split_no = convert_to_list_dict(path, file_list[0])
test_split_no = convert_to_list_dict(path, file_list[1])
train_split_no = convert_to_list_dict(path, file_list[2])

dev_split_ny = convert_to_list_dict(path, file_list[3])
test_split_ny = convert_to_list_dict(path, file_list[4])
train_split_ny = convert_to_list_dict(path, file_list[5])

print("Combining train, dev and test sets..")
dev_split = dev_split_no + dev_split_ny
test_split = test_split_no + test_split_ny
train_split = train_split_no + train_split_ny
print("Success!")

Converting no_bokmaal-ud-dev to list of dictionaries
     2410 elements converted..
Converting no_bokmaal-ud-test to list of dictionaries
     1939 elements converted..
Converting no_bokmaal-ud-train to list of dictionaries
     15696 elements converted..
Converting no_nynorsk-ud-dev to list of dictionaries
     1890 elements converted..
Converting no_nynorsk-ud-test to list of dictionaries
     1511 elements converted..
Converting no_nynorsk-ud-train to list of dictionaries
     14174 elements converted..
Combining train, dev and test sets..
Success!


In [71]:
datasets = dev_split + test_split + train_split


In [115]:


def avg_len_sen(dataset):
    len_count = []
    sent_count = 0
    for item in dataset:
        for sentence in item["ner_tags"]:
            len_count.append(len(sentence))
        sent_count += 1

    avg = round(sum(len_count)/sent_count, 2)
    print(avg)


In [120]:
avg_len_sen(train_split)
avg_len_sen(dev_split)
avg_len_sen(test_split)
avg_len_sen(datasets)

21.55
20.3
20.63
21.33


In [112]:
for i in dev_split["ner_tags"]:
    print(i)

TypeError: list indices must be integers or slices, not str

In [72]:
ner_tags = []
for item in datasets:
    ner_tags.append(item["ner_tags"])

In [73]:
tag_dit = {"PER":[],
            "ORG":[],
            "GPE_LOC":[],
            "GPE_ORG":[],
            "PROD":[],
            "LOC":[],
            "DRV":[],
            "EVT":[],
            "MISC":[]}

In [74]:
for sentence in ner_tags:
    tag_len = 0
    for tag in sentence:
        tag_name = tag[2:]
        if tag[0] == "B":
            if tag_len != 0:
                tag_dit[tag_name].append(tag_len)
            tag_len=1     
        elif tag[0] == "I":
            tag_len += 1
        

In [75]:
len(tag_dit["PER"])

3281

In [76]:
tag_dict = {"PER":[], "ORG":[], "GPE_LOC":[], "GPE_ORG":[], "PROD":[], "LOC":[], "DRV":[], "EVT":[], "MISC":[]}

def update_tag_dict(tag_dict, ner_tags):
    for tag_list in ner_tags:
        for i, tag in enumerate(tag_list):
            if tag.startswith("B-"):
                # Start of a new span
                entity_type = tag[2:]
                span_length = 1
                
                # Keep iterating until we reach the end of the span or the end of the sentence
                j = i + 1
                while j < len(tag_list) and tag_list[j] == "I-" + entity_type:
                    span_length += 1
                    j += 1
                
                # Add the span length to the appropriate list in the dictionary
                tag_dict[entity_type].append(span_length)
                
    return tag_dict


In [77]:
tag_dict = update_tag_dict(tag_dict, ner_tags)

In [78]:
for type, values in tag_dict.items():
    print(f"{type}   {round(sum(values)/len(values), 2)}")

PER   1.58
ORG   1.38
GPE_LOC   1.08
GPE_ORG   1.09
PROD   2.01
LOC   1.3
DRV   1.19
EVT   1.55
MISC   1.25


In [82]:
"""LOADING NPSC SAMPLES """


import csv

with open('/Users/aarnes/Documents/GitHub/MA_Peter-R-ysland-Aarnes/IOB NPSC sample/200_annoterte_setninger.csv', newline='', encoding="UTF-8") as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',')
    left_col = []
    right_col = []
    for row in csvreader:
        left_col.append(row[0])
        right_col.append(row[1])

x, y = [], []

with open("/Users/aarnes/Documents/GitHub/MA_Peter-R-ysland-Aarnes/IOB NPSC sample/parl_annotation_comma_fixed 100 sents.csv", "r", encoding="UTF-8") as f_input:
    reader = csv.reader(f_input, delimiter=",")
    for i, row in enumerate(reader):
        if i%2:
            y.append(row)
        else:
            x.append(row)


def create_sublists(lst):
    sublists = []
    sublist = []
    for element in lst:
        if element == '':
            if sublist:
                sublists.append(sublist)
                sublist = []
        else:
            sublist.append(element)

    if sublist:
        sublists.append(sublist)

    return sublists

y200 = create_sublists(right_col)
x200 = create_sublists(left_col)


def lower_list_of_lists(lists):
    return [[word.lower() for word in sublist] for sublist in lists]


In [83]:
y = y + y200

In [85]:
len(y)

300

In [86]:
tag_dict = {"PER":[], "ORG":[], "GPE_LOC":[], "GPE_ORG":[], "PROD":[], "LOC":[], "DRV":[], "EVT":[], "MISC":[]}


In [95]:
tag_dict = update_tag_dict(tag_dict, y)

total_NE = 0
for values in tag_dict.values():
    total_NE += sum(values)

In [96]:
total_NE

432

In [104]:
for type, values in tag_dict.items():
    print(f"{type}   {round(sum(values)/432, 4)}")

PER   0.2917
ORG   0.4074
GPE_LOC   0.1574
GPE_ORG   0.0926
PROD   0.0231
LOC   0.0093
DRV   0.0139
EVT   0.0
MISC   0.0046


In [90]:
tag_dict

{'PER': [2,
  1,
  1,
  1,
  1,
  1,
  2,
  2,
  2,
  1,
  2,
  2,
  1,
  2,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  3,
  3,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  2,
  3,
  1,
  1,
  1,
  1,
  2,
  3,
  1,
  2,
  1],
 'ORG': [1,
  1,
  1,
  2,
  3,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  3,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  3,
  2],
 'GPE_LOC': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  3,
  3,
  1,
  1,
  1,
  1,
  1,
  1,
  3,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'GPE_ORG': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'PROD': [1, 1, 1, 1, 1],
 'LOC': [1, 1],
 'DRV': [1, 1, 1],
 'EVT': [],
 'MISC': [1]}

In [105]:
flat_list = [item for sublist in y for item in sublist]


In [108]:
len(flat_list) - total_NE

5006

In [121]:
len_count = []
sent_count = 0
for item in y:
    len_count.append(len(item))
    sent_count += 1

avg = round(sum(len_count)/sent_count, 2)
print(avg)

18.13
