In [1]:
import conllu
import pandas as pd
from collections import Counter

In [2]:
## CoNLL-U parsing helpers
def filter_tags(x):
    """Return the NER tag ``x`` unchanged.

    Currently an identity function; it is the single hook through which every
    NER tag passes in ``convert_to_list_dict``.
    """
    tag = x
    return tag

def convert_to_list_dict(path, file):
    """Parse one CoNLL-U file into a list of per-sentence dictionaries.

    Args:
        path: Path template with a single ``{}`` placeholder, filled in with
            ``file`` via ``str.format``.
        file: File stem substituted into ``path``; also shown in the progress
            message.

    Returns:
        A list with one dict per sentence, with the keys:
            ``idx``      - the sentence's ``sent_id`` metadata,
            ``text``     - the lower-cased ``text`` metadata,
            ``tokens``   - lower-cased token forms,
            ``lemmas``   - token lemmas,
            ``pos_tags`` - UPOS tags,
            ``ner_tags`` - the ``name`` entry of each token's MISC field,
                           passed through ``filter_tags`` (``"O"`` if absent).

    Raises:
        KeyError: if a sentence lacks ``sent_id`` or ``text`` metadata.
    """
    full_path = path.format(file)
    sentences = []
    with open(full_path, encoding="UTF-8") as infile:
        # Iterate the parser directly; no need to build an intermediate list.
        for sent in conllu.parse_incr(infile):
            sentences.append({
                "idx": sent.metadata["sent_id"],
                "text": sent.metadata["text"].lower(),
                "tokens": [token["form"].lower() for token in sent],
                "lemmas": [token["lemma"] for token in sent],
                "pos_tags": [token["upos"] for token in sent],
                # conllu parses an empty MISC column ("_") as None, so fall
                # back to an empty dict and treat the token as outside ("O").
                "ner_tags": [
                    filter_tags((token["misc"] or {}).get("name", "O"))
                    for token in sent
                ],
            })
    print("Converting {} to list of dictionaries\n     {} elements converted..".format(file, len(sentences)))
    return sentences

In [3]:
def get_stats_norne(split):
    """Compute corpus statistics for one data split.

    Args:
        split: List of sentence dicts, each with a ``"tokens"`` list and a
            ``"ner_tags"`` list (as produced by ``convert_to_list_dict``).

    Returns:
        A tuple of three single-row DataFrames:
            1. summary stats: ``"Total token count"``, ``"Sentence count"``,
               ``"Average sentence length"`` and ``"Total NE"``;
            2. raw count per NER label (including ``"O"``), columns in
               first-seen order;
            3. each non-``"O"`` label's count divided by ``"Total NE"``.

    Notes:
        ``"Total NE"`` is the number of tokens whose tag is not ``"O"``; both
        ``B-`` and ``I-`` tags are counted, so it measures entity tokens
        rather than entity spans.
        For an empty ``split`` the average sentence length is reported as
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    sentence_count = len(split)
    token_count = 0
    label_counts = Counter()

    for item in split:
        token_count += len(item["tokens"])
        # Counter.update adds to existing counts and keeps first-seen order.
        label_counts.update(item["ner_tags"])

    # Plain dict so the DataFrame constructor sees an ordinary record.
    ner_labels = dict(label_counts)

    total_NE = sum(count for label, count in ner_labels.items() if label != "O")

    stats = {
        "Total token count": token_count,
        "Sentence count": sentence_count,
        # Guard against an empty split (no sentences to average over).
        "Average sentence length": token_count / sentence_count if sentence_count else 0.0,
        "Total NE": total_NE,
    }

    # total_NE is > 0 whenever at least one non-"O" label exists, so the
    # division below cannot hit zero.
    ner_labels_proportion = {
        label: count / total_NE
        for label, count in ner_labels.items()
        if label != "O"
    }

    stats_dataframe = pd.DataFrame([stats])
    ner_labels_dataframe = pd.DataFrame([ner_labels])
    ner_labels_proportion_dataframe = pd.DataFrame([ner_labels_proportion])

    return stats_dataframe, ner_labels_dataframe, ner_labels_proportion_dataframe

In [4]:
# Path template for the corpus files; "{}" is replaced by each file stem.
path = "MSTR-PY/Corpora/all_conllu/{}.conllu"
file_list = [
    "no_bokmaal-ud-dev",
    "no_bokmaal-ud-test",
    "no_bokmaal-ud-train",
    "no_nynorsk-ud-dev",
    "no_nynorsk-ud-test",
    "no_nynorsk-ud-train",
]

# Parse the files in list order and bind each result to its split variable
# ("no" = Bokmaal files, "ny" = Nynorsk files).
(
    dev_split_no,
    test_split_no,
    train_split_no,
    dev_split_ny,
    test_split_ny,
    train_split_ny,
) = (convert_to_list_dict(path, name) for name in file_list)

print("Combining train, dev and test sets..")
# Concatenate the Bokmaal and Nynorsk sentences of each split.
dev_split = [*dev_split_no, *dev_split_ny]
test_split = [*test_split_no, *test_split_ny]
train_split = [*train_split_no, *train_split_ny]
print("Success!")

Converting no_bokmaal-ud-dev to list of dictionaries
     2410 elements converted..
Converting no_bokmaal-ud-test to list of dictionaries
     1939 elements converted..
Converting no_bokmaal-ud-train to list of dictionaries
     15696 elements converted..
Converting no_nynorsk-ud-dev to list of dictionaries
     1890 elements converted..
Converting no_nynorsk-ud-test to list of dictionaries
     1511 elements converted..
Converting no_nynorsk-ud-train to list of dictionaries
     14174 elements converted..
Combining train, dev and test sets..
Success!


In [5]:
# For each split: (summary stats, per-label counts, per-label proportions).
(
    dev_stats_dataframe,
    dev_ner_labels_dataframe,
    dev_ner_labels_proportion_dataframe,
) = get_stats_norne(dev_split)
(
    test_stats_dataframe,
    test_ner_labels_dataframe,
    test_ner_labels_proportion_dataframe,
) = get_stats_norne(test_split)
(
    train_stats_dataframe,
    train_ner_labels_dataframe,
    train_ner_labels_proportion_dataframe,
) = get_stats_norne(train_split)

# DEV

In [8]:
# Summary statistics for the dev split ("Total NE" = tokens tagged other than "O").
dev_stats_dataframe

Unnamed: 0,Total token count,Sentence count,Average sentence length,Total NE
0,67619,4300,15.725349,4128


In [9]:
# Token count per NER label in the dev split, including "O".
dev_ner_labels_dataframe

Unnamed: 0,O,B-PER,I-PER,B-PROD,B-GPE_LOC,I-GPE_LOC,B-ORG,B-DRV,I-ORG,B-GPE_ORG,I-PROD,B-LOC,I-LOC,B-EVT,I-DRV,I-GPE_ORG,I-EVT
0,63491,1092,549,248,454,76,688,129,182,121,259,194,56,16,46,11,7


In [10]:
# Share of each non-"O" label among all non-"O" tokens in the dev split.
dev_ner_labels_proportion_dataframe

Unnamed: 0,B-PER,I-PER,B-PROD,B-GPE_LOC,I-GPE_LOC,B-ORG,B-DRV,I-ORG,B-GPE_ORG,I-PROD,B-LOC,I-LOC,B-EVT,I-DRV,I-GPE_ORG,I-EVT
0,0.264535,0.132994,0.060078,0.109981,0.018411,0.166667,0.03125,0.044089,0.029312,0.062742,0.046996,0.013566,0.003876,0.011143,0.002665,0.001696


# TEST

In [11]:
# Summary statistics for the test split ("Total NE" = tokens tagged other than "O").
test_stats_dataframe

Unnamed: 0,Total token count,Sentence count,Average sentence length,Total NE
0,54739,3450,15.866377,3468


In [12]:
# Token count per NER label in the test split, including "O".
test_ner_labels_dataframe

Unnamed: 0,O,B-PER,I-PER,B-PROD,I-PROD,B-GPE_LOC,I-GPE_LOC,B-GPE_ORG,B-ORG,I-ORG,B-DRV,B-LOC,I-LOC,B-EVT,I-EVT,I-DRV,I-GPE_ORG,B-MISC,I-MISC
0,51271,961,510,131,126,428,79,61,521,248,78,185,85,14,4,13,7,14,3


In [13]:
# Share of each non-"O" label among all non-"O" tokens in the test split.
test_ner_labels_proportion_dataframe

Unnamed: 0,B-PER,I-PER,B-PROD,I-PROD,B-GPE_LOC,I-GPE_LOC,B-GPE_ORG,B-ORG,I-ORG,B-DRV,B-LOC,I-LOC,B-EVT,I-EVT,I-DRV,I-GPE_ORG,B-MISC,I-MISC
0,0.277105,0.147059,0.037774,0.036332,0.123414,0.02278,0.017589,0.150231,0.071511,0.022491,0.053345,0.02451,0.004037,0.001153,0.003749,0.002018,0.004037,0.000865


# TRAIN

In [14]:
# Summary statistics for the train split ("Total NE" = tokens tagged other than "O").
train_stats_dataframe

Unnamed: 0,Total token count,Sentence count,Average sentence length,Total NE
0,489217,29870,16.378206,32625


In [15]:
# Token count per NER label in the train split, including "O".
train_ner_labels_dataframe

Unnamed: 0,O,B-PROD,I-PROD,B-LOC,B-PER,I-PER,B-GPE_ORG,B-ORG,B-DRV,I-ORG,B-GPE_LOC,I-DRV,B-MISC,I-GPE_LOC,I-LOC,B-EVT,I-EVT,I-GPE_ORG,I-MISC
0,456592,1404,1417,1511,8320,4912,756,5601,969,2128,4222,169,14,277,424,274,157,66,4


In [16]:
# Share of each non-"O" label among all non-"O" tokens in the train split.
train_ner_labels_proportion_dataframe

Unnamed: 0,B-PROD,I-PROD,B-LOC,B-PER,I-PER,B-GPE_ORG,B-ORG,B-DRV,I-ORG,B-GPE_LOC,I-DRV,B-MISC,I-GPE_LOC,I-LOC,B-EVT,I-EVT,I-GPE_ORG,I-MISC
0,0.043034,0.043433,0.046314,0.255019,0.150559,0.023172,0.171678,0.029701,0.065226,0.12941,0.00518,0.000429,0.00849,0.012996,0.008398,0.004812,0.002023,0.000123
