In [12]:
from itertools import chain

In [13]:
def read_conll_file(file_path):
    """Parse a CoNLL-style column file into parallel token and tag lists.

    The file is read as UTF-8. Each non-blank line is split on whitespace:
    the first field is taken as the token and the last field as its NER tag.
    Blank (or whitespace-only) lines end the current sentence. Lines whose
    first field is ``-DOCSTART-`` are ignored.

    Args:
        file_path: Path of the file to read.

    Returns:
        A ``(sentences, tags)`` tuple. ``sentences`` is a list of token lists
        and ``tags`` is a list of tag lists, aligned index by index.
    """
    all_tokens, all_labels = [], []
    cur_tokens, cur_labels = [], []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            # split() on a blank/whitespace-only line yields an empty list.
            fields = raw_line.split()

            if not fields:
                # Sentence boundary: store what was collected (if anything)
                # and start over with fresh buffers.
                if cur_tokens:
                    all_tokens.append(cur_tokens)
                    all_labels.append(cur_labels)
                cur_tokens, cur_labels = [], []
                continue

            if fields[0] == '-DOCSTART-':
                # Document separator line, carries no token.
                continue

            cur_tokens.append(fields[0])
            cur_labels.append(fields[-1])

    # The file may end without a trailing blank line; keep the last sentence.
    if cur_tokens:
        all_tokens.append(cur_tokens)
        all_labels.append(cur_labels)

    return all_tokens, all_labels

In [14]:
def get_unique_tags(allTags):
    """Return the set of distinct tags found across all sentences.

    Args:
        allTags: Iterable of per-sentence tag sequences (e.g. a list of
            lists of tag strings).

    Returns:
        A ``set`` containing every tag that occurs in at least one sentence;
        an empty set when ``allTags`` is empty.
    """
    # chain.from_iterable flattens lazily, so no intermediate flattened list
    # is built and the sentences are not unpacked into call arguments.
    return set(chain.from_iterable(allTags))

In [15]:
# Load the three dataset splits; eng.testa is used as the development set
# and eng.testb as the test set.
train_data, train_tags = read_conll_file('CoNLL2003_dataset/eng.train')
val_data, val_tags = read_conll_file('CoNLL2003_dataset/eng.testa')
test_data, test_tags = read_conll_file('CoNLL2003_dataset/eng.testb')

# Report how many sentences each split contains.
print(f"The number of sentences in the training data is {len(train_data)}.")
print(f"The number of sentences in the development data is {len(val_data)}.")
print(f"The number of sentences in the test data is {len(test_data)}.")

# Collect the tag inventory over all splits (train, then development, then test).
all_tags = set().union(
    *map(get_unique_tags, (train_tags, val_tags, test_tags))
)

# Show the combined tag set.
print(f"The set of NER tags are {all_tags}.")

The number of sentences in the training data is 14041.
The number of sentences in the development data is 3250.
The number of sentences in the test data is 3453.
The set of NER tags are {'O', 'B-ORG', 'I-ORG', 'B-MISC', 'B-LOC', 'I-MISC', 'I-LOC', 'I-PER'}.
