In [1]:
# Import libraries and define constants

import csv
import os

data_file = './Data/ner_dataset.csv'

In [2]:
# Reading CSV file

def read_csv(file_name):
    """
    Function output should be of this format-
    [John, lives, in, New, York] [B-PER, O, O, B-LOC, I-LOC]
    """
    with open(file_name, 'r', encoding = 'ISO-8859-1') as file:
        csv_reader = csv.reader(file)
        header = next(csv_reader)
        
        data = []
        word_list, tag_list = [], []
        
        for line in csv_reader:
            sentence_number, word, pos, tag = line
            
            # Non-empty sentence shows that new sentence is started
            if len(sentence_number) != 0:
                if len(word_list) != 0:
                    data.append((word_list, tag_list))
                    word_list, tag_list = [], []
            
            word_list.append(str(word))
            tag_list.append(str(tag))

    return data

In [3]:
# Save new dataset files

def save_dataset_files(new_data, new_data_path):
    # Create file paths
    sentence_path = os.path.join(new_data_path, 'sentences.txt')
    label_path = os.path.join(new_data_path, 'labels.txt')
    
    # Check stale file
    if os.path.exists(sentence_path):
        os.remove(sentence_path)
    if os.path.exists(label_path):
        os.remove(label_path)
    
    # Write files
    with open(sentence_path, 'w') as file_sentences:
        with open(label_path, 'w') as file_labels:
            for words, labels in new_data:
                file_sentences.write("{}\n".format(" ".join(words)))
                file_labels.write("{}\n".format(" ".join(labels)))
    
    print('Saved files: {} & {}'.format(sentence_path, label_path))

In [4]:
# Main Data-Pipeline

print('\nStart: reading file')
data = read_csv(data_file)
size_of_data = len(data)
print('Data size (Lines): ', size_of_data)
print('Sample data: \n', data[0])
print('End: reading file')

print('\nSplitting data into train, validation & test data')
train_data = data[:int(0.7 * size_of_data)]
validation_data = data[int(0.7 * size_of_data):int(0.85 * size_of_data)]
test_data = data[int(0.85 * size_of_data):]

print('\nStart: saving new data files')
save_dataset_files(train_data, './Data/Train')
save_dataset_files(validation_data, './Data/Validation')
save_dataset_files(test_data, './Data/Test')
print('End: saving new data files')


Start: reading file
Data size (Lines):  47958
Sample data: 
 (['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.'], ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O'])
End: reading file

Splitting data into train, validation & test data

Start: saving new data files
Saved files: ./Data/Train/sentences.txt & ./Data/Train/labels.txt
Saved files: ./Data/Validation/sentences.txt & ./Data/Validation/labels.txt
Saved files: ./Data/Test/sentences.txt & ./Data/Test/labels.txt
End: saving new data files
