In [1]:
import os
import json
import argparse
from collections import Counter

In [2]:
#parser = argparse.ArgumentParser()
#parser.add_argument('--data_dir', default = 'Data', help = 'Dataset directory')

In [3]:
# Token added to the word vocabulary to stand for padding positions.
PAD_WORD = "<pad>"
# Label paired with padding; 'O' is also a label that occurs in the data itself.
PAD_LABEL = "O"
# Token added to the word vocabulary; presumably stands in for words missing
# from the vocabulary downstream -- confirm against the consumer of words.txt.
UNKNOWN_WORD = "UNK"

In [4]:
def update_vocabulary(file_path, vocab):
    """Add every token found in a text file to ``vocab`` and count its lines.

    Each line is stripped of surrounding whitespace and split on single
    spaces; the resulting tokens are passed to ``vocab.update``.

    Args:
        file_path: Path of the text file to read.
        vocab: Object exposing ``update(iterable)`` (the notebook passes a
            ``collections.Counter``). It is modified in place.

    Returns:
        The number of lines read from the file, ``0`` when the file is empty.
    """
    # Pre-initialised so an empty file yields 0 instead of raising
    # UnboundLocalError on the return statement.
    line_count = 0
    with open(file_path) as f:
        for line_count, line in enumerate(f, start=1):
            vocab.update(line.strip().split(' '))
    return line_count

In [5]:
def save_vocabulay_to_text(vocab, file_path):
    """Write the tokens of ``vocab`` to a text file, one token per line.

    Args:
        vocab: Iterable of string tokens; they are written in iteration order.
        file_path: Destination path. It is opened in ``'w'`` mode, so any
            existing content is replaced.
    """
    with open(file_path, 'w') as out_file:
        out_file.writelines(entry + '\n' for entry in vocab)

In [6]:
def save_dict_to_json(di, file_path):
    """Serialize a dictionary to a JSON file indented with 4 spaces.

    Args:
        di: Dictionary to save. A shallow copy of its items is what gets
            dumped, so ``di`` itself is not modified.
        file_path: Destination path. It is opened in ``'w'`` mode, so any
            existing content is replaced.
    """
    with open(file_path, 'w') as out_file:
        snapshot = dict(di.items())
        json.dump(snapshot, out_file, indent=4)

In [7]:
current_directory = os.getcwd()
print('Building word vocabulary')
words = Counter()
# Feed the train, validation and test sentence files into the same Counter;
# each call returns the number of lines in that file.
size_train_sentences, size_validation_sentences, size_test_sentences = [
    update_vocabulary(os.path.join(current_directory, relative_path), words)
    for relative_path in (
        'Data/Train/sentences.txt',
        'Data/Validation/sentences.txt',
        'Data/Test/sentences.txt',
    )
]

Building word vocabulary


In [8]:
print('Building label vocabulary')
labels = Counter()
# Feed the train, validation and test label files into the same Counter;
# each call returns the number of lines in that file.
size_train_labels, size_validation_labels, size_test_labels = [
    update_vocabulary(os.path.join(current_directory, relative_path), labels)
    for relative_path in (
        'Data/Train/labels.txt',
        'Data/Validation/labels.txt',
        'Data/Test/labels.txt',
    )
]

Building label vocabulary


In [9]:
# One row per split (train, validation, test): sentence line count, then
# label line count. The two numbers in a row should match.
print(
    f'{size_train_sentences} {size_train_labels}',
    f'{size_validation_sentences} {size_validation_labels}',
    f'{size_test_sentences} {size_test_labels}',
    sep='\n',
)

33570 33570
7194 7194
7194 7194


In [10]:
# Inspect the word Counter: it maps each token to the number of times it
# occurs across the train, validation and test sentence files.
words

Counter({'Thousands': 114,
         'of': 26354,
         'demonstrators': 110,
         'have': 5485,
         'marched': 65,
         'through': 515,
         'London': 261,
         'to': 23212,
         'protest': 237,
         'the': 52572,
         'war': 720,
         'in': 26323,
         'Iraq': 1738,
         'and': 19936,
         'demand': 220,
         'withdrawal': 154,
         'British': 637,
         'troops': 1195,
         'from': 4539,
         'that': 6301,
         'country': 1925,
         '.': 47761,
         'Families': 6,
         'soldiers': 757,
         'killed': 2861,
         'conflict': 245,
         'joined': 116,
         'protesters': 197,
         'who': 1919,
         'carried': 222,
         'banners': 11,
         'with': 5381,
         'such': 408,
         'slogans': 36,
         'as': 4106,
         '"': 3686,
         'Bush': 976,
         'Number': 1,
         'One': 166,
         'Terrorist': 4,
         'Stop': 3,
         'Bombings': 4,
  

In [11]:
# Inspect the label Counter: it maps each tag to the number of times it
# occurs across the train, validation and test label files.
labels

Counter({'O': 887901,
         'B-geo': 37644,
         'B-gpe': 15869,
         'B-per': 16990,
         'I-geo': 7414,
         'B-org': 20143,
         'I-org': 16784,
         'B-tim': 20333,
         'B-art': 402,
         'I-art': 297,
         'I-per': 17251,
         'I-gpe': 198,
         'I-tim': 6528,
         'B-nat': 201,
         'B-eve': 308,
         'I-eve': 253,
         'I-nat': 51})

In [12]:
# Minimum-frequency filter: an entry is kept only if it occurs at least this
# many times. With both thresholds at 1 nothing is dropped.

min_word_count = 1
min_label_count = 1

# From here on `words` and `labels` are plain lists of the surviving keys,
# no longer Counters.
words = [token for token in words if words[token] >= min_word_count]
labels = [tag for tag in labels if labels[tag] >= min_label_count]

In [13]:
# Make sure the padding word and the padding label are present in their
# respective vocabularies, adding them only when missing.
if PAD_WORD not in words:
    words.append(PAD_WORD)
if PAD_LABEL not in labels:
    labels.append(PAD_LABEL)

In [14]:
# Add the unknown-word token exactly once. The guard mirrors the padding-token
# checks and prevents a duplicate vocabulary entry when the corpus already
# contains the token or when this cell is run more than once.
if UNKNOWN_WORD not in words:
    words.append(UNKNOWN_WORD)

In [15]:
# Write both vocabularies to text files under Data/, one entry per line.

print('Dumping files')
save_vocabulay_to_text(
    vocab=words,
    file_path=os.path.join(current_directory, 'Data/words.txt'),
)
save_vocabulay_to_text(
    vocab=labels,
    file_path=os.path.join(current_directory, 'Data/labels.txt'),
)

Dumping files


In [16]:
# Dataset summary: split sizes (taken from the sentence files), vocabulary and
# tag counts, and the special tokens. Saved as JSON in the working directory.

parameter = dict(
    train_size=size_train_sentences,
    validation_size=size_validation_sentences,
    test_size=size_test_sentences,
    vocabulary_size=len(words),
    number_of_tags=len(labels),
    pad_word=PAD_WORD,
    pad_tag=PAD_LABEL,
    unknown_word=UNKNOWN_WORD,
)

print('Saving parametes of dataset')
save_dict_to_json(
    di=parameter,
    file_path=os.path.join(current_directory, 'dataset_parameters.json'),
)

Saving parametes of dataset
