In [8]:
# List the contents of the syntaxnet tagger data directory.
!ls "/Users/simon.hughes/data/tensorflow/syntaxnet/tagger"

[1m[36mcb[m[m [1m[36msc[m[m


In [1]:
# Sub-folder of the dataset root that is loaded as the training essays
# (switch to "Test" to load the test essays in its place).
target_folder = "Training"

# Files written for the syntaxnet tagger. All three live in the same
# directory, so it is spelled out once instead of being repeated per file.
tagger_dir              = "/Users/simon.hughes/data/tensorflow/syntaxnet/tagger/cb/"
training_file           = tagger_dir + "training.cnll"
tuning_file             = tagger_dir + "tuning.cnll"
test_file               = tagger_dir + "test.cnll"

In [2]:
from BrattEssay import load_bratt_essays
from collections import defaultdict
from IterableFP import flatten
from Settings import Settings

# Locate the CoralBleaching thesis dataset under the configured data directory.
settings = Settings()
root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
# Both folders sit directly under the dataset root; `target_folder` selects
# the one used for training.
training_folder, test_folder = (
    root_folder + name + "/" for name in (target_folder, "Test")
)

# Load the essays: the training folder first, then the test folder.
essays, test_essays = load_bratt_essays(training_folder), load_bratt_essays(test_folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
(902, 'files found')
902 essays processed
(226, 'files found')
226 essays processed


In [3]:
from collections import defaultdict

def process_essays(essays):
    """Split essays into parallel word and tag sequences and count frequencies.

    Args:
        essays: iterable of objects exposing ``tagged_sentences``; each
            sentence is a sequence of ``(word, tags)`` pairs where ``tags``
            is an iterable of tag labels.

    Returns:
        A 4-tuple ``(sentences, sent_tags, wd_freq, tag_freq)``.
        ``sentences`` holds one list of words per sentence, ``sent_tags`` the
        matching list of per-word tag collections (the original objects, not
        copies), and ``wd_freq`` / ``tag_freq`` are ``defaultdict(int)``
        counters of word and tag occurrences.
    """
    sentences, sent_tags = [], []
    wd_freq, tag_freq = defaultdict(int), defaultdict(int)
    for essay in essays:
        for tagged_sentence in essay.tagged_sentences:
            words, tag_sets = [], []
            for wd, word_tags in tagged_sentence:
                words.append(wd)
                tag_sets.append(word_tags)
                wd_freq[wd] += 1
                # A word may carry several tags; each one is counted.
                for tag in word_tags:
                    tag_freq[tag] += 1
            sentences.append(words)
            sent_tags.append(tag_sets)
    return sentences, sent_tags, wd_freq, tag_freq
# Training essays: keep the frequency tables as well as the sequences.
sentences, sent_tags, wd_freq, tag_freq = process_essays(essays)
# Test essays: only the word and tag sequences are needed. The result is
# sliced rather than unpacked into `_`, because rebinding `_` in IPython
# stops it from tracking the last cell output.
test_sentences, test_tags = process_essays(test_essays)[:2]

In [4]:
# "Regular" tags are those whose label starts with a digit. Slicing with
# `t[:1]` instead of indexing `t[0]` keeps an empty label from raising
# IndexError; it is simply left out.
regular_tags = {t for t in tag_freq if t[:1].isdigit()}
regular_tags

{'1', '11', '12', '13', '14', '2', '3', '4', '5', '50', '5b', '6', '7'}

In [5]:
def to_reg_tags(sent_tags):
    """Restrict every word's tags to the regular tags.

    Args:
        sent_tags: one sequence per sentence, each holding one iterable of
            tag labels per word.

    Returns:
        A list with the same nesting: per sentence, a list with one set per
        word that contains only the labels present in the module-level
        ``regular_tags``.
    """
    return [
        [set(t for t in tag_set if t in regular_tags) for tag_set in tag_seq]
        for tag_seq in sent_tags
    ]
# Reduce the training and test tag sequences to their regular tags.
regular_tag_seq = to_reg_tags(sent_tags)
test_regular_tag_seq = to_reg_tags(test_tags)

In [6]:
def to_most_freq_tags(regular_tag_seq):
    """Collapse each word's tag set to a single label.

    For every word, the tags that appear in the module-level ``regular_tags``
    are ranked by their count in the module-level ``tag_freq`` and the most
    frequent one is kept. Words with no regular tag get the string "None".

    Ties in frequency are broken by the label itself (the greater string
    wins), so the result does not depend on set iteration order.

    Args:
        regular_tag_seq: one sequence per sentence, each holding one iterable
            of tag labels per word.

    Returns:
        A list with one list per sentence, holding one label string per word.
    """
    None_Tag = "None"
    most_freq_tags = []
    # for each sentence
    for tag_seq in regular_tag_seq:
        most_freq = []
        # for each set of tags in sent
        for tag_set in tag_seq:
            candidates = [t for t in tag_set if t in regular_tags]
            if candidates:
                # (frequency, label) makes the winner unique even on equal counts.
                most_freq.append(max(candidates, key=lambda t: (tag_freq[t], t)))
            else:
                most_freq.append(None_Tag)
        most_freq_tags.append(most_freq)
    return most_freq_tags

# One label per word for the training and test sentences.
most_freq_tags = to_most_freq_tags(regular_tag_seq)
test_most_freq_tags = to_most_freq_tags(test_regular_tag_seq)

In [7]:
# Each of these lists is built with one entry per training sentence, so the three counts should match.
len(sentences), len(most_freq_tags), len(regular_tag_seq)

(8292, 8292, 8292)

In [8]:
# zip() stops at the shortest input, so a missing sentence would go unnoticed
# in the loops below; check the sentence counts explicitly first.
assert len(sentences) == len(most_freq_tags) == len(regular_tag_seq), \
    "Training sentence counts differ"
assert len(test_sentences) == len(test_most_freq_tags) == len(test_regular_tag_seq), \
    "Test sentence counts differ"

# Every sentence must have exactly one most-frequent tag and one regular tag
# set per word, in both the training and the test data.
for ix, (sent, mfreq_tags, reg_tags) in enumerate(zip(sentences, most_freq_tags, regular_tag_seq)):
    assert len(sent) == len(mfreq_tags) == len(reg_tags), "Lengths differ at %i" % ix
for ix, (sent, mfreq_tags, reg_tags) in enumerate(zip(test_sentences, test_most_freq_tags, test_regular_tag_seq)):
    assert len(sent) == len(mfreq_tags) == len(reg_tags), "Lengths differ at %i" % ix

In [21]:
def write_to_file(fname, sentences, tags):
    """Write sentences and their tags to ``fname`` as tab-separated CoNLL rows.

    One row is written per word with ten columns: the 1-based position of the
    word within its sentence, the word, and the tag in the fifth column; all
    remaining columns are "_". A blank line follows every sentence.

    Args:
        fname: path of the file to create or overwrite.
        sentences: sequence of sentences, each a sequence of words.
        tags: sequence parallel to ``sentences`` with one tag string per word.
            A falsy tag is written as "None"; any other tag is written with a
            "Code_" prefix.
    """
    # see here for CONLL format (needs at least 8 tab sep cols), hyphen for unknown
    #    http://ufal.mff.cuni.cz/conll2009-st/task-description.html
    # see here for explanation of coarse and fine tags:
    #    http://mwetoolkit.sourceforge.net/PHITE.php?sitesig=MWE&page=MWE_070_File_types&subpage=MWE_010_CONLL
    # Column 4 below is coarse POS tag, column 5 is fine grained
    row = "{id}\t{word}\t{dash}\t{dash}\t{pos}\t{dash}\t{dash}\t{dash}\t{dash}\t{dash}\n"
    with open(fname, "w+") as f:
        for sent, tag_seq in zip(sentences, tags):
            # Word ids restart at 1 for every sentence.
            for i, (wd, t) in enumerate(zip(sent, tag_seq), 1):
                # NOTE(review): only falsy tags take the "None" branch; the
                # string "None" is truthy and is written as "Code_None".
                # Confirm that is the intended label for untagged words.
                tag = "Code_" + t if t else "None"
                f.write(row.format(id=i, word=wd, dash="_", pos=tag))
            f.write("\n")

In [22]:
# Hold out the last `test_split` fraction of the training sentences; that
# tail is what gets written to the tuning file.
test_split = 0.2
num_test = int(test_split * len(sentences))
num_train = len(sentences) - num_test
train_sent, test_sent = sentences[:num_train], sentences[num_train:]
# NOTE(review): this rebinds `test_tags`, which until here held the tags
# returned by process_essays(test_essays). Re-running the earlier
# to_reg_tags(test_tags) cell after this one would read the wrong data;
# restart the kernel and run top to bottom instead.
train_tags, test_tags = most_freq_tags[:num_train], most_freq_tags[num_train:]

In [23]:
# Leading part of the training sentences.
write_to_file(training_file, train_sent, train_tags)
# Held-out tail of the training sentences (bound to test_sent / test_tags) goes to the tuning file.
write_to_file(tuning_file,   test_sent,  test_tags)
# Sentences from the separately loaded test essays.
write_to_file(test_file,     test_sentences, test_most_freq_tags)