In [1]:
import Settings
from model_store import ModelStore
from window_based_tagger_config import get_config
from processessays import process_essays, build_spelling_corrector
from nltk.tokenize import sent_tokenize
from collections import defaultdict
from BrattEssay import Essay, load_bratt_essays

from featureextractortransformer import FeatureExtractorTransformer
from sent_feats_for_stacking import *
from load_data import load_process_essays_without_annotations

from featureextractionfunctions import *
from wordtagginghelper import *

from traceback import format_exc

import logging

def onlyascii(s):
    out = ""
    for char in s:
        if ord(char) > 127:
            out += ""
        else:
            out += char
    return out

In [145]:
class Annotator(object):

    def __init__(self, models_folder, temp_folder, essays_folder):

        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        if not models_folder.endswith("/"):
            models_folder += "/"
        if not temp_folder.endswith("/"):
            temp_folder += "/"
        if not essays_folder.endswith("/"):
            essays_folder += "/"

        self.logger = logging.getLogger()
        self.temp_folder = temp_folder
        cfg = get_config(temp_folder)
        self.config = cfg
        self.essays_folder = essays_folder

        # Create spell checker
        # Need annotations here purely to load the tags
        tagged_essays = load_bratt_essays(essays_folder, include_vague=cfg["include_vague"], include_normal=cfg["include_normal"], load_annotations=True)
        self.__set_tags_(tagged_essays)
        self.wd_sent_freq = defaultdict(int)
        self.spelling_corrector = build_spelling_corrector(tagged_essays, self.config["lower_case"], self.wd_sent_freq)

        offset = (self.config["window_size"] - 1) / 2

        unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
        biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)

        extractors = [unigram_window_stemmed, biigram_window_stemmed]

        # most params below exist ONLY for the purposes of the hashing to and from disk
        self.feature_extractor = FeatureExtractorTransformer(extractors)

        # load models
        self.logger.info("Loading pickled models")
        store = ModelStore(models_folder=models_folder)

        self.feature_transformer =  store.get_transformer()
        self.logger.info("Loaded Transformer")
        self.tag_2_wd_classifier = store.get_tag_2_wd_classifier()
        self.logger.info("Loaded word tagging model")
        self.tag_2_sent_classifier = store.get_tag_2_sent_classifier()
        self.logger.info("Loaded sentence classifier")

    def __set_tags_(self, tagged_essays):

        MIN_TAG_FREQ = 5

        tag_freq = defaultdict(int)
        for essay in tagged_essays:
            for sentence in essay.tagged_sentences:
                un_tags = set()
                for word, tags in sentence:
                    for tag in tags:
                        if "5b" in tag:
                            continue
                        if      (tag[-1].isdigit() or tag in {"Causer", "explicit", "Result"} \
                                    or tag.startswith("Causer") or tag.startswith("Result") \
                                    or tag.startswith("explicit") or "->" in tag) \
                                and not ("Anaphor" in tag or "rhetorical" in tag or "other" in tag):
                            # if not ("Anaphor" in tag or "rhetorical" in tag or "other" in tag):
                            un_tags.add(tag)
                for tag in un_tags:
                    tag_freq[tag] += 1

        all_tags = list(tag_freq.keys())
        freq_tags = list(set((tag for tag, freq in tag_freq.items() if freq >= MIN_TAG_FREQ)))
        non_causal = [t for t in freq_tags if "->" not in t]
        only_causal = [t for t in freq_tags if "->" in t]

        CAUSE_TAGS = ["Causer", "Result", "explicit"]
        CAUSAL_REL_TAGS = [CAUSAL_REL, CAUSE_RESULT, RESULT_REL]  # + ["explicit"]

        """ works best with all the pair-wise causal relation codes """
        # Include all tags for the output
        self.wd_test_tags = list(set(all_tags + CAUSE_TAGS))

        # tags from tagging model used to train the stacked model
        self.sent_input_feat_tags = list(set(freq_tags + CAUSE_TAGS))
        # find interactions between these predicted tags from the word tagger to feed to the sentence tagger
        self.sent_input_interaction_tags = list(set(non_causal + CAUSE_TAGS))
        # tags to train (as output) for the sentence based classifier
        self.sent_output_train_test_tags = list(set(all_tags + CAUSE_TAGS + CAUSAL_REL_TAGS))

    def annotate(self, essay_text):

        try:
            sentences = sent_tokenize(essay_text.strip())
            contents = "\n".join(sentences)

            fname = self.temp_folder + "essay.txt"
            with open(fname, 'w"') as f:
                f.write(contents)

            essay = Essay(fname, include_vague=self.config["include_vague"],
                          include_normal=self.config["include_normal"], load_annotations=False)

            processed_essays = process_essays(essays=[essay],
                                              spelling_corrector=self.spelling_corrector,
                                              wd_sent_freq=self.wd_sent_freq,
                                              remove_infrequent=self.config["remove_infrequent"],
                                              spelling_correct=self.config["spelling_correct"],
                                              replace_nums=self.config["replace_nums"],
                                              stem=self.config["stem"],
                                              remove_stop_words=self.config["remove_stop_words"],
                                              remove_punctuation=self.config["remove_punctuation"],
                                              lower_case=self.config["lower_case"])

            self.logger.info("Essay loaded successfully")
            essays_TD = self.feature_extractor.transform(processed_essays)

            wd_feats, _ = flatten_to_wordlevel_feat_tags(essays_TD)
            xs = self.feature_transformer.transform(wd_feats)
            wd_predictions_by_code = test_classifier_per_code(xs, self.tag_2_wd_classifier, self.wd_test_tags)

            dummy_wd_td_ys_bytag = defaultdict(lambda: np.asarray([0.0] * xs.shape[0]))
            sent_xs, sent_ys_bycode = get_sent_feature_for_stacking_from_tagging_model(self.sent_input_feat_tags,
                                                                                             self.sent_input_interaction_tags,
                                                                                             essays_TD, xs,
                                                                                             dummy_wd_td_ys_bytag,
                                                                                             self.tag_2_wd_classifier,
                                                                                             sparse=True,
                                                                                             look_back=0)

            """ Test Stack Classifier """
            sent_predictions_by_code = test_classifier_per_code(sent_xs, self.tag_2_sent_classifier, self.sent_output_train_test_tags)

            return {"tagged_words":      self.__get_tagged_words_(essay, essays_TD[0], wd_predictions_by_code),
                    "tagged_sentences" : self.__get_tagged_sentences_(essay, sent_predictions_by_code)}
        except Exception as x:
            self.logger.exception("An exception occured while annotating essay")
            return {"error": format_exc()}
        pass

    def __is_tag_to_return_(self, tag):
        return tag[0].isdigit() or ("->" in tag and "Causer" in tag)

    def __friendly_tag_(self, tag):
        return tag.replace("Causer:", "").replace("Result:", "")

    def __get_regular_tags_(self, pred_tags):
        r_tags = sorted(filter(lambda t: t[0].isdigit() and "->" not in t, pred_tags),
                        key=lambda s: (int(s), s) if s.isdigit() else ((-1, s)))
        str_r_tags = ",".join(r_tags)
        return str_r_tags

    def __get_causal_tags_(self, pred_tags):
        c_tags = sorted(filter(lambda t: "->" in t, pred_tags), key=lambda s: int(s.split("->")[0]))
        str_c_tags = ",".join(c_tags)
        return str_c_tags

    def __get_tagged_sentences_(self, essay, sent_predictions_by_code):
        tagged_sents = []
        for i, sent in enumerate(essay.tagged_sentences):
            wds, _ = zip(*sent)
            str_sent = " ".join(wds)
            pred_tags = set()
            for tag, array in sent_predictions_by_code.items():
                if self.__is_tag_to_return_(tag):
                    if np.max(array) == 1:
                        pred_tags.add(self.__friendly_tag_(tag))

            str_r_tags = self.__get_regular_tags_(pred_tags)
            str_c_tags = self.__get_causal_tags_(pred_tags)

            tagged_sents.append((str_sent, str_r_tags, str_c_tags ))
        return tagged_sents

    def __fuzzy_match_(self, original, feat_wd):
        original = original.lower().strip()
        feat_wd = feat_wd.lower().strip()
        if original == feat_wd:
            return True
        if original[:3] == feat_wd[:3]:
            return True
        a = set(original)
        b = set(feat_wd)
        jaccard = float(len(a.intersection(b))) / float(len(a.union(b)))
        return jaccard >= 0.5

    def __align_wd_tags_(self, orig, feats):
        """
        Once processed, there may be a different number of words than in the original sentence
        Try and recover the tags for the original words by aligning the two using simple heuristics
        """
        if len(orig) < len(feats):
            raise Exception("align_wd_tags() : Original sentence is longer!")

        o_wds, _ = zip(*orig)
        feat_wds, new_tags = zip(*feats)

        if len(orig) == len(feats):
            return zip(o_wds, new_tags)

        #here orig is longer than feats
        diff = len(orig) - len(feats)
        tagged_wds = []
        feat_offset = 0
        while len(tagged_wds) < len(o_wds):
            i = len(tagged_wds)
            orig_wd = o_wds[i]
            print i, orig_wd

            if i >= len(feats):
                tagged_wds.append((orig_wd, new_tags[-1]))
                continue
            else:
                new_tag_ix = i - feat_offset
                feat_wd = feats[new_tag_ix][0]
                if feat_wd == "INFREQUENT" or feat_wd.isdigit():
                    tagged_wds.append((orig_wd, new_tags[new_tag_ix]))
                    continue

                new_tagged_wds = []
                found = False
                for j in range(i, i + diff + 1):
                    new_tagged_wds.append((o_wds[j], new_tags[new_tag_ix]))
                    next_orig_wd = o_wds[j]
                    if self.__fuzzy_match_(next_orig_wd, feat_wd):
                        found = True
                        tagged_wds.extend(new_tagged_wds)
                        feat_offset += len(new_tagged_wds) - 1
                        break
                if not found:
                    raise Exception("No matching word found for index:%i and processed word:%s" % (i, feat_wd))
        return tagged_wds

    def __get_tagged_words_(self, original_essay, essay_TD, wd_predictions_by_code):
        tagged_sents = []
        # should be a one to one correspondance between words in essays_TD[0] and predictions
        i = 0
        for sent_ix, sent in enumerate(essay_TD.sentences):
            tmp_tagged_wds = []
            for wix, (feat) in enumerate(sent):
                word = feat.word
                tags = set()
                for tag in wd_predictions_by_code.keys():
                    if wd_predictions_by_code[tag][i] > 0:
                        tags.add(tag)
                i += 1
                tmp_tagged_wds.append((word, tags))

            # Now allign the predicted tags with the original words
            wds, aligned_tags = zip(*self.__align_wd_tags_(original_essay.tagged_sentences[sent_ix], tmp_tagged_wds))
            fr_aligned_tags = map(lambda tags: set(map(self.__friendly_tag_, tags)), aligned_tags)
            tagged_words = zip(wds, fr_aligned_tags)
            tagged_sents.append(map(lambda (wd, tags): (wd, self.__get_regular_tags_(tags), self.__get_causal_tags_(tags)), tagged_words))
        return tagged_sents

In [147]:
import os
cwd = "/Users/simon.hughes/GitHub/NlpResearch/PythonNlpResearch/API"

settings = Settings.Settings()
folder = settings.data_directory + "CoralBleaching/BrattData/EBA1415_Merged/"

annotator = Annotator(models_folder= cwd +"/Models/CB/", temp_folder=cwd+"/temp/", essays_folder=folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/
1154 files found
Skipping /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/BrattData/EBA1415_Merged/EBA1415_AEKD_4_CB_ES-05571.ann file as .txt file is no essay'
Skipping /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/BrattData/EBA1415_Merged/EBA1415_AEKD_4_CB_ES-05904.ann file as .txt file is no essay'
Skipping /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/BrattData/EBA1415_Merged/EBA1415_BGJD_1_CB_ES-05733.ann file as .txt file is no essay //'
Skipping /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/BrattData/EBA1415_Merged/EBA1415_ERSK_7_CB_ES-05798.ann file as .txt file is no essay //'
Skipping /Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/BrattData/EBA1415_Merged/EBA1415_KYLS_5_CB_ES-05671.ann fi

In [148]:
essay_text = """
Corals are living animals in the ocean.
Corals live in one place and dont really move alot.
Some corals have white on them and that is called "coral bleaching."
Coral Bleaching means that the coral is unhealthy and is trusting into a white color.
Normal water tempatures that the coral live in are 70-80 degrees.
But some of the waters are too cool like 3 to 10 degrees F.
Corals are also affected by storms because corals rely on the amounts of salt in the waters.
So when it storms the water tempatures and levels of salt will be all mest up and bad for the coral.
The storms have to be very extreme to make corals sick or unhealthy.
In the water if the tempature increases the amounts of dioxide will drop and willmake the coral unhealthy.
The water tempatures coral usally build their reefs in are 70-85 degrees F.
So those are the tempature range to keep them healthy.
Corals and zooanthellae algae have a relatioship together.
Most zooanthellae can not live without outside the corals bodies.
It is because there isnt enough nutrience to have the ocean do photosynthesis.
The zooanthellae rely on the coral to stay healthy, but the coral can get physical damage.
Coral bleaching is a physical damage to the corals.
Coral bleaching is also an example how the envionmental stressors can affect the relationships between the coral and the algae. //
"""

In [149]:
d_annotations = d_annotations = annotator.annotate(essay_text)

In [150]:
for sent, r_tags, c_tags in d_annotations["tagged_sentences"]:
    print "\"" + sent+ "\"", r_tags, c_tags

"Corals are living animals in the ocean ." 3,4,11,13,50 3->4,11->14,11->3,11->50,11->13
"Corals live in one place and dont really move alot ." 3,4,11,13,50 3->4,11->14,11->3,11->50,11->13
"Some corals have white on them and that is called " coral bleaching . "" 3,4,11,13,50 3->4,11->14,11->3,11->50,11->13
"Coral Bleaching means that the coral is unhealthy and is trusting into a white color ." 3,4,11,13,50 3->4,11->14,11->3,11->50,11->13
"Normal water tempatures that the coral live in are 70 - 80 degrees ." 3,4,11,13,50 3->4,11->14,11->3,11->50,11->13
"But some of the waters are too cool like 3 to 10 degrees F ." 3,4,11,13,50 3->4,11->14,11->3,11->50,11->13
"Corals are also affected by storms because corals rely on the amounts of salt in the waters ." 3,4,11,13,50 3->4,11->14,11->3,11->50,11->13
"So when it storms the water tempatures and levels of salt will be all mest up and bad for the coral ." 3,4,11,13,50 3->4,11->14,11->3,11->50,11->13
"The storms have to be very extreme to make c

In [151]:
for sent in d_annotations["tagged_words"]:
    for wd, r_tags, c_tags in sent:
        print str((wd, r_tags, c_tags))
    print ""

('Corals', '', '')
('are', '', '')
('living', '', '')
('animals', '', '')
('in', '', '')
('the', '', '')
('ocean', '', '')
('.', '', '')

('Corals', '', '')
('live', '', '')
('in', '', '')
('one', '', '')
('place', '', '')
('and', '', '')
('dont', '', '')
('really', '', '')
('move', '', '')
('alot', '', '')
('.', '', '')

('Some', '50', '')
('corals', '50', '')
('have', '50', '')
('white', '50', '')
('on', '', '')
('them', '', '')
('and', '', '')
('that', '', '')
('is', '50', '')
('called', '', '')
('"', '', '')
('coral', '50', '')
('bleaching', '50', '')
('.', '', '')
('"', '', '')

('Coral', '50', '')
('Bleaching', '50', '')
('means', '', '')
('that', '', '')
('the', '', '')
('coral', '', '')
('is', '', '')
('unhealthy', '', '')
('and', '', '')
('is', '', '')
('trusting', '50', '')
('into', '50', '')
('a', '50', '')
('white', '50', '')
('color', '50', '')
('.', '', '')

('Normal', '', '')
('water', '', '')
('tempatures', '', '')
('that', '', '')
('the', '', '')
('coral', '', '')
('li

In [5]:
self = annotator
# expects a new line per sentence
sentences = sent_tokenize(essay_text.strip())
contents = "\n".join(sentences)

fname = self.temp_folder + "essay.txt"
with open(fname, 'w"') as f:
    f.write(contents)

essay = Essay(fname, include_vague=self.config["include_vague"], include_normal=self.config["include_normal"], load_annotations=False)
processed_essays = process_essays(essays=[essay],
                                  spelling_corrector=self.spelling_corrector,
                                  wd_sent_freq=self.wd_sent_freq,
                                  remove_infrequent=self.config["remove_infrequent"],
                                  spelling_correct=self.config["spelling_correct"],
                                  replace_nums=self.config["replace_nums"],
                                  stem=self.config["stem"],
                                  remove_stop_words=self.config["remove_stop_words"],
                                  remove_punctuation=self.config["remove_punctuation"],
                                  lower_case=self.config["lower_case"])

self.logger.info("Essay loaded successfully")
essays_TD = self.feature_extractor.transform(processed_essays)

td_feats, _ = flatten_to_wordlevel_feat_tags(essays_TD)
td_X = self.feature_transformer.transform(td_feats)
td_wd_predictions_by_code = test_classifier_per_code(td_X, self.tag_2_wd_classifier, self.wd_test_tags)

dummy_wd_td_ys_bytag = defaultdict(lambda: np.asarray([0.0] * td_X.shape[0]))
sent_td_xs, sent_td_ys_bycode = get_sent_feature_for_stacking_from_tagging_model(self.sent_input_feat_tags,
                                                                                 self.sent_input_interaction_tags,
                                                                                 essays_TD, td_X,
                                                                                 dummy_wd_td_ys_bytag,
                                                                                 self.tag_2_wd_classifier,
                                                                                 sparse=True,
                                                                                 look_back=0)

""" Test Stack Classifier """
td_sent_predictions_by_code \
    = test_classifier_per_code(sent_td_xs, self.tag_2_sent_classifier, self.sent_output_train_test_tags)

In [114]:
",".join([])

''

In [121]:
tagged_sents = []
for i, sent in enumerate(essay.tagged_sentences):
    wds, _ = zip(*sent)
    str_sent = " ".join(wds)
    pred_tags = set()
    for tag, array in td_sent_predictions_by_code.items():
        if tag[0].isdigit() or ("->" in tag and "Causer" in tag):
            if np.max(array) == 1:
                pred_tags.add(tag.replace("Causer:","").replace("Result:",""))
    r_tags = sorted(filter(lambda t: "->" not in t, pred_tags), key=lambda s: (int(s),s) if s.isdigit() else ((-1,s)))
    c_tags = sorted(filter(lambda t: "->" in t, pred_tags), key = lambda s: int(s.split("->")[0]))
    s_tags = ",".join(r_tags) + "," + ",".join(c_tags)
    tagged_sents.append((str_sent, ",".join(r_tags), ",".join(c_tags) ))
tagged_sents

[('Corals are living animals in the ocean .',
  '3,4,11,13,50',
  '3->4,11->14,11->3,11->50,11->13'),
 ('Corals live in one place and dont really move alot .',
  '3,4,11,13,50',
  '3->4,11->14,11->3,11->50,11->13'),
 ('Some corals have white on them and that is called " coral bleaching . "',
  '3,4,11,13,50',
  '3->4,11->14,11->3,11->50,11->13'),
 ('Coral Bleaching means that the coral is unhealthy and is trusting into a white color .',
  '3,4,11,13,50',
  '3->4,11->14,11->3,11->50,11->13'),
 ('Normal water tempatures that the coral live in are 70 - 80 degrees .',
  '3,4,11,13,50',
  '3->4,11->14,11->3,11->50,11->13'),
 ('But some of the waters are too cool like 3 to 10 degrees F .',
  '3,4,11,13,50',
  '3->4,11->14,11->3,11->50,11->13'),
 ('Corals are also affected by storms because corals rely on the amounts of salt in the waters .',
  '3,4,11,13,50',
  '3->4,11->14,11->3,11->50,11->13'),
 ('So when it storms the water tempatures and levels of salt will be all mest up and bad for the

In [8]:
td_wd_predictions_by_code["5"].shape

(262,)

In [9]:
print sum([len(sent) for sent in essay.tagged_sentences])
print sum([len(sent) for sent in processed_essays[0].sentences])
print sum([len(sent) for sent in essays_TD[0].sentences])

262
262
262


In [10]:
for ix, sent in enumerate(processed_essays[0].sentences):
    orig = essay.tagged_sentences[ix]
    orig_sent = " ".join(zip(*orig)[0])
    proc_sent = " ".join(zip(*sent)[0])
    print orig_sent
    print proc_sent
    print ""

Corals are living animals in the ocean .
Corals are living animals in the ocean .

Corals live in one place and dont really move alot .
Corals live in one place and dont really move alot .

Some corals have white on them and that is called " coral bleaching . "
Some corals have white on them and that is called " coral bleaching . "

Coral Bleaching means that the coral is unhealthy and is trusting into a white color .
Coral Bleaching means that the coral is unhealthy and is INFREQUENT into a white color .

Normal water tempatures that the coral live in are 70 - 80 degrees .
Normal water temperatures that the coral live in are 00 - 00 degrees .

But some of the waters are too cool like 3 to 10 degrees F .
But some of the waters are too cool like 0 to 00 degrees F .

Corals are also affected by storms because corals rely on the amounts of salt in the waters .
Corals are also affected by storms because corals rely on the amounts of salt in the waters .

So when it storms the water tempatu

In [30]:
tagged_wd_sents = []
#should be a one to one correspondance between words in essays_TD[0] and predictions
i = 0
for sent in essays_TD[0].sentences:
    tagged_wds = []
    for wix, (feat) in enumerate(sent):
        word = feat.word
        tags = set()
        for tag in td_wd_predictions_by_code.keys():
            if td_wd_predictions_by_code[tag][i] > 0:
                tags.add(tag)
        i+=1
        tagged_wds.append((word, tags))
    tagged_wd_sents.append(tagged_wds)
tagged_wd_sents

[[('Corals', set()),
  ('are', set()),
  ('living', set()),
  ('animals', set()),
  ('in', set()),
  ('the', set()),
  ('ocean', set()),
  ('.', set())],
 [('Corals', set()),
  ('live', set()),
  ('in', set()),
  ('one', set()),
  ('place', set()),
  ('and', set()),
  ('dont', set()),
  ('really', set()),
  ('move', set()),
  ('alot', set()),
  ('.', set())],
 [('Some', {'50'}),
  ('corals', {'50'}),
  ('have', {'50'}),
  ('white', {'50'}),
  ('on', set()),
  ('them', set()),
  ('and', set()),
  ('that', set()),
  ('is', {'50'}),
  ('called', set()),
  ('"', set()),
  ('coral', {'50'}),
  ('bleaching', {'50'}),
  ('.', set()),
  ('"', set())],
 [('Coral', {'50'}),
  ('Bleaching', {'50'}),
  ('means', set()),
  ('that', set()),
  ('the', set()),
  ('coral', set()),
  ('is', set()),
  ('unhealthy', set()),
  ('and', set()),
  ('is', set()),
  ('INFREQUENT', {'50'}),
  ('into', {'50'}),
  ('a', {'50'}),
  ('white', {'50'}),
  ('color', {'50'}),
  ('.', set())],
 [('Normal', set()),
  ('wa

In [103]:
#Once processed, there may be a different number of words than in the original sentence
#Try and recover the tags for the original words by aligning the two using simple heuristics
def fuzzy_match(original, feat_wd):
    original = original.lower().strip()
    feat_wd = feat_wd.lower().strip()
    if original == feat_wd:
        #print "\nMatch"
        return True
    if orig[:3] == feat_wd[:3]:
        #print "\n", orig[:3] , feat_wd[:3]
        return True
    a = set(original)
    b = set(feat_wd)
    jaccard = float(len(a.intersection(b))) / float(len(a.union(b)))
    #print "\nJaccard"
    return jaccard >= 0.5

def align_wd_tags(orig, feats):
    if len(orig) < len(feats):
        raise Exception("align_wd_tags() : Original sentence is longer!")

    o_wds,    _        = zip(*orig)
    feat_wds, new_tags = zip(*feats)

    if len(orig) == len(feats):
        return zip(o_wds, new_tags)

    #here orig is longer than feats
    diff = len(orig) - len(feats)
    tagged_wds = []
    feat_offset = 0
    while len(tagged_wds) < len(o_wds):
        i = len(tagged_wds)
        orig_wd = o_wds[i]
        print i, orig_wd

        if i >= len(feats):
            tagged_wds.append((orig_wd, new_tags[-1]))
            continue
        else:
            new_tag_ix = i - feat_offset
            feat_wd = feats[new_tag_ix][0]
            if feat_wd == "INFREQUENT" or feat_wd.isdigit():
                tagged_wds.append((orig_wd, new_tags[new_tag_ix]))
                continue

            new_tagged_wds = []
            found = False
            for j in range(i, i+diff+1):
                new_tagged_wds.append((o_wds[j], new_tags[new_tag_ix]))
                next_orig_wd = o_wds[j]
                if fuzzy_match(next_orig_wd, feat_wd):
                    found = True
                    tagged_wds.extend(new_tagged_wds)
                    feat_offset += len(new_tagged_wds) -1
                    break
            if not found:
                raise Exception("No matching word found for index:%i and processed word:%s" % (i, feat_wd))
    return tagged_wds

def test(a,b):
    print a, b, "\t", fuzzy_match(a,b)

In [101]:
test("frog", "frog")
test("frog", "froXXXX")
test("frogggy", "fr")
test("123456", "645")
test("123456", "aaa645")
test("123456xyz", "abc645231")

frog frog 	True
frog froXXXX 	True
frogggy fr 	False
123456 645 	True
123456 aaa645 	False
123456xyz abc645231 	True


In [105]:
def print_lst(tag_l):
    print "["
    for wd in zip(*tag_l)[0]:
        print "'%s'," % wd
    print "]"

orig = map(lambda wd: (wd, set()), "The plankton was causing the coral to be bleached".split(" "))
print_lst(orig)

[
'The',
'plankton',
'was',
'causing',
'the',
'coral',
'to',
'be',
'bleached',
]


In [106]:
tagged = [
#'The',
'INFREQUENT',
'was',
'causing',
#'the',
'coral',
'to',
'be',
'bleached',
]
tagged = map(lambda w: (w,set([w])), tagged)
aligned = align_wd_tags(orig, tagged)
aligned

0 The
1 plankton
3 causing
4 the
6 to
7 be
8 bleached


[('The', {'INFREQUENT'}),
 ('plankton', {'was'}),
 ('was', {'was'}),
 ('causing', {'causing'}),
 ('the', {'coral'}),
 ('coral', {'coral'}),
 ('to', {'to'}),
 ('be', {'bleached'}),
 ('bleached', {'bleached'})]