In [16]:
import semantic as s
import semantic_rule_set
import syntactic_and_semantic_rules

import copy

In [17]:
#setup
training_sentences_file = 'training.txt'
testing_sentences_file = 'testing.txt'
show_database = False
validate = False
gui = False

with open(training_sentences_file, 'r') as f:
    training_sentences = [x.strip() for x in f]
with open(testing_sentences_file, 'r') as f:
    testing_sentences = [x.strip() for x in f]

sem = semantic_rule_set.SemanticRuleSet()
sem = syntactic_and_semantic_rules.addLexicon(sem)

### Predicting the missing word

Our goal is take a sentence with a missing word -- 
 for example "Mary ate a \_", and replace the blank with a 'reasonable' replacement word.
This task is something that people do everyday, especially in a noisy setting or when speaking to someone with a thick accent. 
However, this task goes beyond just syntactic validity, since few people would guess that the sentence was 'Mary ate a laptop'. We need to inject some notion of semantics. However, we cannot just naively apply methods from WordNet, because we are missing the word that fits in the blank. Most WordNet methods are ways of mapping from one word to other words that are related in a particular way, eg hypernymy or synonymy. 

So in order to move forward, we should first come up with a way of distinguishing valid sentences in a way that lets us generate a missing word. We decided to adopt a model of language learning that is very similar to the notion of near-miss learning. We take a set of training sentences, use software from lab 3 to convert them into event structures, and group event structures together in a way that lets us generalize semantically valid sentences from our training data. This means that we are assuming that the training data is semantically valid. 

Below, we will experiment with different grouping and generalizationg strategies in order to determine semantically valid replacements for a missing word.

### Generate Event Structures
First we generate event structures from sentences, which we store in a list of dictionaries for simplicity.

In [18]:
print training_sentences
events = map(lambda sent: s.sentenceToEventDict(sem, sent), training_sentences)
for e in events:
    print e

['John ate the potato', 'John ate the tomato', 'Mary ate the tomato']
{'action': 'eat', 'patient': 'potato', 'tense': 'past', 'agent': 'John'}
{'action': 'eat', 'patient': 'tomato', 'tense': 'past', 'agent': 'John'}
{'action': 'eat', 'patient': 'tomato', 'tense': 'past', 'agent': 'Mary'}


### Simplest Strategy: No Grouping

In [19]:
#Parse each sentence in training data
def train(sem, sentences, groupEvents):
    event_list = []
    for sentence in sentences:
        try:
           new_event_dict = s.sentenceToEventDict(sem, sentence)
           event_list = groupEvents(event_list, new_event_dict)
        except Exception as e:
            # The parser did not return any parse trees.
            raise
    return event_list

def keepSeparate(event_list, new_event_dict):
    return event_list + [new_event_dict]

event_groupings = train(sem, training_sentences, keepSeparate)
for g in event_groupings:
    print g

{'action': 'eat', 'patient': 'potato', 'tense': 'past', 'agent': 'John'}
{'action': 'eat', 'patient': 'tomato', 'tense': 'past', 'agent': 'John'}
{'action': 'eat', 'patient': 'tomato', 'tense': 'past', 'agent': 'Mary'}


### One Difference Groupings
This is a very conservative form of grouping. If two training sentences the same event structure but differ across one feature, we group together the values of that feature.

In [39]:
def groupIfOneDiff(event_list, new_event): #if different structure, do not match
    #maybe only do after reaching a certain size
    new_event_list = copy.deepcopy(event_list)
    merged = False
    #try merging in
    for i in range(len(event_list)): #try to match with event_list[i]
        event = event_list[i]
        if set(event.keys()) == set(new_event.keys()):
            unequal_count = 0
            for feat in event.keys():
                if new_event[feat] not in event[feat]:
                    unequal_feat = feat
                    unequal_count += 1
            if unequal_count == 0: merged = True
            elif unequal_count == 1: #merge into previous
                new_event_list[i][unequal_feat].add(new_event[unequal_feat])
                merged = True
    #make new spot
    if not merged:
        new_event_list.append({k:set([v]) for k,v in new_event.iteritems()})
    return new_event_list

event_groupings = train(sem, training_sentences, groupIfOneDiff)
for e in events:
    print e
for g in event_groupings:
    print g

{'action': 'eat', 'patient': 'tomato', 'tense': 'past', 'agent': 'Mary'}
{'action': 'eat', 'patient': 'potato', 'tense': 'past', 'agent': 'John'}
{'action': 'eat', 'patient': 'tomato', 'tense': 'past', 'agent': 'John'}
{'action': set(['eat']), 'tense': set(['past']), 'patient': set(['tomato', 'potato']), 'agent': set(['John', 'Mary'])}


Note that the above only produces one output grouping (as opposed to two groupings, composed of sentences (1,2) and (2,3)).
This is because we are applying the groupings iteratively. We loop through the events, and compare the current event with the groupings that we have collected up to that point. The comparison in this case is not checking for equality between values of a common feature, but rather it is checking for inclusion of the current event's feature values within groupings of that feature.
```python
for feat in event.keys():
    if new_event[feat] not in event[feat]:
        unequal_feat = feat
```
This part of the previous method demonstrates this inclusion checking.

This implies that the same training sentences, in different orders, can lead to different event groupings.

In [40]:
def rotate(lst): 
    return [lst[-1]] + lst[:-1]
print training_sentences
print training_sentences[::-1]
event_groupings_1 = train(sem, training_sentences, groupIfOneDiff)
event_groupings_2 = train(sem, rotate(training_sentences), groupIfOneDiff)

print event_groupings_1 #creates 1 group
print event_groupings_2 #creates 2 groups

['John ate the potato', 'John ate the tomato', 'Mary ate the tomato']
['Mary ate the tomato', 'John ate the tomato', 'John ate the potato']
[{'action': set(['eat']), 'tense': set(['past']), 'patient': set(['tomato', 'potato']), 'agent': set(['John', 'Mary'])}]
[{'action': set(['eat']), 'tense': set(['past']), 'patient': set(['tomato']), 'agent': set(['John', 'Mary'])}, {'action': set(['eat']), 'patient': set(['tomato', 'potato']), 'tense': set(['past']), 'agent': set(['John'])}]


This grouping pattern is related to near-miss learning:

The reason that event_groupings_2 creates 2 instead of 1 grouping is that the 1st and 2nd sentence differ from each other is 2 ways, instead of just 1.
    
But maybe this is a bit too conservative of an assumption. Alternately we could try grouping when seeing two differences.

In [42]:
def groupIfOneOrTwoDiffs(event_list, new_event): #if different structure, do not match
    #maybe only do after reaching a certain size
    new_event_list = copy.deepcopy(event_list)
    merged = False
    #try merging in
    for i in range(len(event_list)): #try to match with event_list[i]
        event = event_list[i]
        if set(event.keys()) == set(new_event.keys()):
            unequal_count = 0
            for feat in event.keys():
                if new_event[feat] not in event[feat]:
                    if unequal_count == 0:
                        unequal_feat_1 = feat
                    if unequal_count == 1:
                        unequal_feat_2 = feat
                    unequal_count += 1
            if unequal_count == 0: merged = True
            elif unequal_count == 1: #merge into previous
                new_event_list[i][unequal_feat_1].add(new_event[unequal_feat_1])
                merged = True
            elif unequal_count == 2: #merge into previous
                new_event_list[i][unequal_feat_1].add(new_event[unequal_feat_1])
                new_event_list[i][unequal_feat_2].add(new_event[unequal_feat_2])
                merged = True
    #make new spot
    if not merged:
        new_event_list.append({k:set([v]) for k,v in new_event.iteritems()})
    return new_event_list

events = map(lambda sent: s.sentenceToEventDict(sem, sent), rotate(training_sentences))
for e in events:
    print e
event_groupings = train(sem, rotate(training_sentences), groupIfOneOrTwoDiffs)
print event_groupings

{'action': 'eat', 'patient': 'tomato', 'tense': 'past', 'agent': 'Mary'}
{'action': 'eat', 'patient': 'potato', 'tense': 'past', 'agent': 'John'}
{'action': 'eat', 'patient': 'tomato', 'tense': 'past', 'agent': 'John'}
[{'action': set(['eat']), 'tense': set(['past']), 'patient': set(['tomato', 'potato']), 'agent': set(['John', 'Mary'])}]


### Filling In the Word Blank
Now that we have come up with some event grouping strategies, we return to the original goal of our project -- to fill in the missing word.

The reason for the groupings above, is that we would like to take training event structures like:
```python
['John ate the potato', 'John ate the tomato', 'Mary ate the tomato']
```
And conclude that:
```python
['Mary ate the potato']
```
is a valid sentence.

In [46]:
def checkGoodSentence(sem, sentence, event_list):
    event = s.sentenceToEventDict(sem, sentence)
    if not event: return False
    for event_group in event_list:
        if set(event.keys()) == set(event_group.keys()):
            if all([event[k] in event_group[k] for k in event.keys()]):
                return True
    return False

sentence = 'Mary ate the potato'
event_groupings = train(sem, training_sentences, groupIfOneDiff)
event = s.sentenceToEventDict(sem, sentence)
print event
print event_groupings

print checkGoodSentence(sem, sentence, event_groupings)

{'action': 'eat', 'patient': 'potato', 'tense': 'past', 'agent': 'Mary'}
[{'action': set(['eat']), 'tense': set(['past']), 'patient': set(['tomato', 'potato']), 'agent': set(['John', 'Mary'])}]
True


What the above means is that the grouping structure that we have generated 'accepts' the sentence 'Mary ate the potato' after being trained on the 3 sentences above, which is exactly what we were looking for!

However, we want to be able to hypothesize that 'potato' is a good word to fill in for 'Mary ate the \_'. So instead of starting with 'Mary ate the potato', let's start with 'Mary ate the \_', and check the semantic validity of every word in the lexicon.

In [77]:
def test(sem, sentences, event_list):
    results = {}
    guess_words = s.getTerminals(sem) #all words in the lexicon
    for sentence in sentences:
        words = sentence.split()
        filler_i = words.index('_')
        without_filler = words[:filler_i] + words[filler_i+1:] #Mary ate the _
        good_hypotheses = []
        for guess_word in guess_words:
            all_words = without_filler[:filler_i] + [guess_word] + without_filler[filler_i:]
            guess_sentence = ' '.join(all_words)
            if checkGoodSentence(sem, guess_sentence, event_list):
                good_hypotheses.append(guess_sentence)
        results[sentence] = good_hypotheses
    return results

results = test(sem, ['Mary ate the _'], event_groupings)
print results

{'Mary ate the _': ['Mary ate the potato', 'Mary ate the tomato']}


This is doing what we want, but ideally we would like to extend our groupings to more than just the lexicon that we have written down.

This is where we can use wordnet to generalize our results.

In [78]:
from nltk.corpus import wordnet as wn

def flatten(lst):
    out = []
    for x in lst:
        if type(x) is list:
            out.extend(flatten(x))
        else:
            out.append(x)
    return out

filler_words = map(lambda x:x.split()[-1], results['Mary ate the _'])
synonyms = flatten(map(lambda w: map(lambda x: x.lemma_names(), wn.synsets(w)), filler_words))

sents = ['Mary ate the '+syn for syn in synonyms]
for x in sents: print x

Mary ate the potato
Mary ate the white_potato
Mary ate the Irish_potato
Mary ate the murphy
Mary ate the spud
Mary ate the tater
Mary ate the potato
Mary ate the white_potato
Mary ate the white_potato_vine
Mary ate the Solanum_tuberosum
Mary ate the tomato
Mary ate the tomato
Mary ate the love_apple
Mary ate the tomato_plant
Mary ate the Lycopersicon_esculentum


In [81]:
results = test(sem, ['Mary _ the tomato'], event_groupings)
filler_words = map(lambda x:x.split()[1], results['Mary _ the tomato'])
synonyms = flatten(map(lambda w: map(lambda x: x.lemma_names(), wn.synsets(w)), filler_words))

sents = ['Mary '+syn+' the tomato' for syn in synonyms]
for x in sents: print x

Mary Ate the tomato
Mary eat the tomato
Mary eat the tomato
Mary feed the tomato
Mary eat the tomato
Mary eat the tomato
Mary eat_on the tomato
Mary consume the tomato
Mary eat_up the tomato
Mary use_up the tomato
Mary eat the tomato
Mary deplete the tomato
Mary exhaust the tomato
Mary run_through the tomato
Mary wipe_out the tomato
Mary corrode the tomato
Mary eat the tomato
Mary rust the tomato
Mary Ate the tomato
Mary eat the tomato
Mary eat the tomato
Mary feed the tomato
Mary eat the tomato
Mary eat the tomato
Mary eat_on the tomato
Mary consume the tomato
Mary eat_up the tomato
Mary use_up the tomato
Mary eat the tomato
Mary deplete the tomato
Mary exhaust the tomato
Mary run_through the tomato
Mary wipe_out the tomato
Mary corrode the tomato
Mary eat the tomato
Mary rust the tomato


Above we took each candidate word, and took the union of all synsets of those words. However, this may over-generate, since some synsets of a particular noun might be verbs. We see above that the problem is especially acute for verbs, where we lose subject-verb agreement.

solns -- hit with nltk parser afterwards, or be picky with wordnet category (info in lemmas)

In [None]:
#1 -- change different words
#2 -- look at different wordnet schemes

# potentially filter by part of sentence when expanding synsets of lemmas