In [16]:
import semantic as s
import semantic_rule_set
import syntactic_and_semantic_rules

import copy

In [17]:
#setup
training_sentences_file = 'training.txt'
testing_sentences_file = 'testing.txt'
show_database = False
validate = False
gui = False

with open(training_sentences_file, 'r') as f:
    training_sentences = [x.strip() for x in f]
with open(testing_sentences_file, 'r') as f:
    testing_sentences = [x.strip() for x in f]

sem = semantic_rule_set.SemanticRuleSet()
sem = syntactic_and_semantic_rules.addLexicon(sem)

### Generate Event Structures
First we generate event structures from sentences, which we store in a list of dictionaries for simplicity.

In [18]:
print training_sentences
events = map(lambda sent: s.sentenceToEventDict(sem, sent), training_sentences)
for e in events:
    print e

['John ate the potato', 'John ate the tomato', 'Mary ate the tomato']
{'action': 'eat', 'patient': 'potato', 'tense': 'past', 'agent': 'John'}
{'action': 'eat', 'patient': 'tomato', 'tense': 'past', 'agent': 'John'}
{'action': 'eat', 'patient': 'tomato', 'tense': 'past', 'agent': 'Mary'}


### Simplest Strategy: No Grouping

In [19]:
#Parse each sentence in training data
def train(sem, sentences, groupEvents):
    event_list = []
    for sentence in sentences:
        try:
           new_event_dict = s.sentenceToEventDict(sem, sentence)
           event_list = groupEvents(event_list, new_event_dict)
        except Exception as e:
            # The parser did not return any parse trees.
            raise
    return event_list

def keepSeparate(event_list, new_event_dict):
    return event_list + [new_event_dict]

event_groupings = train(sem, training_sentences, keepSeparate)
for g in event_groupings:
    print g

{'action': 'eat', 'patient': 'potato', 'tense': 'past', 'agent': 'John'}
{'action': 'eat', 'patient': 'tomato', 'tense': 'past', 'agent': 'John'}
{'action': 'eat', 'patient': 'tomato', 'tense': 'past', 'agent': 'Mary'}


### One Difference Groupings
This is a very conservative form of grouping. If two training sentences the same event structure but differ across one feature, we group together the values of that feature.

In [39]:
def groupIfOneDiff(event_list, new_event): #if different structure, do not match
    #maybe only do after reaching a certain size
    new_event_list = copy.deepcopy(event_list)
    merged = False
    #try merging in
    for i in range(len(event_list)): #try to match with event_list[i]
        event = event_list[i]
        if set(event.keys()) == set(new_event.keys()):
            unequal_count = 0
            for feat in event.keys():
                if new_event[feat] not in event[feat]:
                    unequal_feat = feat
                    unequal_count += 1
            if unequal_count == 0: merged = True
            elif unequal_count == 1: #merge into previous
                new_event_list[i][unequal_feat].add(new_event[unequal_feat])
                merged = True
    #make new spot
    if not merged:
        new_event_list.append({k:set([v]) for k,v in new_event.iteritems()})
    return new_event_list

event_groupings = train(sem, training_sentences, groupIfOneDiff)
for e in events:
    print e
for g in event_groupings:
    print g

{'action': 'eat', 'patient': 'tomato', 'tense': 'past', 'agent': 'Mary'}
{'action': 'eat', 'patient': 'potato', 'tense': 'past', 'agent': 'John'}
{'action': 'eat', 'patient': 'tomato', 'tense': 'past', 'agent': 'John'}
{'action': set(['eat']), 'tense': set(['past']), 'patient': set(['tomato', 'potato']), 'agent': set(['John', 'Mary'])}


Note that the above only produces one output grouping (as opposed to two groupings, composed of sentences (1,2) and (2,3)).
This is because we are applying the groupings iteratively. We loop through the events, and compare the current event with the groupings that we have collected up to that point. The comparison in this case is not checking for equality between values of a common feature, but rather it is checking for inclusion of the current event's feature values within groupings of that feature.
```python
for feat in event.keys():
    if new_event[feat] not in event[feat]:
        unequal_feat = feat
```
This part of the previous method demonstrates this inclusion checking.

This implies that the same training sentences, in different orders, can lead to different event groupings.

In [40]:
def rotate(lst): 
    return [lst[-1]] + lst[:-1]
print training_sentences
print training_sentences[::-1]
event_groupings_1 = train(sem, training_sentences, groupIfOneDiff)
event_groupings_2 = train(sem, rotate(training_sentences), groupIfOneDiff)

print event_groupings_1 #creates 1 group
print event_groupings_2 #creates 2 groups

['John ate the potato', 'John ate the tomato', 'Mary ate the tomato']
['Mary ate the tomato', 'John ate the tomato', 'John ate the potato']
[{'action': set(['eat']), 'tense': set(['past']), 'patient': set(['tomato', 'potato']), 'agent': set(['John', 'Mary'])}]
[{'action': set(['eat']), 'tense': set(['past']), 'patient': set(['tomato']), 'agent': set(['John', 'Mary'])}, {'action': set(['eat']), 'patient': set(['tomato', 'potato']), 'tense': set(['past']), 'agent': set(['John'])}]


This grouping pattern is related to near-miss learning:

The reason that event_groupings_2 creates 2 instead of 1 grouping is that the 1st and 2nd sentence differ from each other is 2 ways, instead of just 1.
    
But maybe this is a bit too conservative of an assumption. Alternately we could try grouping when seeing two differences.

In [41]:
def groupIfOneOrTwoDiffs(event_list, new_event): #if different structure, do not match
    #maybe only do after reaching a certain size
    new_event_list = copy.deepcopy(event_list)
    merged = False
    #try merging in
    for i in range(len(event_list)): #try to match with event_list[i]
        event = event_list[i]
        if set(event.keys()) == set(new_event.keys()):
            unequal_count = 0
            for feat in event.keys():
                if new_event[feat] not in event[feat]:
                    if unequal_count == 0:
                        unequal_feat_1 = feat
                    if unequal_count == 1:
                        unequal_feat_2 = feat
                    unequal_count += 1
            if unequal_count == 0: merged = True
            elif unequal_count == 1: #merge into previous
                new_event_list[i][unequal_feat_1].add(new_event[unequal_feat_1])
                merged = True
            elif unequal_count == 2: #merge into previous
                new_event_list[i][unequal_feat_1].add(new_event[unequal_feat_1])
                new_event_list[i][unequal_feat_2].add(new_event[unequal_feat_2])
                merged = True
    #make new spot
    if not merged:
        new_event_list.append({k:set([v]) for k,v in new_event.iteritems()})
    return new_event_list

events = map(lambda sent: s.sentenceToEventDict(sem, sent), rotate(training_sentences))
for e in events:
    print e
event_groupings = train(sem, rotate(training_sentences), groupIfOneOrTwoDiffs)
print event_groupings

{'action': 'eat', 'patient': 'tomato', 'tense': 'past', 'agent': 'Mary'}
{'action': 'eat', 'patient': 'potato', 'tense': 'past', 'agent': 'John'}
{'action': 'eat', 'patient': 'tomato', 'tense': 'past', 'agent': 'John'}
test
2
0
[{'action': set(['eat']), 'tense': set(['past']), 'patient': set(['tomato', 'potato']), 'agent': set(['John', 'Mary'])}]


In [3]:
#different ideas to try
#ideas about grouping

#1 -- try grouping by looser requirements
# do analysis, every step of the way
# with example sentences

#2 -- try learning what features are not required
# a systematic way to do looser groupings