# Final Project - Data Driven Relation Extraction Learning

In [205]:
 # this mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# enter the foldername in your Drive where you have saved the unzipped
# assignment folder
FOLDERNAME = 'cs224u/'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# this ensures that the Python interpreter of the Colab VM can load
# python files from within it.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

Mounted at /content/drive


In [206]:
import numpy as np
import os
import rel_ext
from sklearn.linear_model import LogisticRegression
import utils
import json
import random
from collections import defaultdict

In [207]:
rel_ext_data_home = os.path.join(sys.path[-1],'data/rel_ext_data')
rel_ext_data_home

'/content/drive/My Drive/cs224u/data/rel_ext_data'

In [208]:
corpus = rel_ext.Corpus(os.path.join(rel_ext_data_home, 'corpus.tsv.gz'))

In [209]:
kb = rel_ext.KB(os.path.join(rel_ext_data_home, 'kb.tsv.gz'))

In [210]:
dataset = rel_ext.Dataset(corpus, kb)

In [211]:
dataset = rel_ext.Dataset(corpus, kb)
dataset

Corpus with 331,696 examples; KB with 45,884 triples

In [212]:
splits = dataset.build_splits(
  split_names=['tiny', 'train', 'dev'],
  # tiny was used during development only
  split_fracs=[0.00, 0.80, 0.20],  
  seed=1,
)

In [213]:
# Expriments with filtering the training corpus by sentence type/topic.
# Sentence types: Declarative, Imperative, Interrogative, Exclamatory
# Sentence topics: History, Economics, Science, Politics, Arts, Sports, Business

class_to_indices = defaultdict(set)
with open(os.path.join(rel_ext_data_home, 'sentence_types_roberta.txt')) as f:
    for line in f.readlines():
        idx, e1, e2, _cls = line.split("\t")
        class_to_indices[_cls.strip()].add(int(idx))

corpus_by_class = {}
for _class, indices in class_to_indices.items():
    examples = []
    for idx in indices:
        examples.append(corpus.examples[idx])
    corpus_by_class[_class] = rel_ext.Corpus(examples)

print("Corpus classes:\n", corpus_by_class)

# Create a superset of permitted corpus examples that will
# be utilized to filter the training corpus.
selected_sentence_type = "Exclamatory"
selected_corpus = corpus_by_class[selected_sentence_type]
train_corpus_superset = set(selected_corpus.examples)
print("Selected Corpus:\n", selected_sentence_type, selected_corpus)

# filter the examples in the training corpus by the selected category
training_corpus = splits["train"].corpus
print("Corpus pre-filter: ",  splits["train"].corpus)
filtered_examples = train_corpus_superset.intersection(training_corpus.examples)
splits["train"].corpus = rel_ext.Corpus(filtered_examples)
print("Corpus post-filter: ",  splits["train"].corpus)

# log some samples for manual verification
sample_sentences = [
    " ".join([e.left, e.mention_1, e.middle, e.mention_2, e.right])
    for e in  random.sample(splits["train"].corpus.examples, 5)
]

print(
  "Sample of filtered sentences:\n", 
  sample_sentences,
)

Corpus classes:
 {'Exclamatory': Corpus with 197,594 examples, 'Declarative': Corpus with 16,682 examples, 'Imperative': Corpus with 113,941 examples, 'Interrogative': Corpus with 3,479 examples}
Selected Corpus:
 Exclamatory Corpus with 197,594 examples
Corpus pre-filter:  Corpus with 266,759 examples
Corpus post-filter:  Corpus with 159,088 examples
Sample of filtered sentences:
 ["although she determined to succeed any task she dose n't think it through before doing most things , she is the guardian of courage her symbol is a star and her main color is red and her zoot is Zora . Haemi Haemi is sweet girl who try 's to see the best in life she unlike Chaney , thinks before she does", 'day a week together to try out some new flies that we might use in the upcoming two day contest that takes place on the Snake River in Wyoming and the South Fork in Idaho . We always get hooked on a couple new fun patterns but in the end , I usually fish a', 'climbs over 3,000ft high . Most rock climber

In [214]:
# Expriments with NELL relations added to the training dataset

# with open(os.path.join(rel_ext_data_home,"nell_extracted.json"), "r") as f:
#     nell_relations = json.loads(f.read())
#     nell_relations = [
#         rel_ext.KBTriple(
#             rel=r["rel"],
#             sbj=r["sbj"],
#             obj=r["obj"],
#         )
#         for r in nell_relations
#         if r["sbj"] != r["obj"]
#     ]

# print("Training KB before: ", splits["train"].kb)
# training_kb = splits["train"].kb
# training_kb.kb_triples.extend(nell_relations)

# # TODO: add rule based cleanup to avoid invalid relations 
# # resulting from fuzzy string match 
# training_kb._collect_all_entity_pairs()
# training_kb._index_kb_triples_by_relation()
# training_kb._index_kb_triples_by_entities()
# print("Training KB after NELL relation addition: ", splits["train"].kb)

In [215]:
# Expriments with docred relations added to the training dataset

# with open(os.path.join(rel_ext_data_home,"docred_extracted.json"), "r") as f:
#     docred_relations = json.loads(f.read())

#     # entities known to contain invalid relations
#     bad_entities = set(["Lo", "Eni", "Eni!", "Lo!"])

#     docred_relations = [
#         rel_ext.KBTriple(
#             rel=r["rel"],
#             sbj=r["sbj"],
#             obj=r["obj"],
#         )
#         for r in docred_relations 
#         if r['sbj'] != r['obj'] 
#         and not bad_entities.intersection(r.values())
#     ]
  
# print("Training KB before: ", splits["train"].kb)
# training_kb = splits["train"].kb
# training_kb.kb_triples.extend(docred_relations)

# # TODO: add rule based cleanup to avoid invalid relations 
# # resulting from fuzzy string match 
# training_kb._collect_all_entity_pairs()
# training_kb._index_kb_triples_by_relation()
# training_kb._index_kb_triples_by_entities()
# print("Training KB after DocRED relation addition: ", splits["train"].kb)

In [216]:
# random.sample(docred_relations, 10)
# splits["train"].count_relation_combinations()

### Hand-build feature functions

In [217]:
def simple_bag_of_words_featurizer(kbt, corpus, feature_counter):
    for ex in corpus.get_examples_for_entities(kbt.sbj, kbt.obj):
        for word in ex.middle.split(' '):
            feature_counter[word] += 1
    for ex in corpus.get_examples_for_entities(kbt.obj, kbt.sbj):
        for word in ex.middle.split(' '):
            feature_counter[word] += 1
    return feature_counter

In [218]:
featurizers = [simple_bag_of_words_featurizer]

In [219]:
model_factory = lambda: LogisticRegression(fit_intercept=True, solver='liblinear')

In [220]:
baseline_results, data_results = rel_ext.experiment(
    splits,
    train_split='train',
    test_split='dev',
    featurizers=featurizers,
    model_factory=model_factory,
    verbose=True)

relation              precision     recall    f-score    support       size
------------------    ---------  ---------  ---------  ---------  ---------
adjoins                   0.881      0.391      0.704        340       5716
author                    0.677      0.583      0.656        509       5885
capital                   0.737      0.295      0.567         95       5471
contains                  0.627      0.847      0.662       3904       9280
film_performance          0.628      0.633      0.629        766       6142
founders                  0.582      0.363      0.520        380       5756
genre                     0.500      0.176      0.366        170       5546
has_sibling               0.791      0.220      0.521        499       5875
has_spouse                0.786      0.340      0.623        594       5970
is_a                      0.547      0.221      0.423        497       5873
nationality               0.400      0.186      0.325        301       5677
parents     

Studying model weights might yield insights:

In [221]:
rel_ext.examine_model_weights(baseline_results)

Highest and lowest feature weights for relation adjoins:

     2.284 Taluks
     2.223 Córdoba
     2.198 Valais
     ..... .....
    -1.179 America
    -1.247 his
    -1.458 who

Highest and lowest feature weights for relation author:

     3.098 book
     2.537 author
     2.269 philosopher
     ..... .....
    -2.432 and
    -2.434 during
    -2.834 or

Highest and lowest feature weights for relation capital:

     4.006 capital
     1.757 city
     1.471 km
     ..... .....
    -1.129 or
    -1.139 by
    -1.380 and

Highest and lowest feature weights for relation contains:

     2.942 bordered
     2.889 mainland
     2.602 districts
     ..... .....
    -2.405 who
    -2.463 or
    -2.714 film

Highest and lowest feature weights for relation film_performance:

     4.423 appears
     4.031 starring
     3.537 opposite
     ..... .....
    -2.173 and
    -2.207 ;
    -2.633 or

Highest and lowest feature weights for relation founders:

     4.210 founded
     4.180 founder
     2.

### Distributed representations

This simple baseline sums the GloVe vector representations for all of the words in the "middle" span and feeds those representations into the standard `LogisticRegression`-based `model_factory`. The crucial parameter that enables this is `vectorize=False`. This essentially says to `rel_ext.experiment` that your featurizer or your model will do the work of turning examples into vectors; in that case, `rel_ext.experiment` just organizes these representations by relation type.

In [222]:
GLOVE_HOME = os.path.join(sys.path[-1],'data/glove.6B')

In [223]:
glove_lookup = utils.glove2dict(
    os.path.join(GLOVE_HOME, 'glove.6B.300d.txt'))

In [224]:
def glove_middle_featurizer(kbt, corpus, np_func=np.sum):
    reps = []
    for ex in corpus.get_examples_for_entities(kbt.sbj, kbt.obj):
        for word in ex.middle.split():
            rep = glove_lookup.get(word)
            if rep is not None:
                reps.append(rep)
    # A random representation of the right dimensionality if the
    # example happens not to overlap with GloVe's vocabulary:
    if len(reps) == 0:
        dim = len(next(iter(glove_lookup.values())))
        return utils.randvec(n=dim)
    else:
        return np_func(reps, axis=0)

In [225]:
glove_results = rel_ext.experiment(
    splits,
    train_split='train',
    test_split='dev',
    featurizers=[glove_middle_featurizer],
    vectorize=False, # Crucial for this featurizer!
    verbose=True)

relation              precision     recall    f-score    support       size
------------------    ---------  ---------  ---------  ---------  ---------
adjoins                   0.823      0.465      0.713        340       5716
author                    0.813      0.436      0.693        509       5885
capital                   0.657      0.242      0.489         95       5471
contains                  0.537      0.793      0.574       3904       9280
film_performance          0.725      0.320      0.578        766       6142
founders                  0.720      0.224      0.499        380       5756
genre                     0.542      0.076      0.244        170       5546
has_sibling               0.785      0.248      0.548        499       5875
has_spouse                0.767      0.360      0.626        594       5970
is_a                      0.658      0.151      0.393        497       5873
nationality               0.592      0.203      0.428        301       5677
parents     

With the same basic code design, one can also use the PyTorch models included in the course repo, or write new ones that are better aligned with the task. For those models, it's likely that the featurizer will just return a list of tokens (or perhaps a list of lists of tokens), and the model will map those into vectors using an embedding.

### Different model factory [1 points]

The code in `rel_ext` makes it very easy to experiment with other classifier models: one need only redefine the `model_factory` argument. This question asks you to assess a [Support Vector Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html).

__To submit:__ A wrapper function `run_svm_model_factory` that does the following: 

1. Uses `rel_ext.experiment` with the model factory set to one based in an `SVC` with `kernel='linear'` and all other arguments left with default values. 
1. Trains on the 'train' part of `splits`.
1. Assesses on the `dev` part of `splits`.
1. Uses `featurizers` as defined above. 
1. Returns the return value of `rel_ext.experiment` for this set-up.

The function `test_run_svm_model_factory` will check that your function conforms to these general specifications.

In [226]:
def run_svm_model_factory():
    
    ##### YOUR CODE HERE
    model_factory_svc = lambda: SVC(kernel='linear')
    
    return rel_ext.experiment(
    splits,
    train_split='train',
    test_split='dev',
    featurizers=featurizers,
    model_factory=model_factory_svc,
    verbose=True)



In [227]:
def test_run_svm_model_factory(run_svm_model_factory):
    results = run_svm_model_factory()
    assert 'featurizers' in results, \
        "The return value of `run_svm_model_factory` seems not to be correct"
    # Check one of the models to make sure it's an SVC:
    assert 'SVC' in results['models']['adjoins'].__class__.__name__, \
        "It looks like the model factor wasn't set to use an SVC."

In [228]:
#if 'IS_GRADESCOPE_ENV' not in os.environ:
    #test_run_svm_model_factory(run_svm_model_factory)

### Directional unigram features [1.5 points]

The current bag-of-words representation makes no distinction between "forward" and "reverse" examples. But, intuitively, there is big difference between _X and his son Y_ and _Y and his son X_. This question asks you to modify `simple_bag_of_words_featurizer` to capture these differences. 

__To submit:__

1. A feature function `directional_bag_of_words_featurizer` that is just like `simple_bag_of_words_featurizer` except that it distinguishes "forward" and "reverse". To do this, you just need to mark each word feature for whether it is derived from a subject–object example or from an object–subject example.  The included function `test_directional_bag_of_words_featurizer` should help verify that you've done this correctly.

2. A call to `rel_ext.experiment` with `directional_bag_of_words_featurizer` as the only featurizer. (Aside from this, use all the default values for `rel_ext.experiment` as exemplified above in this notebook.)

3. `rel_ext.experiment` returns some of the core objects used in the experiment. How many feature names does the `vectorizer` have for the experiment run in the previous step? Include the code needed for getting this value. (Note: we're partly asking you to figure out how to get this value by using the sklearn documentation, so please don't ask how to do it!)

In [229]:
def directional_bag_of_words_featurizer(kbt, corpus, feature_counter):
    # Append these to the end of the keys you add/access in
    # `feature_counter` to distinguish the two orders. You'll
    # need to use exactly these strings in order to pass
    # `test_directional_bag_of_words_featurizer`.
    subject_object_suffix = "_SO"
    object_subject_suffix = "_OS"
    
    ##### YOUR CODE HERE
    for ex in corpus.get_examples_for_entities(kbt.sbj, kbt.obj):
        for word in ex.middle.split(' '):
            haystack = word + subject_object_suffix
            feature_counter[haystack] += 1
    for ex in corpus.get_examples_for_entities(kbt.obj, kbt.sbj):
        print(ex)
        print(ex.middle)
        for word in ex.middle.split(' '):
            print(word)
            haystack = word + object_subject_suffix
            feature_counter[haystack] += 1
    return feature_counter


# Call to `rel_ext.experiment`:
##### YOUR CODE HERE
#results = rel_ext.experiment(
#    splits,
#    train_split='train',
#    test_split='dev',
#    featurizers=[directional_bag_of_words_featurizer],
#    model_factory=model_factory,
#    verbose=True)

In [230]:
def test_directional_bag_of_words_featurizer(corpus):
    from collections import defaultdict
    kbt = rel_ext.KBTriple(rel='worked_at', sbj='Randall_Munroe', obj='xkcd')
    feature_counter = defaultdict(int)
    # Make sure `feature_counter` is being updated, not reinitialized:
    feature_counter['is_OS'] += 5
    feature_counter = directional_bag_of_words_featurizer(kbt, corpus, feature_counter)
    expected = defaultdict(
        int, {'is_OS':6,'a_OS':1,'webcomic_OS':1,'created_OS':1,'by_OS':1})
    assert feature_counter == expected, \
        "Expected:\n{}\nGot:\n{}".format(expected, feature_counter)

In [231]:
if 'IS_GRADESCOPE_ENV' not in os.environ:
    test_directional_bag_of_words_featurizer(corpus)

Example(entity_1='xkcd', entity_2='Randall_Munroe', left='xkcd - Wikiquote xkcd From Wikiquote Jump to : navigation , search', mention_1='xkcd', middle='is a webcomic created by', mention_2='Randall Munroe', right=", a former contractor for NASA . He describes it as `` a webcomic of romance , sarcasm , math , and language . '' Munroe states there is no particular", left_POS='xkcd/NN -/: Wikiquote/NNP xkcd/VBD From/IN Wikiquote/NNP Jump/NN to/TO :/: navigation/NN ,/, search/NN', mention_1_POS='xkcd/NN', middle_POS='is/VBZ a/DT webcomic/JJ created/VBN by/IN', mention_2_POS='Randall/NNP Munroe/NNP', right_POS=",/, a/DT former/JJ contractor/NN for/IN NASA/NNP ./. He/PRP describes/VBZ it/PRP as/IN ``/`` a/DT webcomic/JJ of/IN romance/NN ,/, sarcasm/NN ,/, math/NN ,/, and/CC language/NN ./. ''/'' Munroe/NNP states/VBZ there/EX is/VBZ no/DT particular/JJ")
is a webcomic created by
is
a
webcomic
created
by


### The part-of-speech tags of the "middle" words [1.5 points]

Our corpus distribution contains part-of-speech (POS) tagged versions of the core text spans. Let's begin to explore whether there is information in these sequences, focusing on `middle_POS`.

__To submit:__

1. A feature function `middle_bigram_pos_tag_featurizer` that is just like `simple_bag_of_words_featurizer` except that it creates a feature for bigram POS sequences. For example, given 

  `The/DT dog/N napped/V`
  
   we obtain the list of bigram POS sequences
  
   `b = ['<s> DT', 'DT N', 'N V', 'V </s>']`. 
   
   Of course, `middle_bigram_pos_tag_featurizer` should return count dictionaries defined in terms of such bigram POS lists, on the model of `simple_bag_of_words_featurizer`.  Don't forget the start and end tags, to model those environments properly! The included function `test_middle_bigram_pos_tag_featurizer` should help verify that you've done this correctly.

2. A call to `rel_ext.experiment` with `middle_bigram_pos_tag_featurizer` as the only featurizer. (Aside from this, use all the default values for `rel_ext.experiment` as exemplified above in this notebook.)

In [232]:
def directional_middle_bigram_pos_tag_featurizer(kbt, corpus, feature_counter):
     # Append these to the end of the keys you add/access in
    # `feature_counter` to distinguish the two orders. You'll
    # need to use exactly these strings in order to pass
    # `test_directional_bag_of_words_featurizer`.
    subject_object_suffix = "_SO"
    object_subject_suffix = "_OS"
    ##### YOUR CODE HERE
    for ex in corpus.get_examples_for_entities(kbt.sbj, kbt.obj):
        for word in get_tag_bigrams(ex.middle_POS):
            haystack = word + subject_object_suffix
            feature_counter[haystack] += 1
    for ex in corpus.get_examples_for_entities(kbt.obj, kbt.sbj):
        for word in get_tag_bigrams(ex.middle_POS):
            haystack = word + object_subject_suffix
            feature_counter[haystack] += 1
    
    return feature_counter

def middle_bigram_pos_tag_featurizer(kbt, corpus, feature_counter):

    ##### YOUR CODE HERE
    for ex in corpus.get_examples_for_entities(kbt.sbj, kbt.obj):
        for word in get_tag_bigrams(ex.middle_POS):
            feature_counter[word] += 1
    for ex in corpus.get_examples_for_entities(kbt.obj, kbt.sbj):
        for word in get_tag_bigrams(ex.middle_POS):
            feature_counter[word] += 1
    
    return feature_counter
 

def get_tag_bigrams(s):
    """Suggested helper method for `middle_bigram_pos_tag_featurizer`.
    This should be defined so that it returns a list of str, where each
    element is a POS bigram."""
    # The values of `start_symbol` and `end_symbol` are defined
    # here so that you can use `test_middle_bigram_pos_tag_featurizer`.
    start_symbol = "<s>"
    end_symbol = "</s>"

    ##### YOUR CODE HERE
    tags = get_tags(s)
    return_list = []
    for i in range(len(tags)):
        if i == 0:
            return_list.append(start_symbol + ' ' + tags[i])
        elif i == len(tags) - 1:
            prev_i = i - 1
            return_list.append(tags[prev_i] + ' ' + tags[i])
            return_list.append(tags[i] + ' ' + end_symbol)
        else:
            prev_i = i - 1
            return_list.append(tags[prev_i] + ' ' + tags[i])
    return return_list
        


def get_tags(s):
    """Given a sequence of word/POS elements (lemmas), this function
    returns a list containing just the POS elements, in order.
    """
    return [parse_lem(lem)[1] for lem in s.strip().split(' ') if lem]


def parse_lem(lem):
    """Helper method for parsing word/POS elements. It just splits
    on the rightmost / and returns (word, POS) as a tuple of str."""
    return lem.strip().rsplit('/', 1)

# Call to `rel_ext.experiment`:
##### YOUR CODE HERE
results = rel_ext.experiment(
    splits,
    train_split='train',
    test_split='dev',
    featurizers=[middle_bigram_pos_tag_featurizer],
    model_factory=model_factory,
    verbose=True)


results_2 = rel_ext.experiment(
    splits,
    train_split='train',
    test_split='dev',
    featurizers=[directional_middle_bigram_pos_tag_featurizer],
    model_factory=model_factory,
    verbose=True)

relation              precision     recall    f-score    support       size
------------------    ---------  ---------  ---------  ---------  ---------
adjoins                   0.807      0.356      0.644        340       5716
author                    0.566      0.444      0.537        509       5885
capital                   0.500      0.147      0.338         95       5471
contains                  0.611      0.847      0.647       3904       9280
film_performance          0.543      0.535      0.541        766       6142
founders                  0.309      0.187      0.273        380       5756
genre                     0.436      0.141      0.308        170       5546
has_sibling               0.624      0.146      0.377        499       5875
has_spouse                0.579      0.264      0.468        594       5970
is_a                      0.435      0.215      0.361        497       5873
nationality               0.159      0.060      0.120        301       5677
parents     

In [233]:
def test_middle_bigram_pos_tag_featurizer(corpus):
    from collections import defaultdict
    kbt = rel_ext.KBTriple(rel='worked_at', sbj='Randall_Munroe', obj='xkcd')
    feature_counter = defaultdict(int)
    # Make sure `feature_counter` is being updated, not reinitialized:
    feature_counter['<s> VBZ'] += 5
    feature_counter = middle_bigram_pos_tag_featurizer(kbt, corpus, feature_counter)
    expected = defaultdict(
        int, {'<s> VBZ':6,'VBZ DT':1,'DT JJ':1,'JJ VBN':1,'VBN IN':1,'IN </s>':1})
    assert feature_counter == expected, \
        "Expected:\n{}\nGot:\n{}".format(expected, feature_counter)

In [234]:
if 'IS_GRADESCOPE_ENV' not in os.environ:
    test_middle_bigram_pos_tag_featurizer(corpus)

### Bag of Synsets [2 points]

The following allows you to use NLTK's WordNet API to get the synsets compatible with _dog_ as used as a noun:

```
from nltk.corpus import wordnet as wn
dog = wn.synsets('dog', pos='n')
dog
[Synset('dog.n.01'),
 Synset('frump.n.01'),
 Synset('dog.n.03'),
 Synset('cad.n.01'),
 Synset('frank.n.02'),
 Synset('pawl.n.01'),
 Synset('andiron.n.01')]
```

This question asks you to create synset-based features from the word/tag pairs in `middle_POS`.

__To submit:__

1. A feature function `synset_featurizer` that is just like `simple_bag_of_words_featurizer` except that it returns a list of synsets derived from `middle_POS`. Stringify these objects with `str` so that they can be `dict` keys. Use `convert_tag` (included below) to convert tags to `pos` arguments usable by `wn.synsets`. The included function `test_synset_featurizer` should help verify that you've done this correctly.

2. A call to `rel_ext.experiment` with `synset_featurizer` as the only featurizer. (Aside from this, use all the default values for `rel_ext.experiment`.)

In [235]:
from nltk.corpus import wordnet as wn
import nltk
nltk.download('wordnet')
def synset_featurizer(kbt, corpus, feature_counter):

    ##### YOUR CODE HERE
    for ex in corpus.get_examples_for_entities(kbt.sbj, kbt.obj):
        for word in get_synsets(ex.middle_POS):
            feature_counter[word] += 1
        
    for ex in corpus.get_examples_for_entities(kbt.obj, kbt.sbj):
        for word in get_synsets(ex.middle_POS):
            feature_counter[word] += 1
    return feature_counter


def get_synsets(s):
    """Suggested helper method for `synset_featurizer`. This should
    be completed so that it returns a list of stringified Synsets
    associated with elements of `s`.
    """
    # Use `parse_lem` from the previous question to get a list of
    # (word, POS) pairs. Remember to convert the POS strings.
    wt = [parse_lem(lem) for lem in s.strip().split(' ') if lem]
    
    return_list = []
    ##### YOUR CODE HERE
    for i in range(len(wt)):
        text = wt[i][0]
        tag = convert_tag(wt[i][1])
        for x in wn.synsets(text, pos=tag):
            return_list.append(str(x))
    return return_list


def convert_tag(t):
    """Converts tags so that they can be used by WordNet:

    | Tag begins with | WordNet tag |
    |-----------------|-------------|
    | `N`             | `n`         |
    | `V`             | `v`         |
    | `J`             | `a`         |
    | `R`             | `r`         |
    | Otherwise       | `None`      |
    """
    if t[0].lower() in {'n', 'v', 'r'}:
        return t[0].lower()
    elif t[0].lower() == 'j':
        return 'a'
    else:
        return None


# Call to `rel_ext.experiment`:
##### YOUR CODE HERE
results = rel_ext.experiment(
    splits,
    train_split='train',
    test_split='dev',
    featurizers=[synset_featurizer],
    model_factory=model_factory,
    verbose=True)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




relation              precision     recall    f-score    support       size
------------------    ---------  ---------  ---------  ---------  ---------
adjoins                   0.777      0.318      0.603        340       5716
author                    0.738      0.593      0.704        509       5885
capital                   0.659      0.305      0.535         95       5471
contains                  0.536      0.862      0.580       3904       9280
film_performance          0.743      0.567      0.700        766       6142
founders                  0.707      0.387      0.606        380       5756
genre                     0.483      0.253      0.409        170       5546
has_sibling               0.642      0.204      0.449        499       5875
has_spouse                0.734      0.301      0.570        594       5970
is_a                      0.607      0.233      0.460        497       5873
nationality               0.444      0.120      0.288        301       5677
parents     

In [236]:
def test_synset_featurizer(corpus):
    from collections import defaultdict
    kbt = rel_ext.KBTriple(rel='worked_at', sbj='Randall_Munroe', obj='xkcd')
    feature_counter = defaultdict(int)
    # Make sure `feature_counter` is being updated, not reinitialized:
    feature_counter["Synset('be.v.01')"] += 5
    feature_counter = synset_featurizer(kbt, corpus, feature_counter)
    # The full return values for this tend to be long, so we just
    # test a few examples to avoid cluttering up this notebook.
    test_cases = {
        "Synset('be.v.01')": 6,
        "Synset('embody.v.02')": 1
    }
    for ss, expected in test_cases.items():
        result = feature_counter[ss]
        assert result == expected, \
            "Incorrect count for {}: Expected {}; Got {}".format(ss, expected, result)

In [237]:
if 'IS_GRADESCOPE_ENV' not in os.environ:
    test_synset_featurizer(corpus)

# Original system

In [238]:

import sklearn.linear_model
import sklearn.svm
from sklearn.neighbors import NearestCentroid
SGD_factory = lambda: sklearn.linear_model.SGDClassifier(loss='hinge')
challenger_factory = lambda: NearestCentroid()

def directional_bag_of_words_featurizer_original_system(kbt, corpus, feature_counter):
    # Append these to the end of the keys you add/access in
    # `feature_counter` to distinguish the two orders. You'll
    # need to use exactly these strings in order to pass
    # `test_directional_bag_of_words_featurizer`.
    subject_object_suffix = "_SO"
    object_subject_suffix = "_OS"
    
    for ex in corpus.get_examples_for_entities(kbt.sbj, kbt.obj):
        for word in ex.middle.split(' '):
            haystack = word + subject_object_suffix
            feature_counter[haystack] += 1
    for ex in corpus.get_examples_for_entities(kbt.obj, kbt.sbj):
        for word in ex.middle.split(' '):
            haystack = word + object_subject_suffix
            feature_counter[haystack] += 1
    return feature_counter

def directional_middle_bigram_pos_tag_featurizer(kbt, corpus, feature_counter):
     # Append these to the end of the keys you add/access in
    # `feature_counter` to distinguish the two orders. You'll
    # need to use exactly these strings in order to pass
    # `test_directional_bag_of_words_featurizer`.
    subject_object_suffix = "_SO"
    object_subject_suffix = "_OS"

    for ex in corpus.get_examples_for_entities(kbt.sbj, kbt.obj):
        for word in get_tag_bigrams(ex.middle_POS):
            haystack = word + subject_object_suffix
            feature_counter[haystack] += 1
    for ex in corpus.get_examples_for_entities(kbt.obj, kbt.sbj):
        for word in get_tag_bigrams(ex.middle_POS):
            haystack = word + object_subject_suffix
            feature_counter[haystack] += 1
    
    return feature_counter


def gridsearch(featurizers: list, model_factories: list, splits = None, 
               train_split: str = 'train', test_split: str = 'dev'
):
    '''
    Runs Grid Search 
    
    splits: param for rel_ext.experiment()
    train_split: param for rel_ext.experiment()
    test_split: param for rel_ext.experiment()
    featurizers: list of feauturizers that will be used for the grid search
    model_factories: list of model_factories that will be used for the grid search
    '''
    best_result = 0
    best_combination = {'feauturizer': None, 'factory': None}
    experiment = 0
    for feauturizer in featurizers:
        for model_factory in model_factories:
            experiment += 1;
            train_results, evaluation_result = rel_ext.experiment(
                splits,
                train_split=train_split,
                test_split=test_split,
                featurizers=[feauturizer],
                model_factory=model_factory,
                vectorize=True,
                verbose=False)
            print(
              'Experiment #', 
              experiment, '\n', 
              feauturizer.__name__, model_factory(), 
              evaluation_result
            )
            if evaluation_result > best_result:
                best_result = evaluation_result
                best_combination['feauturizer'] = feauturizer.__name__
                best_combination['feauturizer_model'] = feauturizer
                best_combination['factory'] = model_factory()
                best_combination['factory_model'] = model_factory
    
    print('----BEST SYSTEM----')
    print('Best performing system score:', best_result)
    print(best_combination)
    
    rel_ext.experiment(
      splits,
      train_split=train_split,
      test_split=test_split,
      featurizers=[best_combination['feauturizer_model']],
      model_factory=best_combination['factory_model'],
      vectorize=True,
      verbose=True,
    )

    

# Run the gridsearch and expriments on the best model
# %time \
gridsearch(
  [directional_bag_of_words_featurizer_original_system, simple_bag_of_words_featurizer],
  [lambda: sklearn.linear_model.SGDClassifier(loss='hinge'), lambda: NearestCentroid()],
  splits,
  'train',
  'dev',
)

Experiment # 1 
 directional_bag_of_words_featurizer_original_system SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False) 0.5376881576267383
Experiment # 2 
 directional_bag_of_words_featurizer_original_system NearestCentroid(metric='euclidean', shrink_threshold=None) 0.21924236308360545
Experiment # 3 
 simple_bag_of_words_featurizer SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
       