In [1]:
import os
import re
import sys
import pickle
import numpy as np
import itertools
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import utils
from nltk.tree import Tree
from nli_rnn import ClassifierRNN

In [2]:
snli_sample_src = os.path.join('nli-data', 'snli_1.0_cs224u_sample.pickle')

# Load the dataset: a dict with keys `train`, `dev`, and `vocab`. The first
# two are lists of `dict`s sampled from the SNLI JSONL files. The third is
# the complete vocabulary of the leaves in the trees for `train` and `dev`.
snli_sample = pickle.load(open(snli_sample_src, 'rb'))

In [70]:
import json
devs = []
with open('snli_1.0/snli_1.0_dev.jsonl') as f:
    for line in f:
        j = json.loads(line)
        devs.append(j)
        
trains = []
with open('snli_1.0/snli_1.0_train.jsonl') as f:
    for line in f:
        j = json.loads(line)
        trains.append(j)



In [123]:
snli = {}
snli['dev'] = devs
snli['train'] = trains

In [124]:
# import string
import nltk
# exclude = set(string.punctuation)

# an alternative method: still doesn't seem to give enough vocab. below (somewhere) i just take the vocab from the
# build_rnn_dataset output
vc = nltk.word_tokenize(' '.join([sentence['sentence1']+ ' '+sentence['sentence2'] for sentence in (snli['train'] + snli['dev'])]))

# vc = set()
# for sentence in snli['train'][:10000]:
#     things = sentence['sentence1']
# #     for thing in things:
# #         thing = ''.join(ch for ch in thing if ch not in exclude)
# #         vc.add(thing)
#     
# #     for thing in nltk.word_tokenize(things):
# #         vc.add(thing)
# # #     for thing in things:
# # #         thing = ''.join(ch for ch in thing if ch not in exclude)
# # #         vc.add(thing)
# #     for thing in nltk.word_tokenize(things):
# #         vc.add(thing)
        

snli['vocab'] = list(set(vc))


In [145]:

print (str2tree(snli['train'][0]['sentence2_binary_parse'])).subtrees()[0]

TypeError: 'generator' object has no attribute '__getitem__'

In [249]:
def snli_reader(sample):
    """Reader for SNLI data. `sample` just needs to be an iterator over
    the SNLI JSONL files. For this notebook, it will always be 
    `snli_sample`, but, for example, the following should work for the 
    corpus files:
    
    import json    
    def sample(src_filename):
        for line in open(src_filename):
            yield json.loads(line)
    
    Yields
    ------
    tuple
        (tree1, tree2, label), where the trees are from `str2tree` and
        label is in `LABELS` above.
      
    """
    for d in sample:
        yield (str2tree(d['sentence1_parse']), 
               str2tree(d['sentence2_parse']),
               d['gold_label'])
        
def train_reader():
    """Convenience function for reading just the training data."""
    return snli_reader(snli['train'])

def dev_reader():
    """Convenience function for reading just the dev data."""
    return snli_reader(snli['dev'])


In [126]:
def str2tree(s):
    """Map str `s` to an `nltk.tree.Tree` instance. The assumption is that 
    `s` represents a standard Penn-style tree."""
    return Tree.fromstring(s)

In [127]:
LABELS = ['contradiction', 'entailment', 'neutral']

In [128]:
def build_rnn_dataset(reader):
    """Build RNN datasets.
    
    Parameters
    ----------
    reader
        SNLI iterator like `snli_reader` above. Just needs to
        yield (tree, tree, label) triples.
        
    Returns
    -------
    list of tuples
        The first member of each tuple is a list of strings (the
        concatenated leaves) and the second is an np.array 
        (dimension 3) with a single 1 for the true class and 0s
        in the other two positions
       
    """    
    dataset = []
    for (t1, t2, label) in reader():
        seq = t1.leaves() + t2.leaves()
        y_ = np.zeros(3)
        if label == '-':
            label = 'neutral'
        y_[LABELS.index(label)] = 1.0
        dataset.append((seq, y_))
    return dataset

In [130]:
def rnn_model_evaluation(mod, assess, labels=LABELS):
    """Asssess a trained `ClassifierRNN`.
    
    Parameters
    ----------
    mod : `ClassifierRNN`
        Should be a model trained on data in the same format as
        `assess`.
    
    assess : list
        A list of (seq, label) pairs, where seq is a sequence of
        words and label is a one-hot vector giving the label.        
    
    """    
    # Assessment:
    gold = []
    predictions = []    
    for seq, y_ in assess:
        # The gold labels are vectors. Get the index of the single 1
        # and look up its string in `LABELS`:
        gold.append(labels[np.argmax(y_)])
        # `predict` returns the index of the highest score.
        p = mod.predict(seq) 
        predictions.append(labels[p])
    # Report:
    return classification_report(gold, predictions)

In [264]:
def rnn_experiment(
        train_inp,
        dev_inp,
        vocab, 
        embedding, 
        hidden_dim, 
        eta, 
        maxiter):
    """Classifier RNN experiments.
    
    Parameters
    ----------
    vocab : list of str
        Must contain every word we'll encounter in training or assessment.
        
    embedding : np.array
        Embedding matrix for `vocab`. The ith row gives the input 
        representation for the ith member of vocab. Thus, `embedding`
        must have the same row count as the length of vocab. Its
        columns can be any length. (That is, the input word 
        representations can be any length.)
        
    hidden_dim : int (default: 10)
        Dimensionality of the hidden representations. This is a
        parameter to `ClassifierRNN`.
        
    eta : float (default: 0.05)
        The learning rate. This is a parameter to `ClassifierRNN`.       
        
    maxiter : int (default: 10)
        Maximum number of training epochs. This is a parameter 
        to `ClassifierRNN`.       
        
    Returns
    -------
    str
        A formatted `sklearn` `classification_report`.
    
    """
    # Training:
    train = build_rnn_dataset(train_inp)       
    mod = ClassifierRNN(
        vocab, 
        embedding, 
        hidden_dim=hidden_dim, 
        eta=eta,
        maxiter=maxiter)
    mod.fit(train)    
    # Assessment:
    assess = build_rnn_dataset(dev_inp) 
    return rnn_model_evaluation(mod, assess)

In [171]:
vc = set()

for x in build_rnn_dataset(train_reader):
    vc = vc.union(set(x[0]))
        
for x in build_rnn_dataset(dev_reader):
    vc = vc.union(set(x[0]))
        
snli['vocab'] = list(vc)
# vc = set()
# print [x[0] for x in build_rnn_dataset(train_reader)[:5]]
# a = [[u'A', u'person', u'on', u'a', u'horse', u'jumps', u'over', u'a', u'broken', u'down', u'airplane', u'.', u'A', u'person', u'is', u'training', u'his', u'horse', u'for', u'a', u'competition', u'.'], [u'A', u'person', u'on', u'a', u'horse', u'jumps', u'over', u'a', u'broken', u'down', u'airplane', u'.', u'A', u'person', u'is', u'at', u'a', u'diner', u',', u'ordering', u'an', u'omelette', u'.'], [u'A', u'person', u'on', u'a', u'horse', u'jumps', u'over', u'a', u'broken', u'down', u'airplane', u'.', u'A', u'person', u'is', u'outdoors', u',', u'on', u'a', u'horse', u'.'], [u'Children', u'smiling', u'and', u'waving', u'at', u'camera', u'They', u'are', u'smiling', u'at', u'their', u'parents'], [u'Children', u'smiling', u'and', u'waving', u'at', u'camera', u'There', u'are', u'children', u'present']]
# for x in a:
#     print x
#     vc = vc.union(x)
# print vc


In [172]:
vocab = snli['vocab']
# for seq,labels in training_data:
#     print 1
#     break
# vocab = snli_sample['vocab']
# Random embeddings of dimension 10:
randvec_embedding = np.array([utils.randvec(10) for w in vocab])

# A small network, trained for just a few epochs to see how things look:


Finished epoch 1 of 1; error is 1.2139977093

             precision    recall  f1-score   support

contradiction       0.33      0.50      0.40      3278
 entailment       0.33      0.50      0.40      3329
    neutral       0.00      0.00      0.00      3393

avg / total       0.22      0.33      0.26     10000






In [257]:
def train_reader_sample():
    return snli_reader(snli_sample['train'])
def dev_reader_sample(): 
    return snli_reader(snli_sample['dev'])


3000


In [260]:
glove_home = '/Users/reuben/Documents/glove.6B'
GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.50d.txt'))


def build_glove_embedding(vocab):
    return np.array([GLOVE[x] if x in GLOVE else utils.randvec(50) for x in vocab ])

print 1
glove_embedding = build_glove_embedding(snli_sample['vocab'])

1
1


In [268]:
print rnn_experiment(train_reader,
                     dev_reader,
                     snli['vocab'], 
                     build_glove_embedding(snli['vocab']), 
                     hidden_dim=10, 
                     eta=0.01, 
                     maxiter=10)

Finished epoch 10 of 10; error is 1.11585145787

             precision    recall  f1-score   support

contradiction       0.50      0.01      0.01      3278
 entailment       0.33      0.98      0.50      3329
    neutral       0.46      0.03      0.06      3393

avg / total       0.43      0.34      0.19     10000






In [233]:
def word_cross_product_phi(t1, t2):
    """Basis for cross-product features. This tends to produce pretty 
    dense representations.
    
    Parameters
    ----------
    t1, t2 : `nltk.tree.Tree`
        As given by `str2tree`.
        
    Returns
    -------
    defaultdict
        Maps each (w1, w2) in the cross-product of `t1.leaves()` and 
        `t2.leaves()` to its count. This is a multi-set cross-product
        (repetitions matter).
    
    """
    return Counter([(w1, w2) for w1, w2 in itertools.product(t1.leaves(), t2.leaves())])

In [234]:
def build_linear_classifier_dataset(
        reader,
        phi, 
        vectorizer=None):
    """Create a dataset for training classifiers using `sklearn`.
    
    Parameters
    ----------
    reader
        An SNLI iterator like `snli_reader` above. Just needs to
        yield (tree, tree, label) triples.
        
    phi : feature function
        Maps trees to count dictionaries.
        
    vectorizer : `sklearn.feature_extraction.DictVectorizer`   
        If this is None, then a new `DictVectorizer` is created and
        used to turn the list of dicts created by `phi` into a 
        feature matrix. This happens when we are training.
              
        If this is not None, then it's assumed to be a `DictVectorizer` 
        and used to transform the list of dicts. This happens in 
        assessment, when we take in new instances and need to 
        featurize them as we did in training.
        
    Returns
    -------
    dict
        A dict with keys 'X' (the feature matrix), 'y' (the list of
        labels), 'vectorizer' (the `DictVectorizer`), and 
        'raw_examples' (the original tree pairs, for error analysis).
    
    """
    feat_dicts = []
    labels = []
    raw_examples = []
    for t1, t2, label in reader():
        d = phi(t1, t2)
        feat_dicts.append(d)
        labels.append(label)   
        raw_examples.append((t1, t2))
    if vectorizer == None:
        vectorizer = DictVectorizer(sparse=True)
        feat_matrix = vectorizer.fit_transform(feat_dicts)
    else:
        feat_matrix = vectorizer.transform(feat_dicts)
    return {'X': feat_matrix, 
            'y': labels, 
            'vectorizer': vectorizer, 
            'raw_examples': raw_examples}

In [235]:
def fit_maxent_classifier(X, y):    
    """Wrapper for `sklearn.linear.model.LogisticRegression`. This is also 
    called a Maximum Entropy (MaxEnt) Classifier, which is more fitting 
    for the multiclass case.
    
    Parameters
    ----------
    X : 2d np.array
        The matrix of features, one example per row.
        
    y : list
        The list of labels for rows in `X`.
    
    Returns
    -------
    `sklearn.linear.model.LogisticRegression`
        A trained `LogisticRegression` instance.
    
    """
    mod = LogisticRegression(fit_intercept=True)
    mod.fit(X, y)
    return mod

In [247]:
def linear_classifier_experiment(
        train_reader,
        assess_reader,
        phi,
        train_func=fit_maxent_classifier):  
    """Runs experiments on our SNLI fragment.
    
    Parameters
    ----------
    train_reader, assess_reader
        SNLI iterators like `snli_reader` above. Just needs to
        yield (tree, tree, label) triples.
        
    phi : feature function (default: `word_overlap_phi`)
        Maps trees to count dictionaries.
        
    train_func : model wrapper (default: `fit_maxent_classifier`)
        Any function that takes a feature matrix and a label list
        as its values and returns a fitted model with a `predict`
        function that operates on feature matrices.
    
    Returns
    -------
    str
        A formatted `classification_report` from `sklearn`.
        
    """
    train = build_linear_classifier_dataset(train_reader, phi)
    assess = build_linear_classifier_dataset(assess_reader, phi, vectorizer=train['vectorizer'])
    print 1

    mod = fit_maxent_classifier(train['X'], train['y'])
    predictions = mod.predict(assess['X'])
    return classification_report(assess['y'], predictions)

In [250]:
print(linear_classifier_experiment(train_reader,dev_reader,word_cross_product_phi))

1


KeyboardInterrupt: 