In [1]:
import os
import re
import sys
import pickle
import numpy as np
import itertools
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import utils
from nltk.tree import Tree
from nli_rnn import ClassifierRNN
import tensorflow as tf

  from ._min_spanning_tree import minimum_spanning_tree
  from ._graph_tools import csgraph_to_dense, csgraph_from_dense,\
  from ._traversal import connected_components
  from . import _hashing
  from ._logistic_sigmoid import _log_logistic_sigmoid
  from .sparsefuncs_fast import csr_row_norms
  from . import vonmises_cython
  from . import vonmises_cython
  from ._rank import rankdata, tiecorrect
  from ._rank import rankdata, tiecorrect
  from . import _ppoly
  from . import _ppoly
  from .ckdtree import *
  from .ckdtree import *
  from .qhull import *
  from .qhull import *
  from ..utils import array2d, arrayfuncs, as_float_array, check_arrays
  from .expected_mutual_info_fast import expected_mutual_information
  from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan
  from . import cd_fast
  from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber
  from . import libsvm, liblinear
  from . import libsvm_sparse
  from ._random import sample_without_replacement

In [74]:
class TfNeuralNetwork:
    """Fairly exact reproduction of `ShallowNeuralNetwork` in
    TensorFlow, differing only in some details of optimization."""
    def __init__(self, hidden_dim=40, maxiter=100, eta=0.05):
        """All the parameters are set as attributes.
        
        Parameters
        ----------
        hidden_dim : int (default: 40)
            Dimensionality of the hidden layer.                   

        maxiter : int default: 100)
            Maximum number of training epochs.
            
        eta : float (default: 0.05)
            Learning rate.                 
        
        """
        self.input_dim = None
        self.hidden_dim = hidden_dim
        self.output_dim = None
        self.maxiter = maxiter
        self.eta = eta
        self.loss_hist = [] 

    def get_hist(self):
        return self.loss_hist
                
    def fit(self, training_data):
        """The training algorithm. 
        
        Parameters
        ----------
        training_data : list
            A list of (example, label) pairs, where `example`
            and `label` are both np.array instances.
        
        Attributes
        ----------
        self.sess : the TensorFlow session
        self.x : place holder for input data
        self.h : the hidden layer
        self.y : the output layer -- more like the full model here.
        self.W1 : dense weight connection from self.x to self.h
        self.b1 : bias
        self.W2 : dense weight connection from self.h to self.y
        self.b2 : bias
        self.y_ : placeholder for training data
                
        """
        self.sess = tf.InteractiveSession()
        # Dimensions determined by the data:
        self.input_dim = len(training_data[0][0])
        self.output_dim = len(training_data[0][1])
        # Network initialization. For the inputs x, None in the first
        # dimension allows us to train and evaluate on datasets
        # of different size.
        self.x = tf.placeholder(tf.float32, [None, self.input_dim])
        self.W1 = tf.Variable(tf.random_normal([self.input_dim, self.hidden_dim]))
        self.b1 = tf.Variable(tf.random_normal([self.hidden_dim]))
        self.W2 = tf.Variable(tf.random_normal([self.hidden_dim, self.output_dim]))
        self.b2 = tf.Variable(tf.random_normal([self.output_dim]))
        # Network structure. As before, we use tanh for both 
        # layers. This is not strictly necessary, and TensorFlow
        # makes it easier to try different combinations.
        self.h = tf.nn.tanh(tf.matmul(self.x, self.W1) + self.b1)    
        self.y = tf.nn.softmax(tf.matmul(self.h, self.W2) + self.b2)        
        # A place holder for the true labels. None in the first
        # dimension allows us to train and evaluate on datasets
        # of different size.
        self.y_ = tf.placeholder(tf.float32, [None, self.output_dim])
        # This defines the objective as one of reducing the 
        # one-half squared total error. This could easily 
        # be made into a user-supplied parameter to facilitate
        # exploration of other costs. See
        # https://www.tensorflow.org/versions/r0.7/api_docs/python/math_ops.html#reduction
#        cost = tf.reduce_sum(0.5 * (self.y_ - self.y)**tf.constant(2.0))

        cost = tf.reduce_mean(-tf.reduce_sum(self.y_ * tf.log(self.y), reduction_indices=[1]))
        # Simple GradientDescent (as opposed to the stochastic version
        # used by `ShallowNeuralNetwork`). For more options, see
        # https://www.tensorflow.org/versions/r0.7/api_docs/python/train.html#optimizers
        self.optimizer = tf.train.GradientDescentOptimizer(self.eta).minimize(cost)
        # TF session initialization:   
        init = tf.initialize_all_variables()
        self.sess.run(init)
        # Train (for larger datasets, the epochs should be batched):
        x, y_ = zip(*training_data)
        for iteration in range(self.maxiter):
            for bat in range(len(x)/100):
                #print bat
                x_bat = x[bat*100:bat*100 + 100]
                y_bat = y_[bat*100:bat*100 + 100]
                self.sess.run(self.optimizer, feed_dict={self.x: x_bat, self.y_: y_bat})
            if iteration % 20 == 0:
#                    print 'Iteration: ' + str(iteration)
                self.get_accuracy(training_data)
            #self.loss_hist.append(loss)

    def predict(self, ex):
        """
        Prediction for `ex`. This runs the model (forward propagation with
        self.x replaced by the single example `ex`).

        Parameters
        ----------
        ex : np.array
          Must be featurized as the training data were.

        Returns
        -------
        np.array
            The predicted outputs, dimension self.output_dim. TensorFlow
            assumes self.x is a list of examples and so returns a list of
            predictions. Since we're classifying just one, we return the
            list's only member.
            
        """
        return self.sess.run(self.y, feed_dict={self.x: [ex]})[0]
    
    def get_accuracy(self, test_data):
        x, y_ = zip(*test_data)
        correct_prediction = tf.equal(tf.argmax(self.y,1), tf.argmax(self.y_,1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        print self.sess.run(accuracy, feed_dict={self.x: x, self.y_: y_})

In [3]:
import json
devs = []
with open('snli_1.0/snli_1.0_dev.jsonl') as f:
    for line in f:
        j = json.loads(line)
        devs.append(j)
        
trains = []
with open('snli_1.0/snli_1.0_train.jsonl') as f:
    for line in f:
        j = json.loads(line)
        trains.append(j)



In [4]:
snli = {}
snli['dev'] = devs
snli['train'] = trains
print len(trains)
print len(devs)
print trains[100]['sentence1']

550152
10000
A woman is walking across the street eating a banana, while a man is following with his briefcase.


In [5]:
vocabulary = set()
for example in snli['train'] + snli['dev']:
    parse1 = example['sentence1_binary_parse'].replace('(', ' ').replace(')', ' ').split()
    for word in parse1:
        vocabulary.add(word)
    parse1 = example['sentence2_binary_parse'].replace('(', ' ').replace(')', ' ').split()
    for word in parse1:
        vocabulary.add(word)
print len(vocabulary)    

42803


In [6]:
glove_home = '../cs224uold2/glove.6B'
GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.50d.txt'))
def build_glove_embedding(vocab):
    glove_vocab = {}
    for x in vocab:
        if x in GLOVE:
            glove_vocab[x] = np.array(GLOVE[x])
        else:
            glove_vocab[x] = utils.randvec(50)
#    return np.array([GLOVE[x] if x in GLOVE else utils.randvec(50) for x in vocab ])
    return glove_vocab

glove_embedding = build_glove_embedding(vocabulary)
print len(glove_embedding)

42803


In [7]:
def get_class_ind(c):
    if c == 'neutral':
        return np.array([0.0, 1.0, 0.0])
    elif c == 'entailment':
        return np.array([0.0, 0.0, 1.0])
    elif c == 'contradiction':
        return np.array([1.0, 0.0, 0.0])
    else:
        print c
        raise Exception('WTF just happened')
        

In [12]:
dataset = {}
def build_dataset(reader):
    dataset = []
    for (s1, s2, gold_label) in reader():
        #print s1, s2, gold_label
        dataset.append((sum_sentence(s1) - sum_sentence(s2), get_class_ind(gold_label)))
    return dataset
                       
dataset['train'] = build_dataset(train_reader)
print len(dataset['train'])

550152


In [13]:
dataset['dev'] = build_dataset(dev_reader)

In [11]:
def sum_sentence(s):
    result = np.zeros(glove_embedding['the'].shape)
    for c in s.split():
        result += glove_embedding[c]
    return result

print sum_sentence('the boy is a girl')

[  0.58539      2.28632     -2.32719     -0.51508      4.42605      3.120233
  -2.7889461   -0.36035      0.70848577   0.060274     1.11412     -0.35686
  -0.52587      0.88838      2.3264605   -0.412492    -1.41214      2.28166
  -1.979853     0.67196     -1.879842     2.67322     -0.429915     2.11549
   0.71129    -10.1074      -2.90883      1.742431     0.841326    -1.734354
  16.09        -2.30344     -1.75328      0.21298      1.55362713
   0.9529049    0.76468     -1.44993      1.172401    -3.168863    -1.046373
   0.66836     -1.945562    -0.846426     1.189435    -1.37259      0.8858249
  -3.148662     1.29461     -0.073581  ]


## This is with the difference feature function

In [63]:
for eta in [1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1]:
    print 'Learning rate: ', eta
    baseline_tfnetwork = TfNeuralNetwork(maxiter=500, hidden_dim=100, eta=eta)
#print dataset['train'][0]
    baseline_tfnetwork.fit(dataset['train'][:100000])

    baseline_tfnetwork.get_accuracy(dataset['dev'][:10000])



Learning rate:  0.001
0.34209
0.38099
0.4067
0.4177
0.42543
0.43207
0.43776
0.44366
0.44854
0.45085
0.45302
0.45465
0.45623
0.45708
0.4577
0.45831
0.45853
0.45861
0.45955
0.45952
0.46014
0.46055
0.46077
0.46091
0.46141
0.4582
Learning rate:  0.003
0.35233
0.4229
0.44823
0.45689
0.45885
0.46072
0.46164
0.46239
0.46317
0.46419
0.46517
0.4657
0.46661
0.46731
0.46777
0.46848
0.46939
0.46966
0.47068
0.47136
0.47202
0.47287
0.47374
0.47395
0.47465
0.4673
Learning rate:  0.01
0.38221
0.45831
0.46316
0.46731
0.47091
0.47512
0.4773
0.47927
0.48143
0.4838
0.48565
0.48738
0.48881
0.49051
0.49266
0.49322
0.49472
0.49626
0.49788
0.49896
0.49965
0.50131
0.50272
0.5043
0.50549
0.4998
Learning rate:  0.03
0.38267
0.46596
0.47825
0.48445
0.48954
0.49415
0.49967
0.50337
0.50561
0.50876
0.51147
0.51442
0.51851
0.51906
0.52073
0.52254
0.52686
0.52851
0.52999
0.5313
0.53198
0.5332
0.53465
0.5368
0.53685
0.511
Learning rate:  0.1
0.41931
0.46984
0.49016
0.50369
0.50963
0.51413
0.51937
0.51763
0.52197
0.5206

In [64]:
baseline_tfnetwork = TfNeuralNetwork(maxiter=500, hidden_dim=100, eta=0.3)

baseline_tfnetwork.fit(dataset['train'][:500000])

baseline_tfnetwork.get_accuracy(dataset['dev'][:10000])

0.45137
0.512754
0.52163
0.527848
0.527446
0.530264
0.53279
0.531286
0.53502
0.533058
0.53417
0.533536
0.532442
0.53608
0.535532
0.537636
0.536706
0.53728
0.537458
0.532756
0.540312
0.533914
0.538384
0.538374
0.536638
0.5359


In [71]:
baseline_tfnetwork = TfNeuralNetwork(maxiter=200, hidden_dim=50, eta=0.3)

baseline_tfnetwork.fit(dataset['train'][:500000])

baseline_tfnetwork.get_accuracy(dataset['dev'][:10000])

0.452196
0.48023
0.490888
0.502426
0.507742
0.510706
0.515684
0.515982
0.515826
0.52047
0.524506
0.520624
0.524036
0.526338
0.526836
0.52515
0.528734
0.52045
0.531844
0.531632
0.530328
0.532806
0.5316
0.531394
0.532868
0.53118
0.533836
0.535378
0.535306
0.532922
0.532522
0.53598
0.534452
0.535654
0.534928
0.53407
0.532534
0.53454
0.535584
0.535726
0.533238
0.534248
0.534026
0.534706
0.537888
0.539582
0.534348
0.534058
0.537516
0.536734
0.535514
0.534788
0.535224
0.534752
0.534562
0.536704
0.536182
0.538338
0.536028
0.539184
0.538316
0.535416
0.538148
0.539262
0.537866
0.536548
0.537444
0.537764
0.53835
0.537948
0.536192
0.539238
0.538582
0.536334
0.539568
0.53638
0.539832
0.538068
0.539262
0.538904
0.537722
0.539344
0.537792
0.53723
0.539128
0.536024
0.539884
0.536572
0.536956
0.537526
0.536762
0.532646
0.538626
0.538774
0.537812
0.536294
0.536594
0.53673
0.539962
0.541066
0.5333


In [73]:
dataset = {}
def build_dataset(reader):
    dataset = []
    for (s1, s2, gold_label) in reader():
        #print s1, s2, gold_label
        dataset.append((np.concatenate((sum_sentence(s1),sum_sentence(s2))), get_class_ind(gold_label)))
    return dataset
                       
dataset['train'] = build_dataset(train_reader)
print len(dataset['train'])
print dataset['train'][0][0].shape
dataset['dev'] = build_dataset(dev_reader)

550152
(100,)


In [78]:
baseline_tfnetwork = TfNeuralNetwork(maxiter=500, hidden_dim=100, eta=0.03)

baseline_tfnetwork.fit(dataset['train'][:500000])

baseline_tfnetwork.get_accuracy(dataset['dev'][:10000])

0.39277
0.47883
0.50646
0.513506
0.522418
0.528482
0.535132
0.538664
0.542118
0.545706
0.54833
0.55167
0.553708
0.55413
0.556608
0.556438
0.557914
0.56065
0.561768
0.56359
0.564596
0.565962
0.56831
0.569742
0.570718
0.57


In [80]:
baseline_tfnetwork = TfNeuralNetwork(maxiter=1500, hidden_dim=100, eta=0.03)

baseline_tfnetwork.fit(dataset['train'][:500000])

baseline_tfnetwork.get_accuracy(dataset['dev'][:10000])

0.382264
0.449592
0.494022
0.510698
0.521378
0.531202
0.53661
0.537982
0.546754
0.550888
0.552598
0.55207
0.553916
0.558788
0.55859
0.56077
0.562576
0.564492
0.566138
0.569358
0.56877
0.571264
0.57104
0.572082
0.574118
0.57512
0.575934
0.576546
0.577794
0.577962
0.577126
0.577384
0.57799
0.578698
0.57836
0.580452
0.57923
0.57938
0.581084
0.579874
0.57982
0.581392
0.579782
0.580666
0.580502
0.58144
0.581786
0.580994
0.580454
0.577694
0.580196
0.58103
0.582168
0.58024
0.580028
0.58104
0.5806
0.581546
0.58024
0.579312
0.58073
0.580946
0.580582
0.581316
0.582918
0.581982
0.583378
0.583212
0.582436
0.582498
0.582336
0.58108
0.58409
0.58369
0.582508
0.5883


In [124]:
# import string
import nltk
# exclude = set(string.punctuation)

# an alternative method: still doesn't seem to give enough vocab. below (somewhere) i just take the vocab from the
# build_rnn_dataset output
vc = nltk.word_tokenize(' '.join([sentence['sentence1']+ ' '+sentence['sentence2'] for sentence in (snli['train'] + snli['dev'])]))

# vc = set()
# for sentence in snli['train'][:10000]:
#     things = sentence['sentence1']
# #     for thing in things:
# #         thing = ''.join(ch for ch in thing if ch not in exclude)
# #         vc.add(thing)
#     
# #     for thing in nltk.word_tokenize(things):
# #         vc.add(thing)
# # #     for thing in things:
# # #         thing = ''.join(ch for ch in thing if ch not in exclude)
# # #         vc.add(thing)
# #     for thing in nltk.word_tokenize(things):
# #         vc.add(thing)
        

snli['vocab'] = list(set(vc))


In [145]:

print (str2tree(snli['train'][0]['sentence2_binary_parse'])).subtrees()[0]

TypeError: 'generator' object has no attribute '__getitem__'

In [9]:
def snli_reader(sample):
    """Reader for SNLI data. `sample` just needs to be an iterator over
    the SNLI JSONL files. For this notebook, it will always be 
    `snli_sample`, but, for example, the following should work for the 
    corpus files:
    
    import json    
    def sample(src_filename):
        for line in open(src_filename):
            yield json.loads(line)
    
    Yields
    ------
    tuple
        (tree1, tree2, label), where the trees are from `str2tree` and
        label is in `LABELS` above.
      
    """
    for d in sample:
        if d['gold_label'] == '-':
            d['gold_label'] = d['annotator_labels'][0]
        yield (d['sentence1_binary_parse'].replace('(', ' ').replace(')', ' '), 
               d['sentence2_binary_parse'].replace('(', ' ').replace(')', ' '),
               d['gold_label'])
        
def train_reader():
    """Convenience function for reading just the training data."""
    return snli_reader(snli['train'])

def dev_reader():
    """Convenience function for reading just the dev data."""
    return snli_reader(snli['dev'])


In [126]:
def str2tree(s):
    """Map str `s` to an `nltk.tree.Tree` instance. The assumption is that 
    `s` represents a standard Penn-style tree."""
    return Tree.fromstring(s)

In [127]:
LABELS = ['contradiction', 'entailment', 'neutral']

In [128]:
def build_rnn_dataset(reader):
    """Build RNN datasets.
    
    Parameters
    ----------
    reader
        SNLI iterator like `snli_reader` above. Just needs to
        yield (tree, tree, label) triples.
        
    Returns
    -------
    list of tuples
        The first member of each tuple is a list of strings (the
        concatenated leaves) and the second is an np.array 
        (dimension 3) with a single 1 for the true class and 0s
        in the other two positions
       
    """    
    dataset = []
    for (t1, t2, label) in reader():
        seq = t1.leaves() + t2.leaves()
        y_ = np.zeros(3)
        if label == '-':
            label = 'neutral'
        y_[LABELS.index(label)] = 1.0
        dataset.append((seq, y_))
    return dataset

In [130]:
def rnn_model_evaluation(mod, assess, labels=LABELS):
    """Asssess a trained `ClassifierRNN`.
    
    Parameters
    ----------
    mod : `ClassifierRNN`
        Should be a model trained on data in the same format as
        `assess`.
    
    assess : list
        A list of (seq, label) pairs, where seq is a sequence of
        words and label is a one-hot vector giving the label.        
    
    """    
    # Assessment:
    gold = []
    predictions = []    
    for seq, y_ in assess:
        # The gold labels are vectors. Get the index of the single 1
        # and look up its string in `LABELS`:
        gold.append(labels[np.argmax(y_)])
        # `predict` returns the index of the highest score.
        p = mod.predict(seq) 
        predictions.append(labels[p])
    # Report:
    return classification_report(gold, predictions)

In [264]:
def rnn_experiment(
        train_inp,
        dev_inp,
        vocab, 
        embedding, 
        hidden_dim, 
        eta, 
        maxiter):
    """Classifier RNN experiments.
    
    Parameters
    ----------
    vocab : list of str
        Must contain every word we'll encounter in training or assessment.
        
    embedding : np.array
        Embedding matrix for `vocab`. The ith row gives the input 
        representation for the ith member of vocab. Thus, `embedding`
        must have the same row count as the length of vocab. Its
        columns can be any length. (That is, the input word 
        representations can be any length.)
        
    hidden_dim : int (default: 10)
        Dimensionality of the hidden representations. This is a
        parameter to `ClassifierRNN`.
        
    eta : float (default: 0.05)
        The learning rate. This is a parameter to `ClassifierRNN`.       
        
    maxiter : int (default: 10)
        Maximum number of training epochs. This is a parameter 
        to `ClassifierRNN`.       
        
    Returns
    -------
    str
        A formatted `sklearn` `classification_report`.
    
    """
    # Training:
    train = build_rnn_dataset(train_inp)       
    mod = ClassifierRNN(
        vocab, 
        embedding, 
        hidden_dim=hidden_dim, 
        eta=eta,
        maxiter=maxiter)
    mod.fit(train)    
    # Assessment:
    assess = build_rnn_dataset(dev_inp) 
    return rnn_model_evaluation(mod, assess)

In [171]:
vc = set()

for x in build_rnn_dataset(train_reader):
    vc = vc.union(set(x[0]))
        
for x in build_rnn_dataset(dev_reader):
    vc = vc.union(set(x[0]))
        
snli['vocab'] = list(vc)
# vc = set()
# print [x[0] for x in build_rnn_dataset(train_reader)[:5]]
# a = [[u'A', u'person', u'on', u'a', u'horse', u'jumps', u'over', u'a', u'broken', u'down', u'airplane', u'.', u'A', u'person', u'is', u'training', u'his', u'horse', u'for', u'a', u'competition', u'.'], [u'A', u'person', u'on', u'a', u'horse', u'jumps', u'over', u'a', u'broken', u'down', u'airplane', u'.', u'A', u'person', u'is', u'at', u'a', u'diner', u',', u'ordering', u'an', u'omelette', u'.'], [u'A', u'person', u'on', u'a', u'horse', u'jumps', u'over', u'a', u'broken', u'down', u'airplane', u'.', u'A', u'person', u'is', u'outdoors', u',', u'on', u'a', u'horse', u'.'], [u'Children', u'smiling', u'and', u'waving', u'at', u'camera', u'They', u'are', u'smiling', u'at', u'their', u'parents'], [u'Children', u'smiling', u'and', u'waving', u'at', u'camera', u'There', u'are', u'children', u'present']]
# for x in a:
#     print x
#     vc = vc.union(x)
# print vc


In [172]:
vocab = snli['vocab']
# for seq,labels in training_data:
#     print 1
#     break
# vocab = snli_sample['vocab']
# Random embeddings of dimension 10:
randvec_embedding = np.array([utils.randvec(10) for w in vocab])

# A small network, trained for just a few epochs to see how things look:


Finished epoch 1 of 1; error is 1.2139977093

             precision    recall  f1-score   support

contradiction       0.33      0.50      0.40      3278
 entailment       0.33      0.50      0.40      3329
    neutral       0.00      0.00      0.00      3393

avg / total       0.22      0.33      0.26     10000






In [257]:
def train_reader_sample():
    return snli_reader(snli_sample['train'])
def dev_reader_sample(): 
    return snli_reader(snli_sample['dev'])


3000


In [260]:
glove_home = '/Users/reuben/Documents/glove.6B'
GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.50d.txt'))


def build_glove_embedding(vocab):
    return np.array([GLOVE[x] if x in GLOVE else utils.randvec(50) for x in vocab ])

print 1
glove_embedding = build_glove_embedding(snli_sample['vocab'])

1
1


In [268]:
print rnn_experiment(train_reader,
                     dev_reader,
                     snli['vocab'], 
                     build_glove_embedding(snli['vocab']), 
                     hidden_dim=10, 
                     eta=0.01, 
                     maxiter=10)

Finished epoch 10 of 10; error is 1.11585145787

             precision    recall  f1-score   support

contradiction       0.50      0.01      0.01      3278
 entailment       0.33      0.98      0.50      3329
    neutral       0.46      0.03      0.06      3393

avg / total       0.43      0.34      0.19     10000






In [233]:
def word_cross_product_phi(t1, t2):
    """Basis for cross-product features. This tends to produce pretty 
    dense representations.
    
    Parameters
    ----------
    t1, t2 : `nltk.tree.Tree`
        As given by `str2tree`.
        
    Returns
    -------
    defaultdict
        Maps each (w1, w2) in the cross-product of `t1.leaves()` and 
        `t2.leaves()` to its count. This is a multi-set cross-product
        (repetitions matter).
    
    """
    return Counter([(w1, w2) for w1, w2 in itertools.product(t1.leaves(), t2.leaves())])

In [234]:
def build_linear_classifier_dataset(
        reader,
        phi, 
        vectorizer=None):
    """Create a dataset for training classifiers using `sklearn`.
    
    Parameters
    ----------
    reader
        An SNLI iterator like `snli_reader` above. Just needs to
        yield (tree, tree, label) triples.
        
    phi : feature function
        Maps trees to count dictionaries.
        
    vectorizer : `sklearn.feature_extraction.DictVectorizer`   
        If this is None, then a new `DictVectorizer` is created and
        used to turn the list of dicts created by `phi` into a 
        feature matrix. This happens when we are training.
              
        If this is not None, then it's assumed to be a `DictVectorizer` 
        and used to transform the list of dicts. This happens in 
        assessment, when we take in new instances and need to 
        featurize them as we did in training.
        
    Returns
    -------
    dict
        A dict with keys 'X' (the feature matrix), 'y' (the list of
        labels), 'vectorizer' (the `DictVectorizer`), and 
        'raw_examples' (the original tree pairs, for error analysis).
    
    """
    feat_dicts = []
    labels = []
    raw_examples = []
    for t1, t2, label in reader():
        d = phi(t1, t2)
        feat_dicts.append(d)
        labels.append(label)   
        raw_examples.append((t1, t2))
    if vectorizer == None:
        vectorizer = DictVectorizer(sparse=True)
        feat_matrix = vectorizer.fit_transform(feat_dicts)
    else:
        feat_matrix = vectorizer.transform(feat_dicts)
    return {'X': feat_matrix, 
            'y': labels, 
            'vectorizer': vectorizer, 
            'raw_examples': raw_examples}

In [235]:
def fit_maxent_classifier(X, y):    
    """Wrapper for `sklearn.linear.model.LogisticRegression`. This is also 
    called a Maximum Entropy (MaxEnt) Classifier, which is more fitting 
    for the multiclass case.
    
    Parameters
    ----------
    X : 2d np.array
        The matrix of features, one example per row.
        
    y : list
        The list of labels for rows in `X`.
    
    Returns
    -------
    `sklearn.linear.model.LogisticRegression`
        A trained `LogisticRegression` instance.
    
    """
    mod = LogisticRegression(fit_intercept=True)
    mod.fit(X, y)
    return mod

In [247]:
def linear_classifier_experiment(
        train_reader,
        assess_reader,
        phi,
        train_func=fit_maxent_classifier):  
    """Runs experiments on our SNLI fragment.
    
    Parameters
    ----------
    train_reader, assess_reader
        SNLI iterators like `snli_reader` above. Just needs to
        yield (tree, tree, label) triples.
        
    phi : feature function (default: `word_overlap_phi`)
        Maps trees to count dictionaries.
        
    train_func : model wrapper (default: `fit_maxent_classifier`)
        Any function that takes a feature matrix and a label list
        as its values and returns a fitted model with a `predict`
        function that operates on feature matrices.
    
    Returns
    -------
    str
        A formatted `classification_report` from `sklearn`.
        
    """
    train = build_linear_classifier_dataset(train_reader, phi)
    assess = build_linear_classifier_dataset(assess_reader, phi, vectorizer=train['vectorizer'])
    print 1

    mod = fit_maxent_classifier(train['X'], train['y'])
    predictions = mod.predict(assess['X'])
    return classification_report(assess['y'], predictions)

In [250]:
print(linear_classifier_experiment(train_reader,dev_reader,word_cross_product_phi))

1


KeyboardInterrupt: 