# 4. Machine learning on words

In this notebook we'll learn both the vocabulary and the weights from the training data.  Each feature is a single word (a "unigram"), and using unigram features like these is very common.

In [None]:
import pickle
import csv
import numpy as np
from collections import Counter

In [None]:
# Load the six pre-computed splits (train/dev/test inputs and labels).
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only point this at a file you trust.
with open("data/sentiment_splits.p", "rb") as splits_file:
    (X_train, X_dev, X_test,
     y_train, y_dev, y_test) = pickle.load(splits_file)

In [None]:
# We're going to precalculate words of interest because this is
# a tiny dataset... but as the dataset grows, extra passes through it are
# a bad idea.
#
# words_of_interest maps each word to its document frequency: the number of
# training documents it appears in. Each document is reduced to a set of its
# whitespace-separated tokens first, so a word repeated inside one document
# is only counted once for that document.
words_of_interest = Counter()
for item in X_train:
    words_of_interest.update(set(item.split()))

# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print("Number of unique words:")
print(len(words_of_interest))
print("Number of words that occur in more than 1 document:")
# Count lazily instead of building a throwaway list just to take its length.
print(sum(1 for amt in words_of_interest.values() if amt > 1))


In [None]:
# Build the word -> column-index lookup for the one-hot feature space.
# Column 0 (UNK_IDX) is shared by every word whose count in
# words_of_interest is exactly 1; every other word gets its own column,
# handed out in most_common() order starting from 1.
# When the loop finishes, known_words_in_vocab is the total number of
# columns (the UNK column plus one per individually indexed word).
embeddings = {}
UNK_IDX = 0
known_words_in_vocab = 1  # next free column; 0 is reserved for UNK
for word, count in words_of_interest.most_common():
    if count == 1:
        embeddings[word] = UNK_IDX
        continue
    embeddings[word] = known_words_in_vocab
    known_words_in_vocab += 1

In [None]:
# Create a function that will convert each paragraph to a vector.
def convert_to_vector(paragraph):
    """Encode a paragraph as a binary bag-of-words vector.

    The paragraph is split on whitespace. Each token switches on the column
    that ``embeddings`` assigns to it; a token that is missing from
    ``embeddings`` switches on the ``UNK_IDX`` column instead. Entries are
    set to 1 rather than incremented, so repeated tokens do not accumulate.

    Returns a 1-D numpy array of length ``known_words_in_vocab``.
    """
    vector = np.zeros(known_words_in_vocab)
    for token in paragraph.split():
        # dict.get folds the "word not in the vocabulary" case into the lookup.
        vector[embeddings.get(token, UNK_IDX)] = 1
    return vector

In [None]:
# Test the embedding: encode one hand-written sentence and let the notebook
# display the resulting vector. The sentence is written pre-tokenized (note
# the space before the final period) because convert_to_vector only splits
# on whitespace.
convert_to_vector("The comedy by Voltaire was hysterical .")

In [None]:
def convert_dataset(dataset):
    """Vectorize every paragraph in ``dataset`` with ``convert_to_vector``.

    Returns a 2-D numpy array with one row per paragraph and
    ``known_words_in_vocab`` columns; row i holds the encoding of the i-th
    paragraph.
    """
    n_rows = len(dataset)
    matrix = np.zeros((n_rows, known_words_in_vocab))
    # Iterating a 2-D array yields row views, so writing through `row`
    # fills `matrix` in place.
    for row, paragraph in zip(matrix, dataset):
        row[:] = convert_to_vector(paragraph)
    return matrix

# Vectorize the training and dev splits. Each printed shape is a quick
# sanity check on the resulting matrix.
# print(...) with a single argument behaves the same on Python 2 and 3.
X_train_vector = convert_dataset(X_train)
print(X_train_vector.shape)
X_dev_vector = convert_dataset(X_dev)
print(X_dev_vector.shape)

In [None]:
from sklearn import linear_model

# Fit a logistic-regression classifier on the vectorized training split.
# No arguments are passed to the constructor, so the model uses the library's
# default hyperparameters (which may differ between scikit-learn versions).
clf = linear_model.LogisticRegression()
clf.fit(X_train_vector, y_train)
# Predictions for the dev split; compared against y_dev in the evaluation cell.
y_dev_hat = clf.predict(X_dev_vector)

## Evaluation

In [None]:
# Let's evaluate the dev-set predictions (y_dev_hat) against the true
# labels (y_dev).
# No cross-validation this round, but we can use that in the
# future to get a sense of the variability of the method.
#
# Every print uses the single-argument call form, which behaves identically
# on Python 2 and Python 3. A bare `print` only emits a blank line on
# Python 2 (on Python 3 it is a silent no-op), so print("") is used for the
# blank separator lines instead.
from sklearn import metrics

print("Accuracy:")
print(metrics.accuracy_score(y_dev, y_dev_hat))

print("")

print("Classification metrics:")
print(metrics.classification_report(y_dev, y_dev_hat))

print("")

print("Confusion matrix:")
print("(Rows are truth, columns are predictions)")
print(metrics.confusion_matrix(y_dev, y_dev_hat))

Wow, look how much better we're doing using only unigram features!  We get a gain of more than 20% just by using whatever words happen to be in the training set, and we're doing almost no work to get this performance.

Clearly our brains are not so good at coming up with rules compared to leveraging data...