In [74]:
from __future__ import division # no need for python3, but just in case used w/ python2

import sys
import time
from svector import svector

def read_from(textfile):
    for line in open(textfile):
        label, words = line.strip().split("\t")
        yield (1 if label=="+" else -1, words.split())

def make_vector(words, bias=True):
    v = svector()
    for word in words:
        v[word] += 1
    if bias:
        v['<bias>'] = 1
    return v
    
def test(devfile, model):
    tot, err = 0, 0
    for i, (label, words) in enumerate(read_from(devfile), 1): # note 1...|D|
        err += label * (model.dot(make_vector(words, bias=True))) <= 0
    return err/i  # i is |D| now
            

In [75]:
import time

def train(trainfile, devfile, epochs=5):
    t = time.time()
    best_err = 1.0
    model = svector()
    cumulative_model = svector()
    updates = 0

    for it in range(1, epochs + 1):
        for i, (label, words) in enumerate(read_from(trainfile), 1):
            sent = make_vector(words, bias=True)
            activation = model.dot(sent)
            if label * activation <= 0:
                update = label * sent
                model += update
                updates += 1
            cumulative_model += model
        
        # Calculate the averaged model at the end of each epoch
        if updates > 0:
            averaged_model = cumulative_model * (1/updates)
        else:
            averaged_model = model.copy()

        # Test the averaged model on the development set
        dev_err = test(devfile, averaged_model)
        best_err = min(best_err, dev_err)
        print(f"epoch {it}, updates {updates / i:.1f}%, dev {dev_err * 100:.1f}%")

    # Final averaged model is calculated outside the loop after all epochs
    if updates > 0:
        averaged_model = cumulative_model * (1/updates)
    else:
        averaged_model = model.copy()

    print(f"best dev err {best_err * 100:.1f}%, |w|={len(model)}, time: {time.time() - t:.1f} secs")
    return averaged_model


In [76]:
trainfile = 'train.txt'
devfile = 'dev.txt'
model = train(trainfile, devfile, epochs=10)

epoch 1, updates 0.4%, dev 31.4%
epoch 2, updates 0.6%, dev 27.7%
epoch 3, updates 0.9%, dev 27.2%
epoch 4, updates 1.0%, dev 27.6%
epoch 5, updates 1.2%, dev 27.2%
epoch 6, updates 1.3%, dev 26.7%
epoch 7, updates 1.4%, dev 26.3%
epoch 8, updates 1.5%, dev 26.4%
epoch 9, updates 1.6%, dev 26.3%
epoch 10, updates 1.6%, dev 26.3%
best dev err 26.3%, |w|=15806, time: 79.4 secs


In [66]:
# Get the top 20 most positive and negative features
def print_top_features(model, top_n=20):
    # Sort the features by weight
    sorted_features = sorted(model.items(), key=lambda x: x[1], reverse=True)
    
    # Top positive features
    top_positive_features = sorted_features[:top_n]
    # Top negative features
    top_negative_features = sorted_features[-top_n:]
    
    print("Top positive features:")
    for feature, weight in top_positive_features:
        print(f"{feature}")
    
    print("\nTop negative features:")
    for feature, weight in top_negative_features[::-1]:
        print(f"{feature}")
print_top_features(model)


Top positive features:
engrossing
triumph
unexpected
rare
provides
french
skin
treat
pulls
culture
cinema
dots
wonderful
refreshingly
open
powerful
delightful
imax
smarter
flaws

Top negative features:
boring
generic
dull
badly
routine
fails
ill
too
instead
tv
attempts
unless
incoherent
neither
flat
seagal
problem
scattered
worst
suffers


In [67]:
def find_misclassified(devfile, model):
    misclassified_as_positive = []
    misclassified_as_negative = []

    for label, words in read_from(devfile):
        features = make_vector(words, bias=True)
        score = model.dot(features)
        prediction = 1 if score > 0 else -1

        # For negative -> positive
        if label == -1 and prediction == 1:
            misclassified_as_positive.append((score, words))

        # For positive -> negative
        elif label == 1 and prediction == -1:
            misclassified_as_negative.append((score, words))

    # Sort by confidence (absolute score)
    misclassified_as_positive.sort(key=lambda x: abs(x[0]), reverse=True)
    misclassified_as_negative.sort(key=lambda x: abs(x[0]), reverse=True)

    return misclassified_as_positive[:5], misclassified_as_negative[:5]

misclassified_positive, misclassified_negative = find_misclassified(devfile, model)

print("Negative examples that are strongly predicted as positive:")
for score, words in misclassified_positive:
    print(f"Review: {' '.join(words)}")
    print(f"\n")

print("\nPositive examples that are strongly predicted as negative:")
for score, words in misclassified_negative:
    print(f"Review: {' '.join(words)}")
    print(f"\n")


Negative examples that are strongly predicted as positive:
Review: ` in this poor remake of such a well loved classic , parker exposes the limitations of his skill and the basic flaws in his vision '


Review: how much you are moved by the emotional tumult of fran ois and mich le 's relationship depends a lot on how interesting and likable you find them


Review: bravo reveals the true intent of her film by carefully selecting interview subjects who will construct a portrait of castro so predominantly charitable it can only be seen as propaganda


Review: mr wollter and ms seldhal give strong and convincing performances , but neither reaches into the deepest recesses of the character to unearth the quaking essence of passion , grief and fear


Review: an atonal estrogen opera that demonizes feminism while gifting the most sympathetic male of the piece with a nice vomit bath at his wedding



Positive examples that are strongly predicted as negative:
Review: the thing about guys like ev

In [90]:
from collections import Counter
# Filter out the one-count words
def build_vocabulary(trainfile):
    word_counts = Counter()
    for _, words in read_from(trainfile):
        word_counts.update(words)
    return set(word for word, count in word_counts.items() if count > 1)

def make_vector(words, vocabulary, bias=True):
    v = svector()
    for word in words:
        if word in vocabulary:
            v[word] += 1
    if bias:
        v['<bias>'] = 1
    return v
    
def test(devfile, model, vocabulary):
    tot, err = 0, 0
    for i, (label, words) in enumerate(read_from(devfile), 1): # note 1...|D|
        # Only use words that are in the vocabulary
        err += label * (model.dot(make_vector(words, vocabulary, bias=True))) <= 0
    return err/i  # i is |D| now


In [91]:
import time

def train_neglecting_one_count_word(trainfile, devfile, epochs=10):
    vocabulary = build_vocabulary(trainfile)  # Assume this excludes one-count words
    t = time.time()
    best_err = 1.0
    model = svector()
    cumulative_model = svector()
    updates = 0

    for it in range(1, epochs + 1):
        for i, (label, words) in enumerate(read_from(trainfile), 1):
            # Filter out one-count words based on the vocabulary built
            filtered_words = [word for word in words if word in vocabulary]
            sent = make_vector(filtered_words, vocabulary, bias=True)
            activation = model.dot(sent)
            if label * activation <= 0:
                updates += 1
                update = label * sent
                model += update

            # Correctly add the current model to the cumulative model
            cumulative_model += model
        
        # Correctly calculate the averaged model at the end of each epoch
        if updates > 0:
            averaged_model = cumulative_model * (1 / updates)
        else:
            averaged_model = model.copy()

        # Test the averaged model on the development set
        dev_err = test(devfile, averaged_model, vocabulary)
        best_err = min(best_err, dev_err)
        print(f"epoch {it}, updates {updates / i:.1f}%, dev {dev_err * 100:.1f}%")

    if updates > 0:
        averaged_model = cumulative_model * (1 / updates)
    else:
        averaged_model = model.copy()

    print(f"best dev err {best_err * 100:.1f}%, |w|={len(averaged_model)}, time: {time.time() - t:.1f} secs")
    return averaged_model


In [92]:
trainfile = 'train.txt'
devfile = 'dev.txt'
model = train_neglecting_one_count_word(trainfile, devfile, epochs=10)

epoch 1, updates 0.4%, dev 31.6%
epoch 2, updates 0.7%, dev 27.5%
epoch 3, updates 0.9%, dev 26.8%
epoch 4, updates 1.1%, dev 26.6%
epoch 5, updates 1.2%, dev 25.9%
epoch 6, updates 1.4%, dev 26.5%
epoch 7, updates 1.5%, dev 27.0%
epoch 8, updates 1.6%, dev 26.7%
epoch 9, updates 1.8%, dev 26.6%
epoch 10, updates 1.9%, dev 26.2%
best dev err 25.9%, |w|=8425, time: 40.7 secs


In [98]:
def predict(testfile, model, vocabulary, outputfile):
    with open(outputfile, 'w') as outfile:
        for _, words in read_from(testfile):
            # Only use words that are in the vocabulary
            words_filtered = [word for word in words if word in vocabulary]
            vector = make_vector(words_filtered, vocabulary, bias=True)
            activation = model.dot(vector)
            label = "+" if activation > 0 else "-"
            line = f"{label}\t{' '.join(words)}\n"
            outfile.write(line)
testfile = 'test.txt'
outputfile = 'test.txt.predicted'
predict(testfile, model, vocabulary, outputfile)
