In [1]:
import nltk
from collections import Counter
from toolz import concat

In [2]:
class Confuser:

    def __init__(self, conf_set):
        self.conf_set = conf_set

    def eval(self, test_file):
        """Run a confusable-guesser over a test corpus and report the accuracy."""
        n = 0
        r = 0
        for sentence in open(test_file):
            sentence = self.tokenize_line(sentence)
            for i, w in enumerate(sentence):
                if w in self.conf_set:
                    guess = self.guess(sentence, i)
                    if guess == sentence[i]:
                        r += 1
                    n += 1
        return r / n * 100

In [3]:
class DefaultConfuser(Confuser):

    def train(self, train_file):
        """Find the most common member of the confusion set in the training corpus."""
        unigrams = Counter(w for line in open(train_file) for w in self.tokenize_line(line) if w in conf_set)
        self.default_guess = None
        f = -1
        for g in self.conf_set:
            if unigrams[g] > f:
                self.default_guess = g
                f = unigrams[g]

    def tokenize_line(self, line):
        """Split the line into tokens."""
        return line.lower().split()

    def guess(self, line, i):
        """Always guess the most frequent member of the confusion set."""
        return self.default_guess

In [4]:
conf_set = ['their', 'there', "they're"]

In [5]:
baseline = DefaultConfuser(conf_set)
baseline.train('/home1/data/bnc_train.txt')
baseline.eval('/home1/data/brown_there.txt')

50.158375256195264

In [6]:
class BigramConfuser(Confuser):

    def train(self, train_file):
        """Count bigram and unigram frequencies in the training corpus."""
        self.bigrams = Counter(concat(nltk.bigrams(self.tokenize_line(line)) 
                                      for line in open(train_file)))
        self.unigrams = Counter()
        for (w1, w2), f in self.bigrams.items():
            self.unigrams[w1] += f

        self.default_guess = None
        f = -1
        for g in self.conf_set:
            if self.unigrams[g] > f:
                self.default_guess = g
                f = self.unigrams[g]

    def tokenize_line(self, line):
        """Divide the input line into tokens, with start and end sentence markers."""
        return ['<s>'] + line.lower().split() + ['</s>']

    def guess(self, sentence, i):
        """Find the guess that maximizes the prob of the sentence"""
        best_p = 0
        best_guess = self.default_guess
        for guess in self.conf_set:
            p = self.prob(guess, sentence, i)
            if p > best_p:
                best_p = p
                best_guess = guess
        return best_guess
    
    def prob(self, guess, sentence, i):
        """Calc the prob of a guess in context."""
        try:
            p = self.bigrams[(sentence[i-1], guess)] / self.unigrams[sentence[i-1]]
            p = p * self.bigrams[(guess, sentence[i+1])] / self.unigrams[guess]
        except ZeroDivisionError:
            p = 0.0
        return p

In [7]:
bigram = BigramConfuser(conf_set)
bigram.train('/home1/data/bnc_train.txt')
bigram.eval('/home1/data/brown_there.txt')

88.55971678777715

In [8]:
class BigramAddOneConfuser(BigramConfuser):

    def prob(self, guess, sentence, i):
        """Calc the prob of a guess in context."""
        V = len(self.bigrams)
        try:
            p = (self.bigrams[(sentence[i-1], guess)] + 1) / (self.unigrams[sentence[i-1]] + V)
            p = p * (self.bigrams[(guess, sentence[i+1])] + 1) / (self.unigrams[guess] + V)
        except ZeroDivisionError:
            p = 0.0
        return p


In [9]:
bigram_addone = BigramAddOneConfuser(conf_set)
bigram_addone.train('/home1/data/bnc_train.txt')
bigram_addone.eval('/home1/data/brown_there.txt')

95.08105086640582

In [10]:
class TrigramConfuser(Confuser):
    def train(self, train_file):
        """Count trigram, bigram and unigram frequencies in the training corpus."""
        self.trigrams = Counter(concat(nltk.trigrams(self.tokenize_line(line)) 
                                      for line in open(train_file)))
        
        self.bigrams = Counter(concat(nltk.bigrams(self.tokenize_line(line)) 
                                      for line in open(train_file)))
        self.unigrams = Counter()
        for (w1, w2), f in self.bigrams.items():
            self.unigrams[w1] += f

        self.default_guess = None
        f = -1
        for g in self.conf_set:
            if self.unigrams[g] > f:
                self.default_guess = g
                f = self.unigrams[g]

    def tokenize_line(self, line):
        """Divide the input line into tokens, with start and end sentence markers."""
        return ['<s>'] + line.lower().split() + ['</s>']

    def guess(self, sentence, i):
        """Find the guess that maximizes the prob of the sentence"""
        best_p = 0
        best_guess = self.default_guess
        for guess in self.conf_set:
            p = self.prob(guess, sentence, i)
            if p > best_p:
                best_p = p
                best_guess = guess
        return best_guess
    
    def prob(self, guess, sentence, i):
        """Calc the prob of a guess in context."""
        try:
            p = self.trigrams[(sentence[i-2], sentence[i-1], guess)] / self.bigrams[sentence[i-2], sentence[i-1]]
            p = p * self.trigrams[(sentence[i-1], guess, sentence[i+1])] / self.bigrams[sentence[i-1], guess]
            p = p * self.trigrams[(guess, sentence[i+1], sentence[i+2])] / self.bigrams[guess, sentence[i+1]]
        except ZeroDivisionError:
            p = 0.0
        except IndexError:
            p = 0.0
        return p


In [11]:
trigram = TrigramConfuser(conf_set)
trigram.train('/home1/data/bnc_train.txt')
trigram.eval('/home1/data/brown_there.txt')

55.766722563815904

In [12]:
class TrigramAddOneConfuser(TrigramConfuser):
    def prob(self, guess, sentence, i):
        """Calc the prob of a guess in context."""
        V = len(self.trigrams)
        try:
            p = (self.trigrams[(sentence[i-2], sentence[i-1], guess)] + 1) / (self.bigrams[sentence[i-2], sentence[i-1]] + V)
            p = p * (self.trigrams[(sentence[i-1], guess, sentence[i+1])] + 1) / (self.bigrams[sentence[i-1], guess] + V)
            p = p * (self.trigrams[(guess, sentence[i+1], sentence[i+2])] + 1) / (self.bigrams[guess, sentence[i+1]] + V)
        except ZeroDivisionError:
            p = 0.0
        except IndexError:
            p = 0.0
        return p

In [13]:
trigram_addone = TrigramAddOneConfuser(conf_set)
trigram_addone.train('/home1/data/bnc_train.txt')
trigram_addone.eval('/home1/data/brown_there.txt')

82.80231041550215