In [2]:
import requests
import collections
import math
import re

In [1]:
# Computes the global vocabulary.
urls = [
    "https://storage.googleapis.com/research-share/texts/t10.txt",
    "https://storage.googleapis.com/research-share/texts/t1.txt",
    "https://storage.googleapis.com/research-share/texts/t2.txt",
    "https://storage.googleapis.com/research-share/texts/t3.txt",
    "https://storage.googleapis.com/research-share/texts/t4.txt",
    "https://storage.googleapis.com/research-share/texts/t5.txt",
    "https://storage.googleapis.com/research-share/texts/t6.txt",
    "https://storage.googleapis.com/research-share/texts/t7.txt",
    "https://storage.googleapis.com/research-share/texts/t8.txt",
    "https://storage.googleapis.com/research-share/texts/t9.txt",
]

# We compute the book tokens once and forall, as this is an expensive operation.
book_tokens = []
all_tokens = []
for u in urls:
    text = requests.get(u).text
    tokens = re.findall(r'[a-zA-Z]+', text)
    book_tokens.append(tokens)
    all_tokens.extend(tokens)

NameError: name 'requests' is not defined

We remind you of the main formulas.

For a word $w$, let:
* $P(w)$ be the probability of word $w$ in the general vocabulary.
* $K_w$ be the number of times the author uses $w$ in the author's text.
* $K$ be the number of words in the author's text.
* $\alpha$ be the Laplace smooting coefficient.

Then, the probability that the author generates word $w$ is given by:

$$\theta_w = \frac{K_w + \alpha P(w)}{K + \alpha}$$

Let $M_w$ the number of times word $w$ appears in an unknown body of text, and let $M = \sum_w M_w$.
The log-likelyhood of the author generating the unknown body of text is then:

$$\sum_w M_w \log \theta_w$$

With these formulas, you should be able to complete the class below.

In [None]:
class Author(object):

    def __init__(self, name, tokens, alpha=100, vocabulary_probabilities=None):
        """Initializes the author with the given name and text.
        Also tokenizes the text with spaCy, and stores the list of tokens.
        @param name: name of the author.
        @param text: tokenized text constituting writing sample for the author.
        @param alpha: value to be used in the Laplace smoothing function.
        @param vocabulary_probabilities: dictionary mapping each word to its probability
            in the vocabulary (general vocabulary, not by the author).
        """
        self.name = name
        self.text = text
        self.alpha = alpha
        self.tokens = tokens
        self.num_tokens = len(self.tokens)
        # We also build for you a dictionary mapping each word to its number of occurrences in the
        self.word_frequencies = {w: c for w, c in collections.Counter(self.tokens).items()}
        # We store the underlying word probabilities in the dictionary.
        self.vocabulary_word_probabilities = vocabulary_probabilities if vocabulary_probabilities is not None else {}


    def theta(self, word):
        """Returns the probability that an author generates a given word.
        This is the theta in the above mathematical formula.
        """
        kw = self.word_frequencies[word] if word in self.word_frequencies else 0
        pw = self.vocabulary_word_probabilities[word] if word in self.vocabulary_word_probabilities else 0
        return (kw + self.alpha * pw) / (self.num_tokens + self.alpha)

    def author_likelyhood(self, token_list):
        """Returns the log-likelyhood of the author having written the given text.
        The given text is given purely as a sequence of tokens.
        """
        total = 0
        given_text_word_frequencies = {w: c for w, c in collections.Counter(token_list).items()}
        for w in given_text_word_frequencies:
          total += (given_text_word_frequencies[w] * math.log(self.theta(w)))
        return total


In [None]:
c = collections.Counter(all_tokens)
num_tokens = len(all_tokens)
probs = {word: count / num_tokens for word, count in c.items()}

Ok.  Now, we can create our three authors.

In [None]:
true_books = {
    'doyle': book_tokens[1],
    'austin': book_tokens[2],
    'christie': book_tokens[3],
}

# We can build authors from these writing samples.
authors = {
    name: Author(name, tokens, vocabulary_probabilities=probs) for name, tokens in true_books.items()
}


Now, given a piece of text, let us write a function that returns the most likely author.

In [None]:
def most_likely_author(authors, tokens):
    return max(authors, key=lambda a: authors[a].author_likelyhood(tokens))


Let us check that it works for the authors from which it has been trained -- it has better to!

In [None]:
# Tests 10 points: Training set is correctly attributed.
for name, tokens in true_books.items():
    most_likely = most_likely_author(authors, tokens)
    print(f"{name} is attributed to {most_likely}")
    assert name == most_likely


doyle is attributed to doyle
austin is attributed to austin
christie is attributed to christie


Now, let's check the other attributions!

In [None]:
for n in range(0, 10):
    print(most_likely_author(authors, book_tokens[n]))

austin
doyle
austin
christie
doyle
doyle
doyle
doyle
doyle
christie


Uhm, this is... ok... but: there are some books above that are not written by either of Doyle, Austin, or Christie.  We want a version of `most_likely_author` that can return a tuple, consisting of the best author, its log-likelihood, and the difference in log-likelyhood between first and second attribution.  
I let you write it.

In [None]:
def attribution(authors, tokens):
    """Returns a tuple consisting of:
    * name of the best author
    * log-likelyhood of the best author.
    * the difference in log-likelyhood between first and second most likely authors.
    """
    # Just to avoid everything breaking if you leave the placeholder blank.
    author = most_likely_author(authors, tokens)

    authors_map = {}
    for a in authors:
      author = authors[a]
      authors_map[a] = author.author_likelyhood(tokens)

    author2 = sorted(authors_map.items())[-2][0]

    result = (
        most_likely_author(authors, tokens),
        authors_map[most_likely_author(authors, tokens)],
        authors_map[most_likely_author(authors, tokens)] - authors_map[author2]
    )
    return result

Here is a test to help you check your code.

In [None]:
# Tests 10 points: attribution

a, l, d = attribution(authors, book_tokens[0])
print(a, l, d)
assert a == "austin"
assert -63000 < l < -62000
assert 6400 < d < 6500

austin -62545.83572405311 6490.46375791086


We can now print some attributions.

In [None]:
for n in range(0, 10):
    a, l, d = attribution(authors, book_tokens[n])
    print(f"Text {n}: Attributed to {a}, likelyhood {l}, difference from next likely: {d}")

Text 0: Attributed to austin, likelyhood -62545.83572405311, difference from next likely: 6490.46375791086
Text 1: Attributed to doyle, likelyhood -51049.5990972075, difference from next likely: 12372.111709314457
Text 2: Attributed to austin, likelyhood -54683.66775402256, difference from next likely: 14748.195974026072
Text 3: Attributed to christie, likelyhood -55587.56025347501, difference from next likely: 0.0
Text 4: Attributed to doyle, likelyhood -70292.65086164579, difference from next likely: 1078.1875740217802
Text 5: Attributed to doyle, likelyhood -75462.0292245311, difference from next likely: 98.37916958754067
Text 6: Attributed to doyle, likelyhood -63349.268280002, difference from next likely: 2726.5613402772287
Text 7: Attributed to doyle, likelyhood -72188.7588726332, difference from next likely: 844.6176847602474
Text 8: Attributed to doyle, likelyhood -76350.13362141213, difference from next likely: 470.24902889663645
Text 9: Attributed to christie, likelyhood -648

Now, this is fine, but we can try to improve this result.  One hypothesis is that rare words, which are used in only portions of the texts by an author, can sway the results.  It might be a better idea, to produce a "fingerprint" of an author, to consider the frequency of relatively frequent words only.
For example, we can discard both the most frequent, and the least frequent, words.

In [None]:
LOW_THRESHOLD = 50
HIGH_THRESHOLD = 2000

We can then filter only the tokens that are frequent enough, and redo our author model.

In [None]:
c = collections.Counter(all_tokens)
most_common = [t for t, _ in c.most_common(HIGH_THRESHOLD)[LOW_THRESHOLD:]]

def filter_tokens(tokens):
    return [t for t in tokens if t in most_common]

In [None]:
new_book_tokens = list(map(filter_tokens, book_tokens))
new_all_tokens = list(map(filter_tokens, all_tokens))
new_num_tokens = len(new_all_tokens)
c = collections.Counter(all_tokens)
new_probs = {word: count / new_num_tokens for word, count in c.items()}

In [None]:
new_true_books = {
    'doyle': new_book_tokens[1],
    'austin': new_book_tokens[2],
    'christie': new_book_tokens[3],
}

# We can build authors from these writing samples.
new_authors = {
    name: Author(name, tokens, alpha=100, vocabulary_probabilities=new_probs) for name, tokens in new_true_books.items()
}


Let's look at the attributions.

In [None]:
for n in range(0, 10):
    a, l, d = attribution(new_authors, new_book_tokens[n])
    print(f"Text {n}: Attributed to {a}, likelyhood {l}, difference from next likely: {d}")

Text 0: Attributed to austin, likelyhood -30567.859752043467, difference from next likely: 5480.522747214603
Text 1: Attributed to doyle, likelyhood -25072.632555619934, difference from next likely: 6850.968481753829
Text 2: Attributed to austin, likelyhood -27354.669893484013, difference from next likely: 9205.041184312522
Text 3: Attributed to christie, likelyhood -28518.835141421245, difference from next likely: 0.0
Text 4: Attributed to doyle, likelyhood -31739.92316895656, difference from next likely: 973.7540347390823
Text 5: Attributed to doyle, likelyhood -31619.023436477302, difference from next likely: 12.950940240451018
Text 6: Attributed to doyle, likelyhood -29387.587715004964, difference from next likely: 2357.2222331456687
Text 7: Attributed to doyle, likelyhood -30123.4028306915, difference from next likely: 818.883843137417
Text 8: Attributed to doyle, likelyhood -30777.66571182321, difference from next likely: 160.76767802321046
Text 9: Attributed to christie, likelyh

Now, there are two impostors in the list: two texts that are in fact not authored by any of Austin, Doyle, or Christie.  Can you tell which ones they are?  Hint: a measure of confidence is the difference between most likely attribution, and runner-up.

Give a response with the form:

    wrong_attributions = [6, 7]

where the numbers refer to the texts above.

In [None]:
# Set the wrong_attributions variable to the wrong attributions.

wrong_attributions = [3, 9]


In [None]:
# Tests 10 points: wrong attributions.

pass


Does this work well?  So-so, right?  We will try another approach to authorship attribution in a future homework.