In [1]:
# Toy corpus: eight short English text snippets, one string per document.
corpus = ["I am an invisible man",
          "The story so far: in the beginning, the universe was created. This has made a lot of people very angry and been widely regarded as a bad move",
          "Mother died today. Or maybe, yesterday; I can't be sure",
          "It was a queer, sultry summer, the summer they electrocuted the Rosenbergs, and I didn’t know what I was doing in New York",
          "Ships at a distance have every man’s wish on board",
          "We were somewhere around Barstow on the edge of the desert when the drugs began to take hold",
          "It was a bright cold day in April, and the clocks were striking thirteen",
          "As Gregor Samsa awoke one morning from uneasy dreams he found himself transformed in his bed into a gigantic insect"]

# Naive baseline: str.split() only splits on whitespace, so punctuation stays
# attached to the neighbouring word (e.g. 'today.').
document = corpus[2]
print(document.split())

['Mother', 'died', 'today.', 'Or', 'maybe,', 'yesterday;', 'I', "can't", 'be', 'sure']


In [2]:
# %%
import nltk
import nltk.tokenize

# Download the NLTK 'punkt' tokenizer package (needed by word_tokenize below).
nltk.download('punkt', quiet=True)

# Tokenize the fourth document with NLTK instead of str.split(): punctuation
# marks now come out as separate tokens.
document = corpus[3]
print(nltk.tokenize.word_tokenize(document, language='english'))

['It', 'was', 'a', 'queer', ',', 'sultry', 'summer', ',', 'the', 'summer', 'they', 'electrocuted', 'the', 'Rosenbergs', ',', 'and', 'I', 'didn', '’', 't', 'know', 'what', 'I', 'was', 'doing', 'in', 'New', 'York']


In [3]:
# We will only keep tokens that are not punctuation marks

import re


# Matches a string that consists only of punctuation characters.
# ``\w`` counts the underscore as a word character, so it is listed
# explicitly; ``\Z`` anchors at the very end of the string, whereas ``$``
# would also accept a trailing newline.
PUNCT_RE = re.compile(r'(?:[^\w\s]|_)+\Z')


def is_punct(string):
    """Check if STRING is a punctuation marker or a sequence of
       punctuation markers.

    A string counts as punctuation only if every one of its characters
    is neither alphanumeric nor whitespace. The underscore is treated as
    punctuation. The empty string, and strings that contain whitespace
    (including a trailing newline), are not punctuation.

    Arguments:
        string (str): a string to check for punctuation markers.

    Returns:
        bool: True if string is a (sequence of) punctuation marker(s),
            False otherwise.

    Examples:
        >>> is_punct("!")
        True
        >>> is_punct("Bonjour!")
        False
        >>> is_punct("¿Te gusta el verano?")
        False
        >>> is_punct("...")
        True
        >>> is_punct("«»...")
        True
        >>> is_punct("_")
        True
        >>> is_punct("")
        False

    """
    return PUNCT_RE.match(string) is not None

In [4]:
# Tokenize the third document; punctuation tokens are filtered out below.
tokens = nltk.tokenize.word_tokenize(corpus[2], language='english')

# Loop with a standard for-loop: keep every token that is_punct() does not
# classify as punctuation.
tokenized = []
for token in tokens:
    if not is_punct(token):
        tokenized.append(token)
print(tokenized)

['Mother', 'died', 'today', 'Or', 'maybe', 'yesterday', 'I', 'ca', "n't", 'be', 'sure']


In [5]:
def preprocess_text(text, language, lowercase=True):
    """Turn a raw text into a list of word tokens.

    The text is optionally lowercased, split into tokens with NLTK's
    ``word_tokenize`` and stripped of every token that ``is_punct``
    classifies as punctuation.

    Arguments:
        text (str): the text to preprocess.
        language (str): name of the language of ``text``; passed on to
            the NLTK tokenizer.
        lowercase (bool, optional): lowercase the text before
            tokenizing. Defaults to True.

    Returns:
        list: the word tokens of ``text`` in their original order,
            without punctuation tokens.

    Examples:
        >>> preprocess_text("Ah! Monsieur, c'est donc vous?", 'french')
        ['ah', 'monsieur', "c'est", 'donc', 'vous']

    """
    source = text.lower() if lowercase else text
    return [
        candidate
        for candidate in nltk.tokenize.word_tokenize(source, language=language)
        if not is_punct(candidate)
    ]

In [6]:
# Compare the raw text with its preprocessed tokens for the third and fourth
# documents (corpus[2:4]).
for document in corpus[2:4]:
    print('Original:', document)
    print('Tokenized:', preprocess_text(document, 'english'))

Original: Mother died today. Or maybe, yesterday; I can't be sure
Tokenized: ['mother', 'died', 'today', 'or', 'maybe', 'yesterday', 'i', 'ca', "n't", 'be', 'sure']
Original: It was a queer, sultry summer, the summer they electrocuted the Rosenbergs, and I didn’t know what I was doing in New York
Tokenized: ['it', 'was', 'a', 'queer', 'sultry', 'summer', 'the', 'summer', 'they', 'electrocuted', 'the', 'rosenbergs', 'and', 'i', 'didn', 't', 'know', 'what', 'i', 'was', 'doing', 'in', 'new', 'york']


In [7]:
import collections

# Count how often every preprocessed token occurs across the whole corpus.
vocabulary = collections.Counter()
for document in corpus:
    vocabulary.update(preprocess_text(document, 'english'))

In [8]:
# Show the five most frequent tokens together with their counts.
print(vocabulary.most_common(n=5))

[('the', 9), ('a', 6), ('i', 4), ('in', 4), ('was', 4)]


In [9]:
# len() of the Counter is the number of distinct tokens in the corpus.
print('Original vocabulary size:', len(vocabulary))
# Keep only the tokens that occur more than once in the corpus.
pruned_vocabulary = {token for token, count in vocabulary.items() if count > 1}
print(pruned_vocabulary)
print('Pruned vocabulary size:', len(pruned_vocabulary))

Original vocabulary size: 100
{'in', 'man', 'i', 'the', 'was', 'and', 'summer', 'on', 'of', 'it', 'were', 'as', 'a'}
Pruned vocabulary size: 13


In [10]:
def extract_vocabulary(tokenized_corpus, min_count=1, max_count=float('inf')):
    """Collect the vocabulary of a tokenized corpus.

    Word frequencies are counted over all documents together; a word is
    kept when its total frequency lies between ``min_count`` and
    ``max_count`` (both inclusive).

    Arguments:
        tokenized_corpus (list): the corpus as a list of documents,
            each document being a list of string tokens.
        min_count (int, optional): lowest corpus frequency a word may
            have to be included. Defaults to 1.
        max_count (int, optional): highest corpus frequency a word may
            have to be included. Defaults to inf.

    Returns:
        list: the unique words that satisfy the frequency bounds, in
            sorted order.

    Examples:
        >>> corpus = [['the', 'man', 'love', 'man', 'the'],
        ...           ['the', 'love', 'book', 'wise', 'drama'],
        ...           ['a', 'story', 'book', 'drama']]
        >>> extract_vocabulary(corpus, min_count=2)
        ['book', 'drama', 'love', 'man', 'the']

    """
    term_counts = collections.Counter()
    for tokens in tokenized_corpus:
        term_counts.update(tokens)
    # Counter keys are already unique, so they can be sorted directly.
    return sorted(
        term
        for term, frequency in term_counts.items()
        if min_count <= frequency <= max_count
    )

In [11]:
# Preprocess every document once, then build the vocabulary from the result.
# With the default min_count=1 no word is filtered out.
# NOTE: this rebinds `vocabulary` from the Counter built in an earlier cell to
# a sorted list of words.
tokenized_corpus = [preprocess_text(document, 'english') for document in corpus]
vocabulary = extract_vocabulary(tokenized_corpus)

In [12]:

# `vocabulary` is a list, so `word in vocabulary` would scan it for every
# single token. A set gives the same answers with constant-time lookups.
vocabulary_lookup = set(vocabulary)

# One Counter per document, holding the frequency of each in-vocabulary word.
bags_of_words = []
for document in tokenized_corpus:
    tokens = [word for word in document if word in vocabulary_lookup]
    bags_of_words.append(collections.Counter(tokens))

# Inspect the bag of words of the sixth document.
print(bags_of_words[5])

Counter({'the': 3, 'we': 1, 'were': 1, 'somewhere': 1, 'around': 1, 'barstow': 1, 'on': 1, 'edge': 1, 'of': 1, 'desert': 1, 'when': 1, 'drugs': 1, 'began': 1, 'to': 1, 'take': 1, 'hold': 1})


Novel Analysis

In [13]:
import nltk
import nltk.tokenize

# Download the NLTK 'punkt' tokenizer package (needed by word_tokenize).
nltk.download('punkt', quiet=True)

# Paths of the novels to analyse. They are opened relative to the current
# working directory; a missing file raises FileNotFoundError.
# NOTE: this rebinds `corpus` to a list of file names.
corpus = ["alice.txt","anne.txt","oz.txt"]
# Display titles, in the same order as the files above.
titles = ["Alice in Wonderland", "Anne of Green Gables", "Wizard of Oz"]
documents = []
for url in corpus:
    # The context manager closes the file handle as soon as it has been read,
    # also when reading fails.
    with open(url, encoding='utf-8') as f:
        text = f.read()
    documents.append(text)
# Sanity check: show the first 100 characters of the second novel.
print(documents[1][0:100])

FileNotFoundError: [Errno 2] No such file or directory: 'alice.txt'

In [None]:
import re


# Matches a string that consists only of punctuation characters.
# ``\w`` counts the underscore as a word character, so it is listed
# explicitly; ``\Z`` anchors at the very end of the string, whereas ``$``
# would also accept a trailing newline.
PUNCT_RE = re.compile(r'(?:[^\w\s]|_)+\Z')


def is_punct(string):
    """Check if STRING is a punctuation marker or a sequence of
       punctuation markers.

    A string counts as punctuation only if every one of its characters
    is neither alphanumeric nor whitespace. The underscore is treated as
    punctuation. The empty string, and strings that contain whitespace
    (including a trailing newline), are not punctuation.

    Arguments:
        string (str): a string to check for punctuation markers.

    Returns:
        bool: True if string is a (sequence of) punctuation marker(s),
            False otherwise.
    """
    return PUNCT_RE.match(string) is not None

In [None]:
def preprocess_text(text, language, lowercase=True):
    """Turn a raw text into a list of word tokens.

    The text is optionally lowercased, split into tokens with NLTK's
    ``word_tokenize`` and stripped of every token that ``is_punct``
    classifies as punctuation.

    Arguments:
        text (str): the text to preprocess.
        language (str): name of the language of ``text``; passed on to
            the NLTK tokenizer.
        lowercase (bool, optional): lowercase the text before
            tokenizing. Defaults to True.

    Returns:
        list: the word tokens of ``text`` in their original order,
            without punctuation tokens.

    """
    source = text.lower() if lowercase else text
    return [
        candidate
        for candidate in nltk.tokenize.word_tokenize(source, language=language)
        if not is_punct(candidate)
    ]

In [None]:
# Tokenize every loaded novel; tokenized[i] is the token list of documents[i].
tokenized = []
for text in documents:
    tokenized.append(preprocess_text(text, "english"))

# Spot check: show a single token (the twelfth) of the first novel.
print(tokenized[0][11])

In [None]:
def extract_vocabulary(tokenized_corpus, min_count=1, max_count=float('inf')):
    """Collect the vocabulary of a tokenized corpus.

    Word frequencies are counted over all documents together; a word is
    kept when its total frequency lies between ``min_count`` and
    ``max_count`` (both inclusive).

    Arguments:
        tokenized_corpus (list): the corpus as a list of documents,
            each document being a list of string tokens.
        min_count (int, optional): lowest corpus frequency a word may
            have to be included. Defaults to 1.
        max_count (int, optional): highest corpus frequency a word may
            have to be included. Defaults to inf.

    Returns:
        list: the unique words that satisfy the frequency bounds, in
            sorted order.

    Examples:
        >>> corpus = [['the', 'man', 'love', 'man', 'the'],
        ...           ['the', 'love', 'book', 'wise', 'drama'],
        ...           ['a', 'story', 'book', 'drama']]
        >>> extract_vocabulary(corpus, min_count=2)
        ['book', 'drama', 'love', 'man', 'the']

    """
    term_counts = collections.Counter()
    for tokens in tokenized_corpus:
        term_counts.update(tokens)
    # Counter keys are already unique, so they can be sorted directly.
    return sorted(
        term
        for term, frequency in term_counts.items()
        if min_count <= frequency <= max_count
    )

# Keep only the words that occur at least twice across the whole corpus
# (min_count=2), then show the first 100 entries of the sorted vocabulary.
import collections
vocabulary = extract_vocabulary(tokenized, min_count=2)
print(vocabulary[0:100])

In [None]:
def corpus2dtm(tokenized_corpus, vocabulary):
    """Build a document-term matrix from a tokenized corpus.

    Arguments:
        tokenized_corpus (list): the corpus as a list of documents,
            each document being a list of string tokens.
        vocabulary (list): the unique words that make up the columns
            of the matrix, in column order.

    Returns:
        list: one row per document; each row is a list holding, for
            every word in `vocabulary`, how often it occurs in that
            document (0 if it does not occur).

    Examples:
        >>> tokenized_corpus = [['the', 'man', 'man', 'smart'],
        ...                     ['a', 'the', 'man', 'love'],
        ...                     ['love', 'book', 'journey']]
        >>> vocab = ['book', 'journey', 'man', 'love']
        >>> corpus2dtm(tokenized_corpus, vocab)
        [[0, 0, 2, 0], [0, 0, 1, 1], [1, 1, 0, 1]]

    """
    def count_row(tokens):
        # A Counter returns 0 for words the document does not contain.
        frequencies = collections.Counter(tokens)
        return [frequencies[term] for term in vocabulary]

    return [count_row(tokens) for tokens in tokenized_corpus]

import numpy as np

# Stack the per-document count rows into a 2-D array (documents x vocabulary).
document_term_matrix = np.array(corpus2dtm(tokenized, vocabulary))
# Column positions of the two words of interest; list.index raises ValueError
# if a word is not in the vocabulary.
girl_id = vocabulary.index('girl')
boy_id = vocabulary.index('boy')

# Slice out one column each: the count of the word in every document.
girl_counts = document_term_matrix[:, girl_id]
boy_counts = document_term_matrix[:, boy_id]
print("Girl: " + str(girl_counts))
print("Boy: " + str(boy_counts))

In [None]:
import matplotlib.pyplot as plt

# One group of bars per novel, positioned at x = 0, 1, 2, ...
x = np.arange(len(titles))
width = 0.3  # width of a single bar

fig, ax = plt.subplots()
# Shift the two series half a bar width left/right so the 'Girl' and 'Boy'
# bars of the same novel sit side by side around the tick.
rects1 = ax.bar(x - width/2, girl_counts, width, label='Girl')
rects2 = ax.bar(x + width/2, boy_counts, width, label='Boy')

# Label the axes and put the novel titles on the x ticks.
ax.set_ylabel('Word Count')
ax.set_title('Gender Binary Word Frequency')
ax.set_xticks(x)
ax.set_xticklabels(titles)
ax.legend()

fig.tight_layout()

plt.show()

In [None]:
# Represent each novel as a 2-element vector: [count of 'girl', count of 'boy'].
# Indices 0, 1, 2 follow the order in which the documents were loaded.
alice = np.array([girl_counts[0], boy_counts[0]])
anne = np.array([girl_counts[1], boy_counts[1]])
oz = np.array([girl_counts[2], boy_counts[2]])

In [None]:
def euclidean_distance(a, b):
    """Compute the Euclidean distance between two vectors.

    The inputs are converted to floating point before subtracting, so
    integer count vectors (including unsigned or narrow integer dtypes)
    cannot wrap around, and plain Python sequences are accepted too.

    Note: ``numpy.linalg.norm(a - b)`` computes the same quantity for
    float vectors.

    Arguments:
        a (numpy.ndarray): a vector of floats or ints.
        b (numpy.ndarray): a vector of floats or ints.

    Returns:
        float: The euclidean distance between vector a and b.

    Examples:
        >>> import numpy as np
        >>> a = np.array([1, 4, 2, 8])
        >>> b = np.array([2, 1, 4, 7])
        >>> round(float(euclidean_distance(a, b)), 2)
        3.87

    """
    # Subtracting or squaring in the input's integer dtype can silently
    # overflow (e.g. uint8: 0 - 20 == 236), which yields a wrong distance.
    difference = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    return np.sqrt(np.sum(difference ** 2))

# Pairwise Euclidean distances between the three novels' (girl, boy) count
# vectors, printed with two decimals.
aliceanne = euclidean_distance(alice, anne)
print(f'Alice - Anne: {aliceanne:.2f}')

aliceoz = euclidean_distance(alice, oz)
print(f'Alice - Oz: {aliceoz:.2f}')

anneoz = euclidean_distance(anne, oz)
print(f'Anne - Oz: {anneoz:.2f}')