In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
nbdir = "/content/gdrive/My Drive/DSCI521/Colab/02-textual/"

Mounted at /content/gdrive


In [2]:
%cd /content/gdrive/My\ Drive/DSCI521/Colab/02-textual/

/content/gdrive/My Drive/DSCI521/Colab/02-textual


# DSCI 521: Methods for analysis and interpretation <br>Chapter 2: Feature engineering and language processing

## Exercises
Note: numberings refer to the main notes.

#### 2.1.1.3 Exercise: Regex phone numbers
Read the file `phone-numbers.txt`. It contains a phone number in each line. \[Hint: use something like `lines = open("file.txt", "r").readlines()`\] Store only the phone numbers with the area code "215" in a list and print it out. Use regex-based pattern matching, not any other methods which occur to you.

In [3]:
import re
document = open("./data/phone-numbers.txt", "r").read()

numbers = re.findall('215-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]', 
                     document)
numbers

['215-345-3463', '215-756-8273']

#### 2.1.1.8 Exercise: Names of the gods
In the cell below is some text. It's an extract from [A Clash of Kings](https://www.goodreads.com/book/show/10572.A_Clash_of_Kings), specifically, about a character's prayer to some fictional gods. Use regex to extract the names of these gods. Your output should be a list that looks something like `["the Father", "the Mother", "the Warrior"]`.

In [4]:
text = 'Lost and weary, Catelyn Stark gave herself over to her gods. She knelt before the Smith, who fixed things that were broken, and asked that he give her sweet Bran his protection. She went to the Maid and beseeched her to lend her courage to Arya and Sansa, to guard them in their innocence. To the Father, she prayed for justice, the strength to seek it and the wisdom to know it, and she asked the Warrior to keep Robb strong and shield him in his battles. Lastly she turned to the Crone, whose statues often showed her with a lamp in one hand. "Guide me, wise lady," she prayed. "Show me the path I must walk, and do not let me stumble in the dark places that lie ahead."'

gods = re.findall("the [A-Z][a-z]+", text)

gods

['the Smith', 'the Maid', 'the Father', 'the Warrior', 'the Crone']

#### 2.1.2.4 Exercise: Improving a regex-based sentence tokenizer
First, write a few sentences in a complex (but grammatically acceptable) way so that the (above) regex-based tokenizer breaks. Then, fix the pattern so that the tokenizer can handle your text appropriately.

In [5]:
## regex-based sentence tokenizer
sentences = "With all due resp., I don't think this is a very good tokenization! Here's another one!"
sentences_tokenized = re.split("\s*(?<=[\.\?\!][^a-zA-Z0-9,])\s*", sentences)
sentences_tokenized

  return _compile(pattern, flags).split(string, maxsplit)


["With all due resp., I don't think this is a very good tokenization!",
 "Here's another one!"]

#### 2.1.3.2 Exercise: POS tagging 
Apply POS tagging to a sentence of your choosing and filter for only verbs and nouns.

In [6]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

running_sentence = "Use some of our test sentences; Joey's not very smart, nor charming."
doc = nlp(running_sentence)

print("token\tcoarse\tfine")
for token in doc:
    if token.pos_ in {"NOUN", "VERB", "PROPN"}:
        print(token.text + "\t" + token.pos_ + "\t" + token.tag_)

token	coarse	fine
Use	VERB	VB
test	NOUN	NN
sentences	NOUN	NNS
Joey	PROPN	NNP


#### 2.1.3.5 Exercise: using grammar for information extraction
Apply the spacy grammatical parsing and extract any subject-verb token pairs.

In [7]:
running_sentence = "Let's use another one. Anything else? Happy hour is tomorrow at 5:30 at Tap House where we will all meet up and say hi."
doc = nlp(running_sentence)

print("subject\tverb")
for token in doc:
    if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
        print(token.text + " "+ token.head.text)

subject	verb
's use
we meet


#### 2.1.4.4 Exercise: improved word frequency representation
Build a stop word list and lemmatization strategy (potentially using POS tags) to compute 'better' word frequencies, as you see fit.

In [8]:
from collections import Counter

text = """Word frequencies are probably the first and easiest 
numerical representation of text to compute. In some communities, 
this is referred to as the bag of words (BOW) model. 
Put simply, the BOW model simply counts up the 
number of times each word appears in a document. 
This of course depends on a few things, e.g., case and lemmatization. 
However, constructing a basic BOW model is quite straightforward, especially using `Counter`. 
Let's use this very paragraph as our example text for the BOW model."""

# in addition to excluding stop words, let's also exclude specific parts of speech, like determiners, particles,
# punctuation, and adpositions.

stop_words = {'\n', ',', '.', '`', 'the', 'and', 'of'}
excluded_pos = {"DET", "PART", "PUNCT", "ADP"}

doc = nlp(text)
word_counts = Counter()

for word in doc:
    if word.lemma_ not in stop_words and word.pos_ not in excluded_pos:
        word_counts[(word.lemma_)] += 1

word_counts.most_common(25)

[('model', 4),
 ('word', 3),
 ('be', 3),
 ('BOW', 3),
 ('text', 2),
 ('as', 2),
 ('simply', 2),
 ('use', 2),
 ('frequency', 1),
 ('probably', 1),
 ('first', 1),
 ('easy', 1),
 ('numerical', 1),
 ('representation', 1),
 ('compute', 1),
 ('community', 1),
 ('refer', 1),
 ('bag', 1),
 ('Put', 1),
 ('count', 1),
 ('number', 1),
 ('time', 1),
 ('appear', 1),
 ('document', 1),
 ('course', 1)]

#### 2.1.6.5 Exercise: exploring TF-IDF
Rank each of the example TF-IDF matrix's rows by TF-IDF values from high-to-low and interpret the kinds of words that have high TF-IDF values, i.e., are 'more important'. What about the low values, what kinds of words are these?

In [9]:
import numpy as np

def count_words(sentence):
    frequency = Counter()
    for word in sentence:
        frequency[word.text.lower()] += 1
    return frequency

text = '''Lost and weary, Catelyn Stark gave herself over to her gods. 
She knelt before the Smith, who fixed things that were broken, 
and asked that he give her sweet Bran his protection. 
She went to the Maid and beseeched her to lend her courage to Arya and Sansa, 
to guard them in their innocence. 
To the Father, she prayed for justice, the strength to seek it and the wisdom to know it, 
and she asked the Warrior to keep Robb strong and shield him in his battles. 
Lastly she turned to the Crone, whose statues often showed her with a lamp in one hand. 
"Guide me, wise lady," she prayed. 
"Show me the path I must walk, and do not let me stumble in the dark places that lie ahead."
'''

doc = nlp(text)
    
## the 'master' set, keeps track of the words in all documents
all_words = set()

## store the word frequencies by book
all_doc_frequencies = {}

## loop over the sentences
for j, sentence in enumerate(doc.sents):
    frequency = count_words(sentence)
    all_doc_frequencies[j] = frequency
    doc_words = set(frequency.keys())
    all_words = all_words.union(doc_words)
    
## create a matrix of zeros: (words) x (documents)
TDM = np.zeros((len(all_words),len(all_doc_frequencies)))
## fix a word ordering for the rows
all_words = sorted(list(all_words))
## loop over the (sorted) document numbers and (ordered) words; fill in matrix
for j in all_doc_frequencies:
    for i, word in enumerate(all_words):
        TDM[i,j] = all_doc_frequencies[j][word]

num_docs = TDM.shape[1]

## start off with a copy of our TDM (frequencies)
TFIDF = np.array(TDM)
## loop over words
for i, word in enumerate(all_words):
    ## count docs containing the word
    num_docs_containing_word = len([x for x in TDM[i] if x])
    ### computen the inverse document frequence of this word
    IDF = -np.log2(num_docs_containing_word/num_docs)
    ## multiply this row by the IDF to transform it to TFIDF
    TFIDF[i,] = TFIDF[i,]*IDF

In [10]:
for j in range(TFIDF.shape[1]):
    doc_vals = TFIDF[:,j]
    
    # make word and TF-IDF value tuples, put them in a list, sort the list according to TF-IDF values, then only keep words with non-zero TF-IDF 
    
    words_and_vals = [(word, val) for word, val in sorted(zip(all_words, doc_vals), key = lambda x: x[1], reverse = True) if val]
    print("For document #" + str(j) + ", words ranked according to TF-IDF are:\n")
    for word, val in words_and_vals:
        print(word + "\t" + str(round(val, 4)))
    print()

For document #0, words ranked according to TF-IDF are:

catelyn	2.8074
gave	2.8074
gods	2.8074
herself	2.8074
lost	2.8074
over	2.8074
stark	2.8074
weary	2.8074
her	0.8074
to	0.8074
and	0.4854

For document #1, words ranked according to TF-IDF are:

that	3.6147
before	2.8074
bran	2.8074
broken	2.8074
fixed	2.8074
give	2.8074
he	2.8074
knelt	2.8074
protection	2.8074
smith	2.8074
sweet	2.8074
things	2.8074
were	2.8074
who	2.8074
asked	1.8074
his	1.8074
her	0.8074
and	0.4854
she	0.4854
the	0.4854

For document #2, words ranked according to TF-IDF are:

to	3.2294
arya	2.8074
beseeched	2.8074
courage	2.8074
guard	2.8074
innocence	2.8074
lend	2.8074
maid	2.8074
sansa	2.8074
their	2.8074
them	2.8074
went	2.8074
her	1.6147
and	0.9709
in	0.8074
she	0.4854
the	0.4854

For document #3, words ranked according to TF-IDF are:

it	5.6147
to	3.2294
battles	2.8074
father	2.8074
for	2.8074
him	2.8074
justice	2.8074
keep	2.8074
know	2.8074
robb	2.8074
seek	2.8074
shield	2.8074
strength	2.8074
strong	2.807

It seems that words that are rare across documents have higher TF-IDF values. The lower the TF-IDF value, the more common the word.

## Additional In-depth Exercises

### A. Constructing co-occurrence matrix statistics

#### A.1 Build a tokenizer
To start, build a tokenization function called `tokens = tokenize(text, space = False)` that accepts a string called `text`, in addition to a boolean argument called `space`, which if positive will allow the tokenize function to determine if whitespace characters (at all) should be stored as a part of the list of `tokens` output.

For this part of the exercise, use the character-class `'[0-9a-zA-Z'-]'` (or it's complimentary character class) to split on non-delimiters, but be sure to capture all portions of the text that are 'split' using a grouping mechanism. Likewise, ensure that all non-word-type tokens are completely resolved, e.g., there _shouldn't_ be any tokens which consist of multiple punctuation characters, such as `".\""`, which should be sub-divided into multiple tokens.

Likewise, be sure to collapse any multiple whitespace `" "` characters down to just one as an initial pre-processing step to the `text`.

In [11]:
def tokenize(text, space = False):
    text = re.sub("[ ]+", " ", text)
    tokens = []
    for token in re.split("([0-9a-zA-Z'-]+)", text):
        if not space:
            token = re.sub("[ ]+", "", token)
        if not token:
            continue
        if re.search("[0-9a-zA-Z'-]", token):                    
            tokens.append(token)
        else: 
            tokens.extend(token)
    return tokens

In [12]:
tokenize('This is an example with the space flag as True!', True)

['This',
 ' ',
 'is',
 ' ',
 'an',
 ' ',
 'example',
 ' ',
 'with',
 ' ',
 'the',
 ' ',
 'space',
 ' ',
 'flag',
 ' ',
 'as',
 ' ',
 'True',
 '!']

In [13]:
tokenize('This is an example with the space flag as False!', False)

['This',
 'is',
 'an',
 'example',
 'with',
 'the',
 'space',
 'flag',
 'as',
 'False',
 '!']

#### A.2 Build a word-sentence tokenizer
Here, the goal will be to produce a two-level tokenization utility that is similar to what Spacy produces:

In [14]:
for s in doc.sents:
    print([w.text for w in s])

['Lost', 'and', 'weary', ',', 'Catelyn', 'Stark', 'gave', 'herself', 'over', 'to', 'her', 'gods', '.', '\n']
['She', 'knelt', 'before', 'the', 'Smith', ',', 'who', 'fixed', 'things', 'that', 'were', 'broken', ',', '\n', 'and', 'asked', 'that', 'he', 'give', 'her', 'sweet', 'Bran', 'his', 'protection', '.', '\n']
['She', 'went', 'to', 'the', 'Maid', 'and', 'beseeched', 'her', 'to', 'lend', 'her', 'courage', 'to', 'Arya', 'and', 'Sansa', ',', '\n', 'to', 'guard', 'them', 'in', 'their', 'innocence', '.', '\n']
['To', 'the', 'Father', ',', 'she', 'prayed', 'for', 'justice', ',', 'the', 'strength', 'to', 'seek', 'it', 'and', 'the', 'wisdom', 'to', 'know', 'it', ',', '\n', 'and', 'she', 'asked', 'the', 'Warrior', 'to', 'keep', 'Robb', 'strong', 'and', 'shield', 'him', 'in', 'his', 'battles', '.', '\n']
['Lastly', 'she', 'turned', 'to', 'the', 'Crone', ',', 'whose', 'statues', 'often', 'showed', 'her', 'with', 'a', 'lamp', 'in', 'one', 'hand', '.', '\n']
['"', 'Guide', 'me', ',', 'wise', 'lad

with the caveat that we use our own tokenization utility (which can be flagged to retain space characters).
Since this will then require the utilization of a sentence tokenizer, download `nltk` (if you haven't already) and utilize its `sent_tokenize()` function.

In [17]:
from nltk import sent_tokenize
import nltk
nltk.download('punkt')

def word_sentence_tokenize(text, space = False):
    sentences = []
    for sentence in sent_tokenize(text):
        sentences.append(tokenize(sentence.strip(), space))
    return sentences

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [18]:
print(word_sentence_tokenize(text))

[['Lost', 'and', 'weary', ',', 'Catelyn', 'Stark', 'gave', 'herself', 'over', 'to', 'her', 'gods', '.'], ['She', 'knelt', 'before', 'the', 'Smith', ',', 'who', 'fixed', 'things', 'that', 'were', 'broken', ',', '\n', 'and', 'asked', 'that', 'he', 'give', 'her', 'sweet', 'Bran', 'his', 'protection', '.'], ['She', 'went', 'to', 'the', 'Maid', 'and', 'beseeched', 'her', 'to', 'lend', 'her', 'courage', 'to', 'Arya', 'and', 'Sansa', ',', '\n', 'to', 'guard', 'them', 'in', 'their', 'innocence', '.'], ['To', 'the', 'Father', ',', 'she', 'prayed', 'for', 'justice', ',', 'the', 'strength', 'to', 'seek', 'it', 'and', 'the', 'wisdom', 'to', 'know', 'it', ',', '\n', 'and', 'she', 'asked', 'the', 'Warrior', 'to', 'keep', 'Robb', 'strong', 'and', 'shield', 'him', 'in', 'his', 'battles', '.'], ['Lastly', 'she', 'turned', 'to', 'the', 'Crone', ',', 'whose', 'statues', 'often', 'showed', 'her', 'with', 'a', 'lamp', 'in', 'one', 'hand', '.'], ['"', 'Guide', 'me', ',', 'wise', 'lady', ',', '"', 'she', 'pr

#### A.3 Try to re-construct the document
Now that we have the two-stage tokenizer which can retain space characters, let's try an re-construct a document from its tokenization, with and without `space=True`.

In particular, consider how to re-join the elements of the two-level list (sentences) of lists (words) of strings by a delimiter so as to re-construct the document.

In [19]:
## with space=True
"".join(["".join(sentence) for sentence in word_sentence_tokenize(text, True)])

'Lost and weary, Catelyn Stark gave herself over to her gods.She knelt before the Smith, who fixed things that were broken, \nand asked that he give her sweet Bran his protection.She went to the Maid and beseeched her to lend her courage to Arya and Sansa, \nto guard them in their innocence.To the Father, she prayed for justice, the strength to seek it and the wisdom to know it, \nand she asked the Warrior to keep Robb strong and shield him in his battles.Lastly she turned to the Crone, whose statues often showed her with a lamp in one hand."Guide me, wise lady," she prayed."Show me the path I must walk, and do not let me stumble in the dark places that lie ahead."'

In [20]:
## with space=False
" ".join([" ".join(sentence) for sentence in word_sentence_tokenize(text, False)])

'Lost and weary , Catelyn Stark gave herself over to her gods . She knelt before the Smith , who fixed things that were broken , \n and asked that he give her sweet Bran his protection . She went to the Maid and beseeched her to lend her courage to Arya and Sansa , \n to guard them in their innocence . To the Father , she prayed for justice , the strength to seek it and the wisdom to know it , \n and she asked the Warrior to keep Robb strong and shield him in his battles . Lastly she turned to the Crone , whose statues often showed her with a lamp in one hand . " Guide me , wise lady , " she prayed . " Show me the path I must walk , and do not let me stumble in the dark places that lie ahead . "'

#### A.4 Write a function that loads/processes a document from file
Write a function called `load_data(path, space = False)` which accepts a `path` string to identify the direct location of a text file. Upon loading the specified file, construct an (output) dictionary called `data` with three key-value pairs:

- `'sentences'`: output of word_sentence_tokenize applied to document,
- `'counts'`: a dictionary of integer counts of all tokens in the document,
- `'type_index'`: a dictionary linking tokens to indices for their order of appearance.

Test this code on the books in the local `'./data/books/'` directory, e.g., `'./data/books/84.txt'` is a copy of "Frankenstein..." (other metadata can be found in `'./data/books/metadata.json'`).

In [21]:
def load_data(path, space = False):
    data = {'sentences': word_sentence_tokenize(open(path).read().strip().lower(), space)}
    data['counts'] = dict(Counter([t for s in data['sentences'] for t in s]))
    data['type_index'] = {t: i for i, t in enumerate(data['counts'])}
    return data

In [22]:
data = load_data('./data/books/84.txt', True)

#### A.5 Build a context generator
Now write a function called `get_context(i, sentence, m = 0, weight = 0)` to produce a 'sliding-window' context (list of surrounding tokens) for the token of index `i` in an already tokenized `sentence` (a list of strings). Optional non-negative arguments `m` (an integer) and `weight` (a float) specify the size of the context window and the relative weights of context elements.

Specifically, `m` tokens should be taken to both the left and right of token `i` (all should be taken when the default `m=0` is set. 

Finally, `weight` should determine how to return in a list named `weights`, which should be numeric and of length equal to that of the `context`. The contents of `weights` should be the reciprocal of the absolute distance to the center token, i.e., the token of index `i`---_raised to the power valued by `weight`_. Note: this ensures setting `weight=0` 'turns off' the weights.

In [23]:
def get_context(i, sentence, m = 0, weight = 0):
    context = np.array(sentence)
    weights = np.abs(np.array(range(len(sentence))) - i)
    if m:
        mask = (weights != 0) & (weights <= m)
    else:
        mask = (weights != 0)
    context = context[mask]
    if weight:
        weights = 1/(weights[mask]**weight)
    else:
        weights = weights[mask]*weight + 1.
    return context, weights

In [24]:
get_context(4, tokenize('This is an example with the space flag as False!', False), 0, 1)

(array(['This', 'is', 'an', 'example', 'the', 'space', 'flag', 'as',
        'False', '!'], dtype='<U7'),
 array([0.25      , 0.33333333, 0.5       , 1.        , 1.        ,
        0.5       , 0.33333333, 0.25      , 0.2       , 0.16666667]))

#### A.6 Compute a co-occurrence matrix
Finally, we'll utilize our context model and two-stage tokenizer to build a co-occurrence matrix with weighted contexts.

In particular, build a function called `compute_co_occurrence_matrix(data, m = 0, weight = 0)` that accepts the `data` output from `load_data()` and constructs `X`&mdash;an `N` (the vocabulary size) by `N` matrix with each row (token) and column (context) corresponding to the _total `weight`_ in which context tokens appear in the `m`-context windows of 'center' tokens. 

Note: the rows and columns of `X` should be in the order specified by `data['type_index']`.

In [25]:
def compute_co_occurrence_matrix(data, m = 0, weight = 0):
    N = len(data['type_index'])
    X = np.zeros((N, N))    
    for sentence in data['sentences']:
        for i, t in enumerate(sentence):
            context, weights = get_context(i, sentence, m, weight)            
            for c, w in zip(context, weights):
                X[data['type_index'][t],data['type_index'][c]] += w
    return X

In [26]:
X = compute_co_occurrence_matrix(data, m = 0, weight = 1)

#### A.7 Build a similarity function to sanity check our model
Here, we should build a cosine-similarity comparer: `most_similar(t, type_index, X, top=10)` that accepts a token `t` and the `type_index` (from `data['type_index']`), the latter of which should link any string to the rows/columns of `X`. The final arguemnt `top` specifies how many results the function should produce in output. Finally, this output should (as in Chapter 1) consist of a sorted (high-to-low, by similarity) list of `(token, similarity)` tuples.

In [27]:
def most_similar(t, type_index, X, top=10):
    vec = X / np.linalg.norm(X, axis=1)[:, np.newaxis]
    v = vec[type_index[t],:]
    similar = sorted(enumerate(list(vec.dot(v))), 
                     key = lambda x: x[1], reverse = True)
    types = list(type_index.keys())
    if not top: top = len(vec.shape[0])
    sims = [(types[ix], sim) for ix, sim in similar[:top] if ix != type_index[t]]
    return sims
most_similar(' ', data['type_index'], X, top=10)

[('\n', 0.9979221135186093),
 ('night', 0.9964860035918699),
 ('youth', 0.9963653051872496),
 ('rest', 0.996178645914909),
 ('conversation', 0.9961603312382916),
 ('light', 0.9961374186551338),
 ('murder', 0.9960937314047589),
 ('work', 0.9960868608188356),
 ('town', 0.9960474824393188)]

In [28]:
most_similar('.', data['type_index'], X, top=10)

[(',', 0.9987401007903789),
 (';', 0.9983632870995872),
 ('from', 0.9979362111260544),
 ('with', 0.9978875525838681),
 ('in', 0.9978153315451614),
 ('gave', 0.9977828700178513),
 ('made', 0.9977621623725164),
 ('the', 0.9977579179934447),
 ('every', 0.997696092191149)]

In [29]:
most_similar('she', data['type_index'], X, top=10)

[('he', 0.9992915660314543),
 ('i', 0.9992414557834342),
 ('it', 0.9989050254483557),
 ('her', 0.998874469254197),
 ('who', 0.9987882861329731),
 ('very', 0.9987420408496845),
 ('so', 0.9987201503754392),
 ('all', 0.9987188616943543),
 ('and', 0.9987021137096252)]

In [30]:
most_similar('he', data['type_index'], X, top=10)

[('i', 0.9996188111294829),
 ('she', 0.9992915660314543),
 ('so', 0.9992129104853836),
 ('and', 0.9991085711836337),
 ('all', 0.9990503074413479),
 ('who', 0.999023137972763),
 ('this', 0.9990098627247053),
 ('we', 0.9989412910775652),
 ('or', 0.9989385808026763)]