In [1]:
import os
import csv
import subprocess
import re
import random
import numpy as np
from itertools import islice


def read_in_shakespeare():
    '''Loads the Shakespeare corpus, the vocabulary, and the play names from disk.

    Every row of the semicolon-delimited file 'Data/will_play_text.csv'
    contributes one tuple:
    tuple[0]: The name of the play (field 1 of the row).
    tuple[1]: The line of text (field 5 of the row) as a list of lowercased
              tokens. Every character that is not a letter a-z/A-Z, a digit,
              or whitespace is replaced by a space before splitting on
              whitespace.

    Returns:
    tuples: A list of (play_name, tokens) tuples, one per CSV row, in file order.
    document_names: The whitespace-stripped lines of 'Data/play_names.txt'.
    vocab: The whitespace-stripped lines of 'Data/vocab.txt'.
    '''
    non_token_chars = re.compile(r'[^a-zA-Z0-9\s]')

    with open('Data/will_play_text.csv') as corpus_file:
        tuples = [
            (row[1], [tok.lower() for tok in non_token_chars.sub(' ', row[5]).split()])
            for row in csv.reader(corpus_file, delimiter=';')
        ]

    with open('Data/vocab.txt') as vocab_file:
        vocab = [entry.strip() for entry in vocab_file]

    with open('Data/play_names.txt') as names_file:
        document_names = [entry.strip() for entry in names_file]

    return tuples, document_names, vocab

def get_row_vector(matrix, row_id):
    '''Returns row `row_id` of `matrix`, taking every column of that row.

    `matrix` must support two-axis indexing of the form matrix[row, column],
    e.g. a 2-D numpy array.
    '''
    every_column = slice(None)
    return matrix[row_id, every_column]

def get_column_vector(matrix, col_id):
    '''Returns column `col_id` of `matrix`, taking every row of that column.

    `matrix` must support two-axis indexing of the form matrix[row, column],
    e.g. a 2-D numpy array.
    '''
    every_row = slice(None)
    return matrix[every_row, col_id]

In [2]:
def create_term_document_matrix(line_tuples, document_names, vocab):
    '''Builds the term-document count matrix for the given tokenized lines.

    Inputs:
    line_tuples: A list of tuples, each holding the name of a document and
                    a tokenized line from that document.
    document_names: A list of the document names; position j in this list
                    is column j of the result.
    vocab: A list of the tokens in the vocabulary; position i in this list
                    is row i of the result.

    Let m = len(vocab) and n = len(document_names).

    Returns:
    td_matrix: A mxn numpy array of floats in which A_ij is the number of
                times word i occurs in the lines of document j.

    Raises:
    KeyError: if a line names a document that is not in document_names or
                contains a token that is not in vocab.
    '''
    row_of_word = {token: row for row, token in enumerate(vocab)}
    col_of_doc = {name: col for col, name in enumerate(document_names)}

    counts = np.zeros([len(vocab), len(document_names)])

    for entry in line_tuples:
        col = col_of_doc[entry[0]]  # column of the document this line belongs to
        for token in entry[1]:
            counts[row_of_word[token], col] += 1  # one more occurrence of token

    return counts

In [5]:
def create_term_context_matrix(line_tuples, vocab, context_window_size = 1):
    '''Returns a numpy array containing the term context matrix for the input lines.

    Inputs:
    line_tuples: A list of tuples, containing the name of the document and
                    a tokenized line from that document. Only the tokenized
                    line (element 1) is used here.
    vocab: A list of the tokens in the vocabulary
    context_window_size: How many positions to the left and to the right of a
                    word count as its context. Values below 1 yield an
                    all-zero matrix.

    Let n = len(vocab).

    Returns:
    tc_matrix: A nxn numpy array where A_ij contains the frequency with which
                word j was found within context_window_size to the left or right of
                word i in any line in the tuples. Windows never cross line
                boundaries.

    Raises:
    KeyError: if a line contains a token that is not in vocab.
    '''
    vocab_to_ID = {token: idx for idx, token in enumerate(vocab)}

    n = len(vocab)
    term_term_mat = np.zeros([n, n])

    for line_tuple in line_tuples:
        # Translate the whole line to vocabulary IDs once, rather than looking
        # each token up again every time it falls inside a neighbour's window.
        token_ids = [vocab_to_ID[token] for token in line_tuple[1]]
        line_len = len(token_ids)
        # Offsets beyond line_len - 1 can never land inside the line.
        max_offset = min(context_window_size, line_len - 1)

        for position, word_ID in enumerate(token_ids):
            for offset in range(1, max_offset + 1):
                # look behind
                behind = position - offset
                if behind >= 0:
                    term_term_mat[word_ID, token_ids[behind]] += 1
                # look ahead
                ahead = position + offset
                if ahead < line_len:
                    term_term_mat[word_ID, token_ids[ahead]] += 1

    return term_term_mat

In [6]:
# Load the tokenized lines, the play names, and the vocabulary from the Data/ files.
line_tuples, document_names, vocab = read_in_shakespeare()
# The term-context matrix build is currently disabled in this cell; uncomment
# the two lines below to compute it with a window of 2 and display the result.
# term_term_matrix = create_term_context_matrix(line_tuples, vocab, context_window_size = 2)
# term_term_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
def create_PPMI_matrix(term_context_matrix):
    '''Given a term context matrix, output a PPMI matrix.

    With T the sum of all counts, r_i the sum of row i and c_j the sum of
    column j, the value computed for each cell is

        PPMI_ij = max(log2((count_ij * T) / (r_i * c_j)), 0)

    which equals max(log2(p_ij / (p_i * p_j)), 0) for the probabilities
    estimated from the counts. Cells whose count is not positive get 0.

    Input:
    term_context_matrix: A nxn numpy array of non-negative co-occurrence
                            counts, where n is the number of tokens in the vocab.

    Returns: A numpy array of floats with the same shape as the input, where
                A_ij is the positive point-wise mutual information between
                the ith word and the jth word in the term_context_matrix.
    '''
    counts = np.asarray(term_context_matrix, dtype=float)
    ppmi = np.zeros(counts.shape)

    total = counts.sum()
    if total <= 0:
        # No co-occurrences at all: every probability is undefined, report 0.
        return ppmi

    row_totals = counts.sum(axis=1)
    col_totals = counts.sum(axis=0)

    # Only cells with a positive count can have a finite PMI. Working on just
    # those cells avoids log2(0) and avoids materialising a dense nxn matrix
    # of expected counts.
    rows, cols = np.nonzero(counts > 0)
    pmi = np.log2(counts[rows, cols] * total / (row_totals[rows] * col_totals[cols]))
    ppmi[rows, cols] = np.maximum(pmi, 0.0)

    return ppmi

def create_tf_idf_matrix(term_document_matrix):
    '''Given the term document matrix, output a tf-idf weighted version.

    Each count is multiplied by the inverse document frequency of its word:

        tfidf_ij = count_ij * log10(N / df_i)

    where N is the number of documents (columns) and df_i is the number of
    documents in which word i occurs at least once. A word that occurs in no
    document has an undefined idf; its row is all zeros and stays all zeros.

    NOTE(review): the term frequency used here is the raw count and the
    logarithm is base 10 -- confirm against the variant the course expects.

    Input:
    term_document_matrix: Numpy array where each column represents a document
                            and each row, the frequency of a word in that document.

    Returns:
    A numpy array of floats with the same dimension as term_document_matrix,
    where A_ij is weighted by the inverse document frequency of word i.
    '''
    counts = np.asarray(term_document_matrix, dtype=float)
    num_docs = counts.shape[1]

    # Number of documents each word appears in.
    doc_freq = (counts > 0).sum(axis=1)

    idf = np.zeros(counts.shape[0])
    seen = doc_freq > 0  # guard against division by zero for unseen words
    idf[seen] = np.log10(num_docs / doc_freq[seen])

    # Scale every row by its word's idf.
    return counts * idf[:, np.newaxis]

def compute_cosine_similarity(vector1, vector2):
    '''Computes the cosine similarity of the two input vectors.

    The similarity is dot(v1, v2) / (|v1| * |v2|). If either vector has zero
    length the ratio is undefined and 0.0 is returned instead.

    Inputs:
    vector1: A nx1 numpy array (a flat length-n array is accepted as well)
    vector2: A nx1 numpy array (a flat length-n array is accepted as well)

    Returns:
    A scalar similarity value.
    '''
    # Flatten so that (n,), (n, 1) and (1, n) inputs are all treated alike.
    v1 = np.asarray(vector1, dtype=float).ravel()
    v2 = np.asarray(vector2, dtype=float).ravel()

    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        return 0.0

    return float(np.dot(v1, v2) / norm_product)

def compute_jaccard_similarity(vector1, vector2):
    '''Computes the (weighted) Jaccard similarity of the two input vectors.

    The similarity is sum_i min(v1_i, v2_i) / sum_i max(v1_i, v2_i). If the
    denominator is zero (e.g. both vectors are all zeros) 0.0 is returned.

    Inputs:
    vector1: A nx1 numpy array (a flat length-n array is accepted as well)
    vector2: A nx1 numpy array (a flat length-n array is accepted as well)

    Returns:
    A scalar similarity value.
    '''
    # Flatten so that (n,), (n, 1) and (1, n) inputs are all treated alike.
    v1 = np.asarray(vector1, dtype=float).ravel()
    v2 = np.asarray(vector2, dtype=float).ravel()

    denominator = np.maximum(v1, v2).sum()
    if denominator == 0:
        return 0.0

    return float(np.minimum(v1, v2).sum() / denominator)

def compute_dice_similarity(vector1, vector2):
    '''Computes the (weighted) Dice similarity of the two input vectors.

    The similarity is 2 * sum_i min(v1_i, v2_i) / sum_i (v1_i + v2_i). If the
    denominator is zero (e.g. both vectors are all zeros) 0.0 is returned.

    Inputs:
    vector1: A nx1 numpy array (a flat length-n array is accepted as well)
    vector2: A nx1 numpy array (a flat length-n array is accepted as well)

    Returns:
    A scalar similarity value.
    '''
    # Flatten so that (n,), (n, 1) and (1, n) inputs are all treated alike.
    v1 = np.asarray(vector1, dtype=float).ravel()
    v2 = np.asarray(vector2, dtype=float).ravel()

    denominator = (v1 + v2).sum()
    if denominator == 0:
        return 0.0

    return float(2.0 * np.minimum(v1, v2).sum() / denominator)

def rank_plays(target_play_index, term_document_matrix, similarity_fn):
    ''' Ranks the similarity of all of the plays to the target play.

    Every column of the matrix (including the target's own column) is
    compared against the target column with similarity_fn.

    Inputs:
    target_play_index: The integer index of the play we want to compare all others against.
    term_document_matrix: The term-document matrix as a mxn numpy array.
    similarity_fn: Function that should be used to compared vectors for two
                    documents. Either compute_dice_similarity, compute_jaccard_similarity, or
                    compute_cosine_similarity.

    Returns:
    A length-n list of integer indices corresponding to play names,
    ordered by decreasing similarity to the play indexed by target_play_index.
    Plays with equal similarity keep their original relative order.
    '''
    matrix = np.asarray(term_document_matrix)
    num_plays = matrix.shape[1]
    target_vector = matrix[:, target_play_index]

    similarities = [
        similarity_fn(target_vector, matrix[:, play_index])
        for play_index in range(num_plays)
    ]

    # sorted() is stable even with reverse=True, so ties stay in index order.
    return sorted(range(num_plays), key=lambda play_index: similarities[play_index], reverse=True)

def rank_words(target_word_index, matrix, similarity_fn):
    ''' Ranks the similarity of all of the words to the target word.

    Every row of the matrix (including the target's own row) is compared
    against the target row with similarity_fn.

    Inputs:
    target_word_index: The index of the word we want to compare all others against.
    matrix: Numpy matrix where the ith row represents a vector embedding of the ith word.
    similarity_fn: Function that should be used to compared vectors for two word
                    embeddings. Either compute_dice_similarity, compute_jaccard_similarity, or
                    compute_cosine_similarity.

    Returns:
    A length-n list of integer word indices (n being the number of rows),
    ordered by decreasing similarity to the target word indexed by
    target_word_index. Words with equal similarity keep their original
    relative order.
    '''
    embeddings = np.asarray(matrix)
    num_words = embeddings.shape[0]
    target_vector = embeddings[target_word_index, :]

    similarities = [
        similarity_fn(target_vector, embeddings[word_index, :])
        for word_index in range(num_words)
    ]

    # sorted() is stable even with reverse=True, so ties stay in index order.
    return sorted(range(num_words), key=lambda word_index: similarities[word_index], reverse=True)


def _print_top_ranked(ranks, labels, top_k=10):
    '''Prints the first top_k entries of ranks as "<position>: <label>" lines.

    Slicing is used instead of a fixed index range, so a ranking with fewer
    than top_k entries prints what is available instead of raising IndexError.
    '''
    for position, item_id in enumerate(ranks[:top_k], start=1):
        print('%d: %s' % (position, labels[item_id]))


if __name__ == '__main__':
    # Driver: build every matrix, then print the ten nearest plays to a
    # randomly chosen play and the ten nearest words to a fixed query word
    # under each similarity measure.
    tuples, document_names, vocab = read_in_shakespeare()

    print('Computing term document matrix...')
    td_matrix = create_term_document_matrix(tuples, document_names, vocab)

    print('Computing tf-idf matrix...')
    tf_idf_matrix = create_tf_idf_matrix(td_matrix)

    print('Computing term context matrix...')
    tc_matrix = create_term_context_matrix(tuples, vocab, context_window_size=2)

    print('Computing PPMI matrix...')
    PPMI_matrix = create_PPMI_matrix(tc_matrix)

    # The target play is not seeded, so it differs from run to run.
    random_idx = random.randint(0, len(document_names)-1)
    similarity_fns = [compute_cosine_similarity, compute_jaccard_similarity, compute_dice_similarity]
    for sim_fn in similarity_fns:
        print('\nThe 10 most similar plays to "%s" using %s are:' % (document_names[random_idx], sim_fn.__qualname__))
        _print_top_ranked(rank_plays(random_idx, td_matrix, sim_fn), document_names)

    word = 'juliet'
    vocab_to_index = dict(zip(vocab, range(0, len(vocab))))
    # Same report for each word-embedding matrix; only the header and the
    # matrix being ranked differ.
    word_reports = [
        ('\nThe 10 most similar words to "%s" using %s on term-context frequency matrix are:', tc_matrix),
        ('\nThe 10 most similar words to "%s" using %s on PPMI matrix are:', PPMI_matrix),
    ]
    for header, embedding_matrix in word_reports:
        for sim_fn in similarity_fns:
            print(header % (word, sim_fn.__qualname__))
            _print_top_ranked(rank_words(vocab_to_index[word], embedding_matrix, sim_fn), vocab)