In [1]:
# to work with phonemes: http://www.nltk.org/_modules/nltk/corpus/reader/cmudict.html
# from nltk.corpus import brown
# from nltk.corpus import gutenberg
# from nltk.corpus import cmudict
import nltk
# Fetch only the corpus this notebook reads (nltk.corpus.gutenberg).
# A bare nltk.download() opens the interactive downloader, which blocks a
# non-interactive "Restart Kernel -> Run All".
nltk.download("gutenberg")

showing info http://www.nltk.org/nltk_data/


True

In [2]:
import nltk
import os
import re
import sys
import math
import numpy as np
import string
import pandas as pd
import matplotlib.pyplot as plt
import pickle

from utils import random_idx
from utils import utils
from utils import lang_vectors_utils as lvu

# Configuration shared by every function below.
# k and N are handed to random_idx.generate_letter_id_vectors; N is also used
# to normalise the letter vectors and is passed to random_idx.id_vector.
k = 5000
N = 10000
# Word lengths 1..20; seed() allocates one frequency bucket per length (plus bucket 0).
cluster_sizes = list(range(1, 21))
ordered = 1
# string.ascii_lowercase is locale-independent; string.lowercase depends on the
# active locale and only exists in Python 2.
alphabet = string.ascii_lowercase + ' '
# Normalize
RI_letters = random_idx.generate_letter_id_vectors(N, k, alphabet)/float(math.sqrt(N))
file_ids = nltk.corpus.gutenberg.fileids()
num_iterations = 1

def read_text(indices=[]):
    """Collect the lower-cased alphabetic tokens of the selected Gutenberg texts.

    indices -- positions into the module-level ``file_ids`` list.

    For each text, tokens are ignored until the first "]" token has been
    seen; after that every purely alphabetic token is kept. The "]" state is
    reset for every text. Returns one flat list of tokens, in reading order.
    """
    collected = []
    for file_index in indices:
        tokens = [tok.encode("ascii").lower()
                  for tok in nltk.corpus.gutenberg.words(file_ids[file_index])]
        bracket_closed = False
        for tok in tokens:
            if tok == "]":
                bracket_closed = True
            if bracket_closed and tok.isalpha():
                collected.append(tok)
    return collected
    
def seed(seed_index):
    """Build the initial word-frequency model from one Gutenberg text.

    seed_index -- index of the text, forwarded to read_text().

    Returns (n_gram_frequencies, stream, text):
      n_gram_frequencies -- list of dicts; the dict at position L maps each
                            word of length L to its count in the seed text.
      stream             -- the words concatenated with no separator.
      text               -- the words joined by single spaces.

    Side effects: writes "intermediate/n_gram_frequencies" (first the empty
    structure via pickle, then the filled one via lvu.write_data_structures).
    """
    #n_gram_frequencies = lvu.initialize(cluster_sizes)
    n_gram_frequencies = [{} for _ in range(len(cluster_sizes) + 1)]
    # save vectors to file; the context manager closes the handle even if
    # pickling fails
    with open("intermediate/n_gram_frequencies", "w") as fwrite:
        pickle.dump(n_gram_frequencies, fwrite)
    words = read_text([seed_index])

    stream = "".join(words)
    text = " ".join(words)

    # nltk freq dist
    # http://www.nltk.org/howto/probability.html
    fd = nltk.FreqDist(words)
    for word, count in fd.most_common():
        # A word longer than the largest configured cluster size would index
        # past the end of the list; add buckets on demand instead of raising.
        while len(n_gram_frequencies) <= len(word):
            n_gram_frequencies.append({})
        n_gram_frequencies[len(word)][word] = count

    lvu.write_data_structures([n_gram_frequencies], ["intermediate/n_gram_frequencies"])
    return n_gram_frequencies, stream, text
    
# converts estimated words to an array of text
def post_process(tuples, text):
    """Return the substring text[start:end] for every (start, end) pair in tuples."""
    return [text[span[0]:span[1]] for span in tuples]
  
def unexplained(vocab, text):
    """Locate every run of non-whitespace characters in text.

    vocab -- unused; kept so existing callers keep working.
    text  -- string to scan.

    Returns (indices, words): indices[i] is [start, last] where ``last`` is
    the index of the final character of the run (inclusive), and words[i] is
    the run itself. Both lists are empty when text contains no
    non-whitespace characters (the zip(*...) form raised ValueError there).
    """
    unprocessed_indices = []
    unprocessed_words = []
    for m in re.finditer(r'\S+', text):
        unprocessed_indices.append([m.start(), m.end()-1])
        unprocessed_words.append(m.group(0))
    return unprocessed_indices, unprocessed_words
            
# unioned windowing is way too slow.
# doesn't do well with new words bc would just recommend the unigrams
# the mle will assign the discovered words first and then the unigrams anyways. 
def dict_explain_away(vocab,text):
    """Mask every occurrence of a known word in text, longest words first.

    vocab -- list of dicts; the keys of vocab[i] are searched for i from
             len(vocab)-1 down to 2 (buckets 0 and 1 are not searched).
    text  -- the character stream to explain.

    Returns (processed_indices, unprocessed_indices, unprocessed_words):
      processed_indices   -- sorted [start, end) pairs of the masked matches.
      unprocessed_indices,
      unprocessed_words   -- whatever unexplained() reports for the
                             characters left after masking.
    Prints len(text) as a progress indicator.
    """
    print(len(text))
    # Strings are immutable, so rebinding `duplicate` never affects the caller's text.
    duplicate = text
    # make a list of disjoint tuples of (start_index, end_index)
    preprocessed_indices = []

    for i in range(len(vocab)-1, 1,-1):
        for key in vocab[i].keys():
            mask = " "*len(key)

            def mask_match(match, mask=mask):
                # Record where the key was found and blank it out so that
                # shorter keys cannot match inside it later.
                preprocessed_indices.append([match.start(), match.end()])
                return mask

            # One pass per key: the same non-overlapping, left-to-right matches
            # as finditer, without rebuilding the whole string once per match.
            duplicate = re.sub(re.escape(key), mask_match, duplicate)
    processed_indices = [[pair[0],pair[1]] for pair in preprocessed_indices if pair[0] < pair[1]]
    unprocessed_indices, unprocessed_words = unexplained(vocab, duplicate)
    return sorted(processed_indices), unprocessed_indices, unprocessed_words

"""
How do we choose between assigning an n-gram to an existing word (category) and adding it as a new word, i.e. how do we choose the threshold?
The threshold: assign the n-gram to its best-matching word if their similarity is less than .5 standard deviations away from the n-gram's self dot product (ngram . ngram).
This means we need to have saved a distribution of self dot products (word . word) for every word in the model.
The model is NOT a similarity matrix: it needs only O(N) space and O(N) time, with constant cost per operation.
The words handled here are all non-matched words.
"""
def mle(freqs, words):
    """Assign each unexplained word to a known word or register it as new.

    freqs -- list of dicts; freqs[L] maps words of length L to counts.
             Updated in place: a matched word's count is incremented, a new
             word is inserted with count 1, and buckets are appended when a
             word is longer than the current last bucket.
    words -- iterable of non-empty strings.

    For each word, the most similar key is searched in the buckets of the
    word's own length and up to four shorter lengths. The word is assigned to
    that key when |word.word - best similarity| is below the standard
    deviation of the self dot products of up to ten keys of the word's own
    length; otherwise the word itself is added to the model.

    Returns the list of chosen keys, one per input word.
    """
    predicted = []
    # need a dot product matrix
    for word in words:
        # need to normalize
        word_vector = random_idx.id_vector(N, word, alphabet, RI_letters, ordered)/float(len(word))
        self_dot = np.dot(word_vector, word_vector.T)
        cluster_size = len(word) # or + 1?
        # Words longer than the last bucket would index past the end of freqs;
        # add empty buckets so freqs[cluster_size] always exists.
        while len(freqs) <= cluster_size:
            freqs.append({})
        max_sim = float("-inf")
        max_key = ""

        # for optimization, don't go all the way down. go down like 4 letters max
        for i in range(cluster_size, max(cluster_size-5,-1),-1):
            for key in freqs[i].keys():
                key_vector = random_idx.id_vector(N, key, alphabet, RI_letters, ordered)#/float(len(key))
                similarity = np.dot(word_vector, key_vector.T)
                if similarity > max_sim:
                    max_sim = similarity
                    max_key = key
        # self dot products of up to ten same-length keys; the bucket may hold
        # fewer than ten (or none), so size the sample to what is available
        sample_keys = list(freqs[cluster_size].keys())[:10]
        model_self_similarities = np.zeros(len(sample_keys))
        for j, sample_key in enumerate(sample_keys):
            wv = random_idx.id_vector(N, sample_key, alphabet, RI_letters, ordered)/float(len(sample_key))
            model_self_similarities[j] = wv.dot(wv.T)
        # With no sample there is no spread to compare against: threshold 0
        # means the word is always treated as new.
        if len(sample_keys) > 0:
            threshold = np.std(model_self_similarities, axis=0)
        else:
            threshold = 0.0

        if abs(self_dot - max_sim) < threshold:
            #print "categorized"
            freqs[len(max_key)][max_key] += 1
        else:
            #print "new word"
            max_key = word
            freqs[len(word)][word] = 1

        predicted.append(max_key)

    return predicted

def predict_text(textname, processed_indices, processed, unprocessed_indices, discovered_words):
    """Merge explained and discovered words back into reading order and save them.

    textname            -- file name created under "output/".
    processed_indices   -- spans whose first element is the start offset of
                           the matching entry in ``processed``.
    processed           -- words recovered from the dictionary.
    unprocessed_indices -- spans whose first element is the start offset of
                           the matching entry in ``discovered_words``.
    discovered_words    -- words chosen for the unexplained spans.

    Words are ordered by start offset; if both kinds share an offset the
    discovered word wins. The file receives every word followed by a single
    space. Returns the ordered list of words.
    """
    # start offset in the actual text -> [position, "p" (processed) or "u" (unprocessed)]
    indices = {}
    for i, span in enumerate(processed_indices):
        indices[span[0]] = [i, "p"]
    for i, span in enumerate(unprocessed_indices):
        indices[span[0]] = [i, "u"]

    texti = []
    for start in sorted(indices.keys()):
        position, origin = indices[start]
        if origin == "p":
            texti.append(processed[position])
        else:
            texti.append(discovered_words[position])

    # Build the output once instead of concatenating inside the loop, and let
    # the context manager close the file even if the write fails.
    text = "".join(word + " " for word in texti)
    with open("output/" + textname, "w") as fwrite:
        fwrite.write(text)
    return texti
    
    
def predict(freqs, test_indices):
    """Segment each test text with the current model and save the results.

    freqs        -- list of per-length frequency dicts; mle() updates it in
                    place as new words are discovered.
    test_indices -- indices into the module-level ``file_ids`` list.

    For every text: writes the space-separated reference words to
    "input/<file id>", explains the unsegmented stream with the known
    vocabulary, resolves the remainder with mle(), and writes the prediction
    through predict_text(). Finally saves ``freqs`` to
    "intermediate/n_gram_frequencies".
    """
    for ti in test_indices:
        words = read_text([ti])
        stream = "".join(words)
        text = " ".join(words)
        with open("input/" + file_ids[ti], "w") as fwrite:
            fwrite.write(text)
        processed_indices, unprocessed_indices, unprocessed_words = dict_explain_away(freqs,stream)
        processed = post_process(processed_indices, stream)
        discovered_words = mle(freqs, unprocessed_words)
        predict_text(file_ids[ti], processed_indices, processed, unprocessed_indices, discovered_words)

    # save data to file -- the model passed in, not whatever global happens to
    # be named n_gram_frequencies
    lvu.write_data_structures([freqs], ["intermediate/n_gram_frequencies"])
    
# Seed the model from text 0, then run prediction on texts 1 through 10.
seed_index = 0
test_indices = list(range(1, 11))
n_gram_frequencies, stream, text = seed(seed_index)
predict(n_gram_frequencies, test_indices)

364871
525097
3224222


KeyboardInterrupt: 

In [None]:
# Write the space-separated reference text of each selected Gutenberg file to input/.
file_ids = nltk.corpus.gutenberg.fileids()
inds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# Iterate over the list defined in this cell (it was previously unused, and the
# loop silently depended on test_indices left over from an earlier cell).
for ti in inds:
    words = read_text([ti])
    stream = "".join(words)
    text = " ".join(words)
    with open("input/" + file_ids[ti], "w") as fwrite:
        fwrite.write(text)