In [1]:
import urllib
import string
from itertools import compress
from nltk.corpus import stopwords 
import pandas as pd
from functools import reduce
import numpy as np


url = 'https://raw.githubusercontent.com/tdhopper/topic-modeling-datasets/master/data/raw/Nematode%20biology%20abstracts/cgcbib.txt'
file = urllib.request.urlopen(url)
data = file.read().decode("ISO-8859-1")

In [2]:
def docsToList(data):
    '''
    This function takes a string of abstracts and converts it to a list of lists of the words in each abstract.
    This function was made specifically for the data obtained here:
    https://raw.githubusercontent.com/tdhopper/topic-modeling-datasets/master/data/raw/Nematode%20biology%20abstracts/cgcbib.txt
    '''
    
    # Remove '\n' and '\r'
    data = data.lower().translate(str.maketrans('\n', ' '))
    data = data.translate(str.maketrans('\r', ' '))
    
    # Remove punctuation except for '-' so we can split after each abstract
    data = data.translate(str.maketrans('', '', '!"#$%&\'()*+,./;<=>?@[\\]^_`{|}~'))
    
    # Remove numbers
    data = data.translate(str.maketrans('','', string.digits))
    
    # Split after 'abstract' is stated
    data = data.split('-------------------')
    # Remove '-' punctuation now
    data = [abstract.translate(str.maketrans('-', ' ')) for abstract in data]
    
    # Remove entries without the word "abstract" in it
    abs_check = ['abstract' in i for i in data]
    data = list(compress(data, abs_check))

    # Only keep the words after 'abstract'
    data = [abstract.split('abstract:')[1] for abstract in data]
    
    # Remove any remaining :'s
    data = [abstract.translate(str.maketrans(':', ' ')) for abstract in data]
    
    # Remove abstracts that only state 'in french'
    not_french = ['in french' not in i for i in data]
    data = list(compress(data, not_french))
    
    # Create list of lists output
    output = [i.split() for i in data]
    
    return output

In [3]:
def reducedVocab(lists, stop_words = None, min_word_count = 10):
    '''
    This function takes a list of words in a list of documents and returns the lists of lists with a reduced
    vocabulary, the flattened list, and the vocabulary
    '''
    
    if stop_words == None:
        stop_words = set(stopwords.words('english'))
    
    # Remove stop words
    words = [i for sublist in lists for i in sublist if not i in stop_words]

    # Remove words that appear less than min_word_count times
    wordSeries = pd.Series(words)
    vocab = list(compress(wordSeries.value_counts().index, wordSeries.value_counts() >= min_word_count))
    
    # Recreate lists with filtered vocab
    docs = []
    for j in range(len(lists)):
        docs.append([i for i in lists[j] if i in vocab])
    
    #flatten docs
    one_list = [i for sublist in docs for i in sublist]
    
    return docs, one_list, vocab

In [4]:
def listsToVec(lists, stop_words = None, min_word_count = 10, verbose = 1):
    '''
    This function takes a list of lists of the words in each document. It removes any stop words, removes words that
    appear 'min_word_count' times or less, and maps each word in the documents' vocabulary to a number. 
    Returns: data matrix X, where each row is a draw from a categorical distribution representing one word
             vector j encoding the corresponding documents each word belongs to'''

    # Remove stop words and words that appear less than 'min_word_count' times
    docs, one_list, vocab = reducedVocab(lists, stop_words, min_word_count)
    
    # Map each word to a number
    #numbers = list(range(len(vocab)))
    #vocab_dict = dict(zip(vocab, numbers))
    #x = list(map(vocab_dict.get, one_list))
    
    # Check for empty lists and print warning if one is found
    counter = 0
    for i in range(len(docs)-1 ,-1, -1):
        if len(docs[i]) == 0:
            if verbose > 1:
                print(f'WARNING: Document {i} is empty and being removed...')
            del docs[i]
            counter += 1
    
    if verbose == 1 and counter > 1:
        print(f'WARNING: {counter} documents are empty and being removed...')
    
    elif verbose == 1 and counter == 1:
        print(f'WARNING: {counter} document is empty and being removed...')
    
    X_matrix = pd.DataFrame(np.zeros((len(one_list), len(vocab))),
                           columns=vocab)

    for i, word in enumerate(one_list):
        X_matrix.loc[i, word] = 1   
    
    # Determine which document each word belongs to
    count, j = 0, []
    for i in docs:
        j.append([count]*len(i))
        count += 1
        
    # Reduce to a flattened list
    j = [i for sublist in j for i in sublist]
    
    return X_matrix, np.array(j)

In [8]:
lists = docsToList(data)
x, j = listsToVec(lists[:100], min_word_count = 1)



In [10]:
print(x.shape)
print(np.array(j))

(7413, 2624)


AttributeError: 'list' object has no attribute 'shape'

## Morris/Quinn

What is the etiquette for using other packages inside my functions? Is there something I need to do to ensure the person has them installed?

### Optimization of reducedVocab funcion

Originally the function `reducedVocab` runs on our dataset in Wall time: 3 min 6s

Currently the function `reducedVocab` runs in Wall time: 1min 56s

Comparatively, gensim's `corpora.Dictionary` runs in 1.17s and the `doc2bow` for text in docs runs in 1.02s.

Our function would run in 2s if it did not have to filter the vocabulary

In [None]:
stop_words = set(stopwords.words('english'))
%time words = [i for sublist in lists for i in sublist if not i in stop_words]

In [None]:
# Old way (1min 23s)
%time j = reduce(lambda x, y: x + y, docs, [])

# New way (107ms)
%time [i for sublist in docs for i in sublist]

In [None]:
%time wordSeries = pd.Series(words)
%time vocab = list(compress(wordSeries.value_counts().index, wordSeries.value_counts() >= 10))

In [None]:
count, j = 0, []
docs = []

# Old way (2min 7s)
#%time for j in range(len(lists)): docs.append([i for i in lists[j] if i in vocab])
    
# New way... map all words to a number, turn into a numpy array, compress it with a mask and convert back to words?
dict(zip(set(words), range(len(set(words)))))

## LDA for this project

In [None]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from pprint import pprint
import re
import pandas as pd
import numpy as np

%time docs, one_list, vocab = reducedVocab(lists)

In [None]:
%time id2word = corpora.Dictionary(docs)

In [None]:
%time corpus = [id2word.doc2bow(text) for text in docs]

In [None]:
import logging
logging.basicConfig(filename='gensim.log',
                    format="%(asctime)s:%(levelname)s:%(message)s",
                    level=logging.INFO)

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=23,
                                           update_every=1,
                                           eval_every = 20,
                                           chunksize=100,
                                           passes=20,
                                           alpha='asymmetric', # 1D array of length equal to number of expected topics - expresses a-priori belief for each topics prob
                                           #eta = , # a-prior belief on word probability
                                           per_word_topics=True,
                                           iterations = 500)

In [None]:
import re
import matplotlib.pyplot as plt
p = re.compile("(-*\d+\.\d+) per-word .* (\d+\.\d+) perplexity")
matches = [p.findall(l) for l in open('gensim.log')]
matches = [m for m in matches if len(m) > 0]
tuples = [t[0] for t in matches]
perplexity = [float(t[1]) for t in tuples]
liklihood = [float(t[0]) for t in tuples]
iter = list(range(0,len(tuples)*10,10))
plt.plot(iter,liklihood,c="black")
plt.ylabel("log liklihood")
plt.xlabel("iteration")
plt.title("Topic Model Convergence")
plt.grid()
plt.savefig("convergence_liklihood.pdf")
plt.close()

This has lots of problems... 
- The log perplexity method doesn't return 'perplexity', need to calculate that on my own. 
- Need to implement 10-fold cross-validation
- Mixture component cardinalities ranging from 10 to 120
- Need to figure out how to incorporate the symmetric Dirichlet distribution with parameters 
    of .5 for the prior H over topic distributions
- Distribution over topics in LDA was assumed to be symmetric Dirichlet w 
    parameters $\alpha_0/L$

In [None]:
lda_model.log_perplexity()