In [31]:
import urllib
import string
from itertools import compress
from nltk.corpus import stopwords 
import pandas as pd
from functools import reduce
import os
from bs4 import BeautifulSoup,SoupStrainer

In [42]:
directory = os.fsencode('data')
docs = []
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    f = open(f'data/{filename}', 'r')
    data= f.read()
    soup = BeautifulSoup(data)
    contents = soup.findAll('text')
    f.close()
    docs.append(str(contents).split('</text>'))

docs = [i for doc in docs for i in doc]

In [43]:
# split on </dateline> and keep everything after it
docs = list(compress(docs, ['</dateline>' in i for i in docs]))
docs = [i.split('</dateline>')[1] for i in docs]
docs = [i.lower().translate(str.maketrans('\n', ' ')) for i in docs]
docs = [i.translate(str.maketrans('\r', ' ')) for i in docs]
docs = [i.translate(str.maketrans('\x03', ' ')) for i in docs]
docs = [i.translate(str.maketrans('', '', string.punctuation)) for i in docs]
docs = [i.translate(str.maketrans('', '', string.digits)) for i in docs]
docs = [i.replace('said',' ') for i in docs] # another stop word
docs = [i.replace('reuter', ' ') for i in docs]
docs = [i.split() for i in docs]

In [44]:
def reducedVocab(lists, stop_words = None, min_word_count = 10):
    '''This function takes a list of words in a list of documents and returns the lists of lists with a reduced
       vocabulary, the flattened list, and the vocabulary'''
    
    if stop_words == None:
        stop_words = set(stopwords.words('english'))
    
    # Remove stop words
    words = [i for sublist in lists for i in sublist if not i in stop_words]

    # Remove words that appear less than min_word_count times
    wordSeries = pd.Series(words)
    vocab = list(compress(wordSeries.value_counts().index, wordSeries.value_counts() >= min_word_count))
    
    # Recreate lists with filtered vocab
    docs = []
    for j in range(len(lists)):
        docs.append([i for i in lists[j] if i in vocab])
    
    #flatten docs
    one_list = [i for sublist in docs for i in sublist]
    
    return docs, one_list, vocab

def listsToVec(lists, stop_words = None, min_word_count = 10, verbose = 1):
    '''
    This function takes a list of lists of the words in each document. It removes any stop words, removes words that
    appear 'min_word_count' times or less, and maps each word in the documents' vocabulary to a number. 
    Two flattened vectors are returned, the mapped numbers 'x', and the corresponding document each word belongs to 'j'.'''

    # Remove stop words and words that appear less than 'min_word_count' times
    docs, one_list, vocab = reducedVocab(lists, stop_words, min_word_count)
    
    # Map each word to a number
    #numbers = list(range(len(vocab)))
    #vocab_dict = dict(zip(vocab, numbers))
    #x = list(map(vocab_dict.get, one_list))
    
    # Check for empty lists and print warning if one is found
    counter = 0
    for i in range(len(docs)-1 ,-1, -1):
        if len(docs[i]) == 0:
            if verbose > 1:
                print(f'WARNING: Document {i} is empty and being removed...')
            del docs[i]
            counter += 1
    
    if verbose == 1 and counter > 1:
        print(f'WARNING: {counter} documents are empty and being removed...')
    
    elif verbose == 1 and counter == 1:
        print(f'WARNING: {counter} document is empty and being removed...')
    
    X_matrix = pd.DataFrame(np.zeros((len(one_list), len(vocab))),
                           columns=vocab)

    for i, word in enumerate(one_list):
        X_matrix.loc[i, word] = 1   
    
    # Determine which document each word belongs to
    count, j = 0, []
    for i in docs:
        j.append([count]*len(i))
        count += 1
        
    # Reduce to a flattened list
    j = [i for sublist in j for i in sublist]
    
    return X_matrix, j

In [45]:
import numpy as np


# Select random subset of the documents
selected = np.random.choice(len(docs), 110, replace = False)
subset_docs = [docs[i] for i in selected]

In [46]:
# Process subset of documents
docs, one_list, vocab = reducedVocab(subset_docs, min_word_count = 4)

## LDA Models

In [48]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from pprint import pprint
import re
import pandas as pd
import numpy as np
import string
import os

In [None]:
def LDA_perplexity(docs, max_train_index, min_topics, max_topics):
    '''
    This function takes a list of lists (words in documents) and the number of documents you want in the training set.
    NOTE : max_train_index must be less than len(docs)
    
    It calculates the perplexity of the trained LDA model on the remaining unseen documents in 'docs' ranging
    from n_topics = min_topics to n_topics = max_topics
    '''
    
    assert max_train_index < len(docs), 'max_train_index must be less than the length of docs'
    assert min_topics <= max_topics, 'min_topics must be less than or equal to max_topics'
    
    train = docs[:max_train_index]
    test = docs[max_train_index:]
    
    train_2word = corpora.Dictionary(train)
    train_corpus = [train_2word.doc2bow(doc) for doc in train]
    
    test_2word = corporta.Dictionary(test)
    test_corpus = [test_2word.doc2bow(doc) for doc in test]
    
    perplexity = []
    for i in range(min_topics, max_topics + 1): 
        
        lda_model = gensim.models.ldamodel.LdaModel(corpus = train_corpus,
                                               id2word = train_2word,
                                               num_topics = i,
                                               random_state = 23,
                                               eval_every = 20,
                                               alpha = 'asymmetric',
                                               iterations = 500)
        
        tmp_perplex = np.exp(lda_model.log_perplexity(test_corpus))
        perplexity.append(tmp_perplex)
        
    return perplexity
    

In [109]:
id2word = corpora.Dictionary(docs[:100])
corpus = [id2word.doc2bow(text) for text in docs[:100]]

In [110]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=50, 
                                           random_state=23,
                                           eval_every = 20,
                                           alpha='asymmetric', # 1D array of length equal to number of expected topics - expresses a-priori belief for each topics prob
                                           #eta = , # a-prior belief on word probability
                                           iterations = 500)

In [111]:
id2word = corpora.Dictionary(docs[100:])
test_corpus = [id2word.doc2bow(text) for text in docs[100:]]

In [113]:
np.exp(lda_model.log_perplexity(test_corpus))

3.1307833786640772e-09