In [1]:
import string
import urllib
import urllib.request
from functools import reduce
from itertools import compress

import pandas as pd
from nltk.corpus import stopwords


# Location of the raw bibliography text that the rest of the notebook parses.
url = 'https://raw.githubusercontent.com/tdhopper/topic-modeling-datasets/master/data/raw/Nematode%20biology%20abstracts/cgcbib.txt'

# Use the response as a context manager so the HTTP connection is closed as
# soon as the body has been read (previously it was left open).
with urllib.request.urlopen(url) as file:
    # Decode the downloaded bytes as ISO-8859-1 (Latin-1) text.
    data = file.read().decode("ISO-8859-1")

In [2]:
def docsToList(data):
    '''Convert a string of bibliography records into a list of word lists, one per abstract.

       This function was made specifically for the data obtained here:
       https://raw.githubusercontent.com/tdhopper/topic-modeling-datasets/master/data/raw/Nematode%20biology%20abstracts/cgcbib.txt

       Parameters
       ----------
       data : str
           Raw text in which records are separated by a run of '-' characters
           and the abstract text follows the word "abstract".

       Returns
       -------
       list of list of str
           For every record kept, the lower-cased, whitespace-separated words
           that follow the first occurrence of "abstract". Records without the
           word "abstract", and records whose abstract text contains
           "in french", are dropped.'''

    # Single translation pass: line breaks become spaces; punctuation (except
    # '-', which is still needed to find the record separator) and digits are
    # deleted.
    cleanup_table = str.maketrans(
        '\n\r', '  ', '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~' + string.digits)
    text = data.lower().translate(cleanup_table)

    # Built once instead of once per record.
    hyphen_to_space = str.maketrans('-', ' ')

    output = []
    for record in text.split('-------------------'):
        # The separator has been consumed, so remaining '-' can go now.
        record = record.translate(hyphen_to_space)

        # Skip records that have no abstract section at all.
        if 'abstract' not in record:
            continue

        # Keep everything after the FIRST 'abstract'. maxsplit=1 matters: a
        # plain split() would cut the text off at the next occurrence of the
        # word "abstract" inside the abstract itself.
        body = record.split('abstract', 1)[1]

        # Drop placeholder abstracts that just say the paper is in French.
        # NOTE(review): this is a substring test, so any abstract containing
        # the phrase "in french" is dropped as well - confirm this is intended.
        if 'in french' in body:
            continue

        output.append(body.split())

    return output

In [3]:
def reducedVocab(lists, stop_words = None, min_word_count = 10):
    '''Restrict a tokenised corpus to its frequent, non-stop-word vocabulary.

       Parameters
       ----------
       lists : list of list of str
           One list of words per document.
       stop_words : collection of str, optional
           Words to ignore. Defaults to the NLTK English stop-word list.
       min_word_count : int, optional
           A word is kept only if it occurs at least this many times across
           all documents (after stop words have been removed).

       Returns
       -------
       docs : list of list of str
           The input documents with every out-of-vocabulary word removed
           (documents may become empty; none are dropped here).
       one_list : list of str
           All words of `docs` flattened into a single list, in document order.
       vocab : list of str
           The retained words, in the order produced by `value_counts()`.'''

    if stop_words is None:
        stop_words = set(stopwords.words('english'))
    elif isinstance(stop_words, (list, tuple)):
        # Sets give constant-time membership tests in the filter below.
        stop_words = set(stop_words)

    # Remove stop words
    words = [word for doc in lists for word in doc if word not in stop_words]

    # Count each remaining word once and keep those that are frequent enough
    counts = pd.Series(words).value_counts()
    vocab = counts[counts >= min_word_count].index.tolist()

    # Filter against a set: testing membership in the `vocab` list for every
    # token would cost O(len(vocab)) per lookup.
    keep = set(vocab)
    docs = [[word for word in doc if word in keep] for doc in lists]

    # Flatten in a single linear pass
    one_list = [word for doc in docs for word in doc]
    return docs, one_list, vocab

In [4]:
def listsToVec(lists, stop_words = None, min_word_count = 10):
    '''Encode a tokenised corpus as two parallel flat lists of integers.

       Stop words and words that appear fewer than `min_word_count` times are
       removed (via `reducedVocab`), and every remaining word is mapped to its
       position in the resulting vocabulary.

       Parameters
       ----------
       lists : list of list of str
           One list of words per document.
       stop_words : collection of str, optional
           Passed through to `reducedVocab`.
       min_word_count : int, optional
           Passed through to `reducedVocab`.

       Returns
       -------
       x : list of int
           The vocabulary index of every retained word, in document order.
       j : list of int
           For each entry of `x`, the index of the document it belongs to.
           Documents left empty by the vocabulary reduction are skipped (a
           warning is printed), so the indices count only non-empty documents
           and may differ from positions in `lists`.'''

    # Remove stop words and words that appear less than 'min_word_count' times
    docs, one_list, vocab = reducedVocab(lists, stop_words, min_word_count)

    # Map each word to a number (its position in the vocabulary)
    vocab_dict = {word: number for number, word in enumerate(vocab)}
    x = [vocab_dict[word] for word in one_list]

    # Label every word with its document number. Empty documents contribute
    # nothing to `x`, so they are skipped and do not consume a number.
    j = []
    doc_number = 0
    for i, doc in enumerate(docs):
        if not doc:
            print(f'WARNING: Document {i} is empty and being removed...')
            continue
        j.extend([doc_number] * len(doc))
        doc_number += 1

    return x,j

In [None]:
# Split the downloaded text into one list of words per abstract.
lists = docsToList(data)
# Encode the corpus as two flat lists: x holds the vocabulary number of each
# word and j the document number that word belongs to.
x, j = listsToVec(lists)

## Morris/Quinn

What is the etiquette for using third-party packages (here `nltk` and `pandas`) inside my functions? Is there something I need to do to ensure that whoever runs this code has them installed?