In [1]:
import string
import re
import pandas as pd
import numpy as np
import os
import urllib
import string
from itertools import compress
from nltk.corpus import stopwords 
from functools import reduce
from bs4 import BeautifulSoup

In [2]:
def reducedVocab(lists, stop_words = None, min_word_count = 10):
    '''Shrink the vocabulary of a tokenised corpus.

    Stop words are discarded first. Of the remaining words, only those that
    occur at least `min_word_count` times across the whole corpus are kept.

    Parameters
    ----------
    lists : sequence of sequences
        One sequence of word tokens per document. Tokens must be hashable
        (presumably strings - confirm against the caller).
    stop_words : container, optional
        Words to exclude; only `in` tests are performed on it. When omitted,
        NLTK's English stop-word list is used.
    min_word_count : int, default 10
        Minimum corpus-wide number of occurrences for a word to be kept.

    Returns
    -------
    docs : list of list
        The input documents, in the same order, restricted to the retained
        vocabulary. A document may end up empty.
    one_list : list
        Every word of `docs`, flattened in document order.
    vocab : list
        The retained words, in the order produced by
        `pandas.Series.value_counts` (most frequent first).
    '''

    # Identity test: `== None` is evaluated element-wise by array-like
    # containers and would make the `if` ambiguous.
    if stop_words is None:
        stop_words = set(stopwords.words('english'))

    # Remove stop words
    words = [word for doc in lists for word in doc if word not in stop_words]

    # Count each word once and keep those that appear at least
    # min_word_count times. dtype=object keeps an empty corpus well-defined.
    counts = pd.Series(words, dtype=object).value_counts()
    vocab = counts[counts >= min_word_count].index.tolist()

    # Recreate the documents with the filtered vocabulary. A set makes each
    # membership test constant-time instead of a scan through `vocab`.
    kept = set(vocab)
    docs = [[word for word in doc if word in kept] for doc in lists]

    # Flatten docs
    one_list = [word for doc in docs for word in doc]

    return docs, one_list, vocab

def listsToVec(lists, stop_words = None, min_word_count = 10, verbose = 1):
    '''
    One-hot encode every retained word occurrence of a tokenised corpus.

    The corpus is first reduced with `reducedVocab`: stop words are removed
    and only words that appear at least `min_word_count` times are kept.
    Documents left without any word are dropped, and the remaining documents
    are numbered consecutively from 0.

    Parameters
    ----------
    lists : sequence of sequences
        One sequence of word tokens per document.
    stop_words : container, optional
        Passed through to `reducedVocab`.
    min_word_count : int, default 10
        Passed through to `reducedVocab`.
    verbose : int, default 1
        Controls the warnings about dropped (empty) documents: 1 prints a
        single summary line, a value above 1 prints one line per dropped
        document, anything else prints nothing.

    Returns
    -------
    X_matrix : pandas.DataFrame
        Float matrix with one row per retained word occurrence (in corpus
        order) and one column per vocabulary word. A row holds 1 in the
        column of its word and 0 elsewhere.
    j : list of int
        For each row of `X_matrix`, the index of the non-empty document the
        word occurrence belongs to.
    '''

    # Remove stop words and words that appear less than 'min_word_count' times
    docs, one_list, vocab = reducedVocab(lists, stop_words, min_word_count)

    # Find empty documents and warn about them. They contribute no words to
    # `one_list`, so dropping them keeps the rows of the matrix aligned with j.
    empty_positions = [pos for pos, doc in enumerate(docs) if len(doc) == 0]

    if verbose > 1:
        # Reported from the last document to the first.
        for pos in reversed(empty_positions):
            print(f'WARNING: Document {pos} is empty and being removed...')

    counter = len(empty_positions)
    if verbose == 1 and counter > 1:
        print(f'WARNING: {counter} documents are empty and being removed...')

    elif verbose == 1 and counter == 1:
        print(f'WARNING: {counter} document is empty and being removed...')

    docs = [doc for doc in docs if len(doc) > 0]

    # Build the one-hot matrix with a single vectorised assignment instead of
    # one DataFrame write per word. The matrix is dense: rows x vocabulary.
    column_of = {word: col for col, word in enumerate(vocab)}
    n_words = len(one_list)
    hot_columns = np.fromiter((column_of[word] for word in one_list),
                              dtype=np.intp, count=n_words)
    one_hot = np.zeros((n_words, len(vocab)))
    one_hot[np.arange(n_words), hot_columns] = 1
    X_matrix = pd.DataFrame(one_hot, columns=vocab)

    # Determine which document each word belongs to, as a flattened list
    j = [doc_number for doc_number, doc in enumerate(docs) for _ in doc]

    return X_matrix, j

In [18]:
# Read every file in the data directory and collect the raw <text> elements.
directory = os.fsencode('../data')
root = directory.decode('ascii')  # str form of the directory, same for every file
docs = []
# os.listdir returns entries in arbitrary order; sorting keeps the document
# numbering reproducible between runs and machines.
for file in sorted(os.listdir(directory)):
    filename = os.fsdecode(file)
    # The context manager closes the file even if reading raises.
    with open(os.path.join(root, filename), 'r') as f:
        data = f.read()
    # NOTE(review): no parser is named, so Beautiful Soup picks whichever one
    # is installed; results may differ between environments. Pin one once the
    # intended parser is confirmed.
    soup = BeautifulSoup(data)
    contents = soup.find_all('text')
    # Stringify the whole result set and cut it into one chunk per element.
    docs.append(str(contents).split('</text>'))

# Flatten the per-file lists into a single list of chunks.
docs = [i for doc in docs for i in doc]

SyntaxError: invalid syntax (<ipython-input-18-718fe8b63736>, line 7)

In [10]:
# Single translation table covering every character-level step:
# line feeds, carriage returns and \x03 become a space; punctuation and
# digits are deleted.
_char_table = str.maketrans({'\n': ' ', '\r': ' ', '\x03': ' '})
_char_table.update(str.maketrans('', '', string.punctuation))
_char_table.update(str.maketrans('', '', string.digits))


def _to_tokens(article):
    '''Lower-case one article, strip unwanted characters and split it into words.'''
    cleaned = article.lower().translate(_char_table)
    cleaned = cleaned.replace('said',' ')  # another stop word
    # the name of the company at the end of most articles
    cleaned = cleaned.replace('reuter', ' ')
    return cleaned.split()


# Keep only the chunks that contain </dateline> and tokenise the text that
# follows the first occurrence of it.
docs = [
    _to_tokens(chunk.split('</dateline>')[1])
    for chunk in docs
    if '</dateline>' in chunk
]

In [15]:
# Debug check: show the str form of the bytes path held in `directory`.
# NOTE(review): the recorded output ends with '/', unlike the '../data'
# literal assigned to `directory` in the loading cell - it looks stale;
# re-run to confirm.
directory.decode('ascii')

'../data/'

In [20]:
# Debug check: formatting the bytes object directly embeds its repr (b'...')
# in the resulting string, so it has to be decoded before being interpolated
# into a file path.
f'{directory}'

"b'../data/'"