In [7]:
import re
# nltk version 3.4.5  installed with anaconda navigator, alternatively pip install --user -U nltk according to their website
import nltk
import os


# Root of the training corpus; assumes the folder sits in the same folder as this notebook.
# os.path.join picks the platform's separator (on Windows this is still '.\\20news-train').
file_start = os.path.join(os.curdir, '20news-train')
# Gets the list of sub-folders (one per class).  os.listdir returns entries in arbitrary
# order, so sort them to make the folder-position -> class-label mapping reproducible.
sub_folders = sorted(os.listdir(file_start))

# d_matrix is a dict of dicts.  The outer key corresponds to the order in which the document
# was read (which matches its place in y_vector); the value is an inner dict mapping each
# word of that document to its frequency.
d_matrix = {}
# Class label of every document, in reading order.
y_vector = []
# Running count of the files processed so far.
total_files = 0


# The tokenizer/stop-word setup here and the normalize_line function are adapted from
# https://towardsdatascience.com/understanding-feature-engineering-part-3-traditional-methods-for-text-data-f6f7d70acd41
wpt = nltk.WordPunctTokenizer()
nltk.download('stopwords')
# In addition to whitespace and special characters, nltk's English stop words are filtered.
# They are kept in a set so the membership test done for every token is O(1) instead of a
# linear scan over a list.
stop_words = set(nltk.corpus.stopwords.words('english'))


def normalize_line(doc):
    """Normalize one line of raw text before its words are counted.

    Removes every character that is not an ASCII letter or whitespace,
    lower-cases and strips the result, tokenizes it with the module-level
    ``wpt`` tokenizer and drops the tokens found in ``stop_words``.

    Args:
        doc: the raw text line.

    Returns:
        The remaining tokens joined by single spaces ('' when none remain).
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is ``count``, so passing them positionally capped the number of
    # removed characters at the flags' integer value and applied no flags.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    # tokenize with nltk's off-the-shelf tokenizer
    tokens = wpt.tokenize(doc)
    # filter stop words out of the line
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create the line from the remaining tokens
    return ' '.join(filtered_tokens)

def update_dictionary(dictionary, tokens):
    """Fold the words in *tokens* into the frequency counts of *dictionary*.

    The mapping is modified in place: a word that is already a key has its
    count raised by one, any other word is inserted with a count of 1.
    Nothing is returned.
    """
    for token in tokens:
        # an unseen word starts from an implicit count of zero
        dictionary[token] = dictionary.get(token, 0) + 1

def process_file(name, file_num):
    """Count the word frequencies of one document and store them in d_matrix.

    The file is read line by line; every line is normalized with
    normalize_line, split on whitespace and folded into a per-file
    dictionary with update_dictionary.  Both the meta-data and the main body
    are processed.

    Args:
        name: path of the file to read.
        file_num: key under which the per-file dictionary is stored in
            d_matrix.
    """
    # dictionary to be added to d_matrix
    file_dict = {}
    # Decode explicitly rather than with the platform's default encoding:
    # latin-1 maps every possible byte to a character, so an unexpected
    # non-ASCII byte in a post cannot abort the whole run with a
    # UnicodeDecodeError.
    with open(name, encoding='latin-1') as f:
        for line in f:
            update_dictionary(file_dict, normalize_line(line).split())
    d_matrix[file_num] = file_dict

# The outer loop runs over the whole list of sub-folders, the inner loop over each filename
# in that folder.  The full path from this directory to the file is built with os.path.join
# (so the separator always matches the platform) and passed to process_file together with
# the running file count, which becomes the document's key in d_matrix.  Finally y_vector
# gets the class label as outlined in the hw2 doc.
for label, folder in enumerate(sub_folders, start=1):
    folder_path = os.path.join(file_start, folder)
    for filename in os.listdir(folder_path):
        full_path = os.path.join(folder_path, filename)
        process_file(full_path, total_files)
        total_files += 1
        # labels are the 1-based position of the folder in sub_folders
        # (alt.atheism = 1, comp.graphics = 2, etc. -- assuming that folder order)
        y_vector.append(label)

# Final thoughts/changes needed going forward:
# I was not able to get a full m*n matrix (maybe one day I'll stop procrastinating).  Possibly keep a set of all
# previously seen words and insert all of these with a frequency of 0 into each new file_dict?  But then going back to
# the previous dicts and adding the new words they didn't have with a frequency of 0 seems like a waste of time and
# memory.  Going through all the files once to get a full vocab and then again for frequencies likewise seems costly.
# Being able to do row and column sums in a full matrix/pandas dataframe could be nice, but could also be accomplished with:
# sum = 0
# for i in range(d_matrix.__len__()):
#   current_dict = d_matrix[i]
#   if desired_word in current_dict:
#       sum += current_dict[desired_word]
# 
# Aside from this, more work needs to be done on cleaning the raw text.  Currently email addresses have the @ and .
# stripped and then the whole address is joined together (e.g. ray.476@osu.edu becomes rayosuedu), which I suspect
# will be useless going forward.

# These prints give an "IOPub data rate exceeded" error in the notebook (PyCharm could handle it).  The hw2 doc does
# not specify what exactly to output, only "how to interpret the output".
# print(d_matrix)
# print(y_vector)

# Marks the end of the run.  This was useful for anchoring a breakpoint while debugging, to explore the variables
# in the call stack at the end of code execution (not sure whether Jupyter offers this).
print('done')

# P.S. The code was developed in PyCharm but verified to run in this notebook.  Sorry that I didn't use markdown cells
# interlaced with code cells.

[nltk_data] Downloading package stopwords to C:\Users\Matthew
[nltk_data]     Ray\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


done
