In [59]:
# Base folder for this analysis; the input and output folders are derived from it.
ROOT_FOLDER = "/Users/simon.hughes/Google Drive/PhD/Data/CoralBleaching/PhraseExtractionAnalysis"

# Input documents are read from DOCS_FOLDER, processed copies go to OUTPUT_FOLDER.
DOCS_FOLDER   = "%s/Docs" % ROOT_FOLDER
OUTPUT_FOLDER = "%s/ProcessedDocs" % ROOT_FOLDER
# When True, existing files in OUTPUT_FOLDER are deleted before processing.
EMPTY_OUTPUT_FOLDER = True

# Regex matched against file names in DOCS_FOLDER.
# Raw string so that "\." reaches the regex engine without an invalid-escape warning.
FILE_MASK = r".*\.txt"
# When True, file contents are run through parse_html before sentence splitting.
PARSE_HTML = False
# Files whose contents are shorter than this many characters are skipped.
FILE_SIZE_LIMIT_CHARS = 100

# Dump the Predictions Output File for Processing Corrected Sentences

In [60]:
import os, re
def find_files(folder, regex, remove_empty = False):
    """
    Find all files matching the [regex] pattern in [folder]

    folder       :   string
                        folder to search (not recursive)
    regex        :   string (NOT regex object)
                        pattern searched for in each file name
                        (re.search, case-insensitive)
    remove_empty :   bool
                        when True, zero-byte files are left out

    returns      :   list of absolute paths, sorted
    """
    pattern = re.compile(regex, re.IGNORECASE)
    matches = []
    for name in os.listdir(folder):
        full_path = os.path.abspath(os.path.join(folder, name))
        # os.listdir also yields sub-directories; only regular files are
        # wanted here, otherwise callers such as delete_files would try to
        # os.remove() a directory and fail.
        if not os.path.isfile(full_path):
            continue
        if not pattern.search(name):
            continue
        if remove_empty and os.path.getsize(full_path) == 0:
            continue
        matches.append(full_path)
    matches.sort()
    return matches

def delete_files(folder, regex):
    """ Removes every entry in [folder] whose name matches [regex]
        e.g. delete_files("C:/Dice Data/DelTest", ".*[.]txt")

        folder      :   string
                            folder to search (matching is delegated to find_files)
        regex       :   string
                            file pattern to match
    """
    for doomed_path in find_files(folder, regex):
        os.remove(doomed_path)

In [61]:
"""
import pandas as pd
fname = "/Users/simon.hughes/Google Drive/PhD/Data/CoralBleaching/Results/predictions_causal_and_codes.txt"
data = pd.read_csv(fname, sep="|")
data = data[["Essay", "Sent Number", "Processed Sentence"]]
data["s_Sent_Num"] = data["Sent Number"].apply(lambda n : str(n).rjust(3,'0'))
data["Num_Sent"] = data["s_Sent_Num"] + "|" + data["Processed Sentence"]
data = data[["Essay", "Num_Sent"]]
data.head()
"""
None

In [62]:
"""
grouped = group_by(data, ["Essay"], [("Num_Sent", lambda strings: "||".join(sorted(strings)))])
grouped.head()

#completely empty docs folder
delete_files(DOCS_FOLDER,".*")

for i in range(len(grouped)):
    row = grouped.iloc[i]
    essay_name = row["Essay"][:-4] + ".txt"
    with open(DOCS_FOLDER + "/" + essay_name, "w+") as f:
        contents = row["Num_Sent"].split("||")
        for item in contents:
            num, sent = item.split("|")
            f.write(sent.strip().lower() + "\n")
"""
None

# Take Processed Output and Dump to Disk

In [63]:
#Shared
import re
# Matches any run of one or more whitespace characters.
# Raw string: "\s" in a plain literal is an invalid string escape.
re_collapse_spaces = re.compile(r"\s+")

def collapse_spaces(s):
    """
    Replace every run of whitespace in [s] with a single space and
    strip leading / trailing whitespace.

    s       :   string
    returns :   string
    """
    return re_collapse_spaces.sub(" ", s).strip()

# Runs of the punctuation characters ; : ' " * / ) , ( | and/or whitespace.
# Written as a raw string so none of the regex escapes are (invalid) string
# escapes; the character class is the same as before.
re1 = re.compile(r"""[;:'"*/),(|\s]+""")
def clean_str(s):
    """
    Normalise [s] for downstream processing.

    Converts [s] to a string, drops possessive "'s", turns hyphens,
    backslashes and the punctuation matched by re1 into spaces, and
    returns the result with whitespace collapsed and trimmed.

    s       :   any object (converted with str())
    returns :   string
    """
    s = str(s).replace("'s", " ")
    # Hyphen and backslash are handled with plain string replaces rather
    # than inside the regex character class.
    s = s.replace("-", " ").replace("\\", " ")
    # re1 includes whitespace, so every run of punctuation/whitespace is
    # already reduced to a single space here; no further whitespace
    # collapsing pass is needed after the strip.
    return re1.sub(" ", s).strip()

In [64]:
import os, re, time
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize

REPL = ".\n"

def strip_non_ascii(text):
    """Return [text] with every character whose code point is 128 or above removed."""
    kept = []
    for ch in text:
        if ord(ch) <= 127:
            kept.append(ch)
    return ''.join(kept)

# Turn common html tags into sentence breaks
def pre_process_text(txt):
    """
    Replace common block / line-break html tags (and ". .") in [txt] with
    REPL, turn "&nbsp;" into a space, then repeatedly rewrite ".." as ". "
    and squeeze double spaces until none remain.

    txt     :   string
    returns :   string
    """
    # Applied strictly in this order: the adjacent "</li><li>" pair has to
    # be replaced before its two halves are.
    break_markers = (
        "</li><li>", "<li>", "</li>",
        "<br>", "<br/>", "<br />",
        "<p>", "<p/>", "<p />", "</p>",
        ". .",
    )
    for marker in break_markers:
        txt = txt.replace(marker, REPL)

    txt = txt.replace("&nbsp;", " ")

    while txt.find("..") != -1:
        txt = txt.replace("..", ". ")
    while txt.find("  ") != -1:
        txt = txt.replace("  ", " ")
    return txt

def visible(element):
    """
    Decide whether a text node from the parsed document should be kept.

    element :   text node returned by BeautifulSoup (must have a .parent)
    returns :   False for text inside style / script / head / title /
                [document] and for html comments, True otherwise
    """
    # Local import so this function does not depend on the cell-level
    # import line also pulling in Comment.
    from bs4 import Comment

    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    # In bs4 a Comment node's text does not contain the "<!--" / "-->"
    # delimiters, so the regex below cannot detect it; check the type.
    if isinstance(element, Comment):
        return False
    # Kept for text nodes that literally contain a comment-looking string.
    if re.match('<!--.*-->', strip_non_ascii(element)):
        return False
    return True

def get_text(html):
    """
    Extract the visible text of an html document.

    html    :   string
                    html markup
    returns :   string
                    the text nodes accepted by visible(), joined with REPL
    """
    # Name the parser explicitly: without it bs4 picks whichever parser
    # happens to be installed (and warns), so results could differ between
    # machines. "html.parser" ships with Python.
    bs = BeautifulSoup(html, "html.parser")
    texts = bs.findAll(text=True)
    return REPL.join(node for node in texts if visible(node))

def parse_html(html):
    """Run [html] through pre_process_text, then return the text get_text extracts from it."""
    prepared = pre_process_text(html)
    return get_text(prepared)

def split_into_sentences(txt):
    """
    Split [txt] into cleaned sentences, one per input line.

    Non-ascii characters are removed, each line is passed through
    clean_str, and results of 5 characters or fewer are dropped.

    txt     :   string
    returns :   list of strings (a real list on both Python 2 and 3, so
                it can be iterated more than once, indexed or measured)
    """
    txt = strip_non_ascii(txt)
    # Lines are used as sentences directly instead of running nltk's
    # sent_tokenize; per the original note the input has already been
    # split into sentences.
    sents = [clean_str(line) for line in txt.split("\n")]
    return [s for s in sents if len(s.strip()) > 5]

In [65]:
import ntpath

def get_file_name(path):
    """
    Return the last component of [path].

    Uses ntpath so both "/" and "\\" are treated as separators; a
    trailing separator is ignored ("a/b/c/" gives "c").

    path    :   string
    returns :   string
    """
    head, tail = ntpath.split(path)
    # tail is empty when path ends with a separator; fall back to the
    # last component of what precedes it.
    return tail or ntpath.basename(head)

start = time.time()

# Make sure the output folder exists; listing or writing into a missing
# folder would otherwise raise.
if not os.path.isdir(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)

if EMPTY_OUTPUT_FOLDER:
    if OUTPUT_FOLDER == DOCS_FOLDER:
        print("ERROR - Can't empty output folder if the same as the input folder")
    else:
        delete_files(OUTPUT_FOLDER,".*")

files = find_files(DOCS_FOLDER, FILE_MASK, True)
for i, fpath in enumerate(files):
    # Report progress before any `continue`, so skipped files do not
    # suppress the counter.
    if i % 1000 == 0:
        print(i)

    # Read the whole input and close it before doing any further work.
    with open(fpath) as f:
        contents = f.read()

    if len(contents) < FILE_SIZE_LIMIT_CHARS:
        continue
    if PARSE_HTML:
        contents = parse_html(contents)
        if len(contents) < FILE_SIZE_LIMIT_CHARS:
            continue

    sents = split_into_sentences(contents)
    doc = "\n".join(sents)

    # splitext drops only the final extension, so "a.b.txt" becomes
    # "a.b_proc.txt" instead of being truncated to "a_proc.txt" (which
    # could also make two inputs overwrite each other's output).
    base_name = os.path.splitext(get_file_name(fpath))[0]
    fout_name = os.path.join(OUTPUT_FOLDER, base_name + "_proc.txt")
    with open(fout_name, "w") as fout:
        fout.write(doc)

end = time.time()
print("Loading and processing documents took %s seconds" % str(end - start))

0
1000
Loading and processing documents took 0.599376916885 seconds
