In [2]:
import os
import codecs
import re
from string import punctuation

In [33]:
def unicode_apostrophes(doc):
    ''' (str) -> str
    Returns doc with typographic quotation marks replaced by their ASCII counterparts.
    U+2018 and U+2019 (curly single quotes) become the ASCII apostrophe;
    U+201C and U+201D (curly double quotes) become the ASCII double quote.
    All other characters are left as they are.
    '''
    single, double = '\'', '"'
    table = str.maketrans({'\u2018': single, '\u2019': single,
                           '\u201C': double, '\u201D': double})
    return doc.translate(table)

In [34]:
def reg_exp_special_periods(doc):
    ''' (str) -> str
    Removes periods that do not end a sentence, so that every period left in the text can be read as a full stop.
    Each rule only fires when the period is followed by a space. The rules are applied in the order listed:
      - a single non-digit character between a space and a period (a middle initial) is dropped with its period;
      - titles (St, Mrs, Ms, Mr, Dr, Sr, Jr) and Latin abbreviations (vs, et al) lose their period;
      - So. and Fr. are spelled out as sophomore and freshman.
    '''
    substitutions = (
        (r" \D\. ", ' '),           # middle initials
        (r"St\. ", 'St '),          # titles
        (r"Mrs\. ", 'Mrs '),
        (r"Ms\. ", 'Ms '),
        (r"Mr\. ", 'Mr '),
        (r"Dr\. ", 'Dr '),
        (r"Sr\. ", 'Sr '),
        (r"Jr\. ", 'Jr '),
        (r"So\. ", 'sophomore '),   # class years
        (r"Fr\. ", 'freshman '),
        (r"vs\. ", 'vs '),          # Latin
        (r"et al\. ", 'et al '),
    )
    for pattern, replacement in substitutions:
        doc = re.sub(pattern, replacement, doc)
    return doc

In [35]:
def handle_acronyms(doc):
    ''' (str) -> str
    Strips every period from any whitespace-separated word that contains more than one period.
    Words with zero or one period are kept unchanged.
    The words are re-joined with single spaces, and the result ends with a trailing space
    (an empty or all-whitespace doc yields an empty string).
    '''
    cleaned = [token.replace('.', '') if token.count('.') > 1 else token
               for token in doc.split()]
    return ''.join(token + ' ' for token in cleaned)

In [36]:
def reg_exp_fixes(doc):
    ''' (str) -> str
    Uses regular expressions to modify some words for Google's Word2Vec model.
    Most substitutions deal with numbers and decimal places.

    Steps, in order:
      1. text inside parentheses or square brackets is removed (within a single line);
      2. an ellipsis (three periods) becomes a single period followed by a space;
      3. a trailing 's is removed from a word (possessives; this also shortens contractions such as it's);
      4. ordinals (1st, 2nd, 3rd, 4th), plurals of numbers (1990s), decimals and plain integers
         are each replaced by the token ' number '.
    '''
    doc = re.sub(r"\((.*?)\)", '', doc) # remove phrases in parentheses
    doc = re.sub(r"\[(.*?)\]", '', doc) # remove phrases in brackets
    doc = re.sub(r"\.\.\.", '. ', doc)
    # Only strip 's when it is attached to the end of a word. Without the look-behind and the
    # word boundary, an opening quote before a word starting with s ('sure') would be eaten too.
    doc = re.sub(r"(?<=\w)'s\b", '', doc) # handles possessives
    # Order matters: suffixed forms and decimals must be consumed before the bare-integer rule.
    number_patterns = (
        r"\d+st",
        r"\d+rd",
        r"\d+nd",
        r"\d+th",
        r"\d+s",
        r"\d+\.\d+",
        r" \.\d+",
        r"\d+",
    )
    for pattern in number_patterns:
        doc = re.sub(pattern, ' number ', doc)
    return doc

In [37]:
def fix_punctuation(doc):
    ''' (str) -> str
    Deletes or rewrites punctuation so that only sentence enders (!, ?, .) and apostrophes remain.
      - %, $, & and @ are replaced by the words percent, dollars, ampersand and at, padded with spaces;
      - hyphens, en dashes and em dashes become a single space;
      - !, ? and . are padded with a space on each side;
      - the apostrophe is left untouched;
      - every other character of string.punctuation is deleted.
    '''
    # Marks that must not be plainly deleted: the apostrophe stays, the rest get a replacement below.
    not_deleted = '!$%&\'-.?@'
    table = {mark: '' for mark in punctuation if mark not in not_deleted}

    # symbols spelled out as words
    table['%'] = ' percent '
    table['$'] = ' dollars '
    table['&'] = ' ampersand '
    table['@'] = ' at '

    # hyphen, en dash and em dash act as word separators
    for dash in ('-', u'\u2013', u'\u2014'):
        table[dash] = ' '

    # sentence enders become stand-alone tokens
    for ender in ('!', '?', '.'):
        table[ender] = ' ' + ender + ' '

    return doc.translate(str.maketrans(table))

In [38]:
def strip_apostrophes(doc):
    ''' (str) -> str
    Strips leading and trailing apostrophes from every whitespace-separated word.
    Apostrophes inside a word (contractions) are kept.
    The words are re-joined with single spaces, and the result ends with a trailing space
    (an empty or all-whitespace doc yields an empty string).
    '''
    # strip() works on whole strings only, hence the word-by-word pass
    return ''.join(token.strip('\'') + ' ' for token in doc.split())

In [48]:
def process_punctuation(source_dir=None, target_dir=None):
    ''' ([str], [str]) -> None
    Removes punctuation in .txt file that doesn't correspond to a contraction or a full stop.
    Writes new .txt file to a new location in storage.

    Every regular file in source_dir is read, cleaned, and written under the same file name
    into target_dir. Entries that are not regular files (e.g. sub-directories) are skipped.
    When a directory is not given it defaults to
    ~/Documents/RULE/UppsalaStudentCorpus/USEtexts (source) and
    ~/Documents/RULE/UppsalaStudentCorpus/USEdata (target).

    The cleaning steps run in this order: unicode_apostrophes, reg_exp_special_periods,
    reg_exp_fixes, handle_acronyms, fix_punctuation, strip_apostrophes.
    '''
    # os.path.join keeps the default locations valid on any OS, not just with Windows separators.
    corpus_root = os.path.join(os.path.expanduser('~'), 'Documents', 'RULE', 'UppsalaStudentCorpus')
    if source_dir is None:
        source_dir = os.path.join(corpus_root, 'USEtexts') # change file path accordingly
    if target_dir is None:
        target_dir = os.path.join(corpus_root, 'USEdata') # change file path accordingly

    for name in os.listdir(source_dir):
        source_file = os.path.join(source_dir, name)
        if not os.path.isfile(source_file):
            continue # open() would fail on a directory

        with open(source_file, 'r') as f:
            document = unicode_apostrophes(f.read())

        # Periods of abbreviations, numbers and acronyms must be dealt with before
        # fix_punctuation turns every remaining period into a sentence-ending token.
        document = reg_exp_special_periods(document)
        document = reg_exp_fixes(document)
        document = handle_acronyms(document)
        document = fix_punctuation(document)
        document = strip_apostrophes(document)

        # The with-block closes the output file even if the write raises.
        with open(os.path.join(target_dir, name), 'w') as new_file:
            new_file.write(document)

    return None

In [49]:
# Run the cleaning pass over the corpus; this reads files from and writes files to disk.
process_punctuation()