In [0]:
import os
import random
import re
import string
from collections import defaultdict

import pandas as pd
import spacy
from unidecode import unidecode

In [0]:
# Load the small English pipeline once and expose it under both names used in
# this notebook. The bare 'en' shortcut link no longer exists in spaCy v3, and
# loading a second pipeline only costs extra start-up time and memory.
# NOTE(review): assumes the old 'en' link pointed at en_core_web_sm (the spaCy
# v2 default) -- confirm if a different English model was linked locally.
nlp = spacy.load('en_core_web_sm')
en = nlp

In [0]:
# Folder holding the story files to sample from. Set the STORIES_DIR
# environment variable to run on another machine; the default keeps the
# original location.
directory = os.environ.get('STORIES_DIR', '/Users/jaewonhyun/Downloads/stories/')

In [0]:
# Folder holding female_word_file.txt, male_word_file.txt and noun_pairs.txt.
# Set the GENDER_WORDS_DIR environment variable to run on another machine; the
# default keeps the original location.
genderDirectory = os.environ.get('GENDER_WORDS_DIR', '/Users/jaewonhyun/Downloads/')

In [0]:
def load_doc(filename):
    """Read a whole text file into memory and return it as one string.

    The file is decoded as 'utf-8-sig', so a leading UTF-8 byte-order mark is
    dropped if present.

    Args:
        filename: Path of the file to read.

    Returns:
        The complete file contents as a str.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r', encoding='utf-8-sig') as file:
        return file.read()

In [0]:
# Pronouns whose opposite-gender form depends on their grammatical role.
_PRONOUNS = frozenset(['her', 'his', 'him', 'hers'])
# Zero-width / directional marks U+200B..U+200F.
_ZERO_WIDTH_CHARS = re.compile('[\u200b-\u200f]')


def _read_lines(path):
    """Return the non-empty lines of the text file at `path`."""
    return [line for line in load_doc(path).split('\n') if len(line) > 0]


def _load_noun_pairs(path):
    """Parse "female, male" lines into (female->male, male->female) dicts."""
    female_to_male = {}
    male_to_female = {}
    for line in _read_lines(path):
        parts = [part.strip().lower() for part in line.split(',')]
        # Skip malformed lines instead of failing with IndexError.
        if len(parts) < 2 or not parts[0] or not parts[1]:
            continue
        female_to_male[parts[0]] = parts[1]
        male_to_female[parts[1]] = parts[0]
    return female_to_male, male_to_female


def _read_story(path):
    """Read one story file as text, trying UTF-8 first and ISO-8859-1 second."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        # Account for some files that may be encoded with ISO-8859-1.
        # That codec maps every byte value, so this second decode cannot fail.
        with open(path, 'r', encoding='iso-8859-1') as f:
            return f.read()


def _split_into_sentences(text):
    """Return the sentence strings of a story body, as segmented by `en`."""
    # Remove any additional information e.g. "@highlights"
    main_text_body = text.split('\n@')[0]
    found = []
    for line in main_text_body.split('\n\n'):
        # Keep only the text so the parsed spaCy docs can be freed right away.
        found.extend(sent.text for sent in en(line.strip('\n')).sents)
    return found


def _clean_sentence(text):
    """Drop newlines, non-breaking spaces and zero-width marks; lower-case."""
    text = text.replace("\n", "").replace("\xa0", "")
    # NOTE(review): the original replaced the literal text r"\u200", which
    # does nothing on real input; stripping U+200B..U+200F is the presumed intent.
    text = _ZERO_WIDTH_CHARS.sub("", text)
    return text.lower()


def _swap_pronoun(word, dep):
    """Return the opposite-gender pronoun for `word` given its dependency label."""
    if word == 'her':
        # Possessive "her" pairs with "his"; every other role pairs with "him".
        return 'his' if dep == 'poss' else 'him'
    if word == 'his':
        # Possessive "his" pairs with "her"; every other role pairs with "hers".
        return 'her' if dep == 'poss' else 'hers'
    return 'his' if word == 'hers' else 'her'


def _swap_gendered_words(sentence, all_pairs, female_to_male, male_to_female):
    """Return `sentence` with its gendered words swapped and first letter capitalised."""
    labeled = nlp(sentence)
    # Compare by text: the whitespace-split words below are plain strings.
    proper_nouns = {tok.text for tok in labeled if tok.pos_ == "PROPN"}
    # Key by character offset so the label lines up with the whitespace-split
    # word even when spaCy tokenises differently (e.g. "don't" -> "do", "n't").
    pronoun_deps = {tok.idx: tok.dep_ for tok in labeled if tok.text in _PRONOUNS}

    matches = list(re.finditer(r'\S+', sentence))
    words = [m.group() for m in matches]
    for i, match in enumerate(matches):
        word = words[i]
        if word not in all_pairs:
            continue
        # Leave a gendered word alone when a proper noun directly follows it.
        if i < len(words) - 1 and words[i + 1] in proper_nouns:
            continue
        # `word` is already lower-case here (every entry of all_pairs is), so no
        # title-case handling is needed.
        if word in _PRONOUNS:
            words[i] = _swap_pronoun(word, pronoun_deps.get(match.start()))
        elif word in female_to_male:
            words[i] = female_to_male[word]
        elif word in male_to_female:
            words[i] = male_to_female[word]
    return " ".join(words).capitalize()


def preprocess_file(filepath, output_path, sample_fraction=0.01, seed=6):
    """
    Sample story files, split them into sentences and gender-swap a subset.

    A seeded random sample of the files in `filepath` is read, split into
    sentences, cleaned, lower-cased and de-duplicated. Sentences containing any
    entry of the gender word lists (read from the module-level
    `genderDirectory`) are set aside unchanged; in the remaining sentences,
    words found in noun_pairs.txt and the pronouns her/his/him/hers are
    replaced by their counterpart.

    Args:
        filepath: Directory containing the story files.
        output_path: Directory in which "output.txt" is written.
        sample_fraction: Fraction of the files to sample (default 0.01).
        seed: Seed passed to `random.seed` before sampling (default 6).

    Returns:
        A list of sentence strings: the swapped sentences, then the sentences
        containing a gender word, then the unswapped remainder.
    """
    # os.listdir returns names in arbitrary order; sort so the seeded sample is
    # reproducible, and skip sub-directories, which cannot be read as stories.
    story_names = sorted(
        name for name in os.listdir(filepath)
        if os.path.isfile(os.path.join(filepath, name))
    )
    sample_size = min(len(story_names), round(sample_fraction * len(story_names)))
    random.seed(seed)
    sample_list = random.sample(story_names, sample_size)

    genderWords = [
        w.lower()
        for name in ('female_word_file.txt', 'male_word_file.txt')
        for w in _read_lines(os.path.join(genderDirectory, name))
    ]
    female_male_noun_pairs, male_female_noun_pairs = _load_noun_pairs(
        os.path.join(genderDirectory, 'noun_pairs.txt'))
    all_pairs = set(female_male_noun_pairs) | set(male_female_noun_pairs) | _PRONOUNS

    raw_sentences = []
    for name in sample_list:
        story_text = _read_story(os.path.join(filepath, name))
        raw_sentences.extend(_split_into_sentences(story_text))

    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # order is the same on every run (a set would not guarantee that).
    sentences = list(dict.fromkeys(_clean_sentence(s) for s in raw_sentences))

    # Defined up front so an empty sample yields empty results, not a NameError.
    special_sentences = []
    rest = []
    for sentence in sentences:
        if not sentence:
            continue
        # NOTE(review): this is a substring test, so a short entry such as "he"
        # would also match inside "the" -- confirm that is intended.
        if any(word in sentence for word in genderWords):
            special_sentences.append(sentence)
        else:
            rest.append(sentence)

    print(special_sentences)

    swapped = [
        _swap_gendered_words(sentence, all_pairs,
                             female_male_noun_pairs, male_female_noun_pairs)
        for sentence in rest
    ]
    res = swapped + special_sentences + rest

    # NOTE(review): only the unswapped `rest` sentences are written, as in the
    # original -- confirm whether `res` or `swapped` was meant instead.
    with open(os.path.join(output_path, "output.txt"), 'w', encoding='utf-8') as f:
        for item in rest:
            f.write(item + "\n")

    return res

In [0]:
# Run the pipeline over the stories directory. Despite the name `df`, the
# value returned by preprocess_file is a plain list of sentence strings, not a
# pandas DataFrame.
df = preprocess_file(directory, genderDirectory)

In [0]:
# Sanity check: a multi-word phrase can be located inside a longer sentence
# with a plain substring search.
example = "prostate cancer"
sentence = "he has prostate cancer"
if sentence.find(example) != -1:
    print(True)

True
