In [1]:
import contractions
import nltk
import pandas as pd
import re
import spacy

from gensim.models import Phrases
from gensim.models.phrases import Phraser
from nltk.corpus import stopwords as nltk_stopwords
from spacy.lang.en.stop_words import STOP_WORDS as spacy_stopwords
from unidecode import unidecode

In [2]:
def return_strings(doc):
    """Extract the free-text answer from one raw line of the export.

    The line is split on commas and the first field is discarded. The
    remaining fields are re-joined with single spaces (so commas inside the
    answer become spaces), the "Question: disc_exp; Answer: " boilerplate is
    removed, surrounding whitespace and quote characters are stripped, and
    any newlines are replaced by spaces.
    """
    _, *fields = doc.split(",")
    cleaned = " ".join(fields)
    cleaned = cleaned.replace("Question: disc_exp; Answer: ", "")
    # Order matters: whitespace first, then single quotes, then double quotes.
    for chars in (None, "'", '"'):
        cleaned = cleaned.strip(chars)
    return cleaned.replace("\n", " ")


def make_df(docs):
    """Create a dataframe while calling return_strings() on text.

    Parameters
    ----------
    docs : list of str
        Raw lines of the export. If the first line contains "Media Title"
        it is treated as a header row and skipped.

    Returns
    -------
    pandas.DataFrame
        Columns "doc_id" (the text before the first comma of each line) and
        "text" (the output of return_strings()). Lines whose cleaned text is
        empty are dropped. An empty `docs` yields an empty frame with the
        same two columns.
    """
    # Check that there is a first line before peeking at it, so an empty
    # input produces an empty frame instead of an IndexError.
    if docs and "Media Title" in docs[0]:
        docs = docs[1:]
    rows = []
    for doc in docs:
        doc_text = return_strings(doc)
        if doc_text:
            rows.append((doc.split(",")[0], doc_text))
    return pd.DataFrame(rows, columns=["doc_id", "text"])


def get_stop_words(nlp):
    """Return the union of the spaCy and NLTK English stop-word lists.

    The `nlp` argument is accepted but not read by this function.
    """
    nltk_english = nltk_stopwords.words("english")
    return spacy_stopwords.union(set(nltk_english))


def fix_ordinal_nums(word):
    """Normalize ordinal numbers in strings.

    A token made of digits followed by "st", "nd", "rd" or "th" (for example
    "1st" or "22nd") is replaced by its spelled-out English ordinal. Leading
    and trailing non-word characters such as quotes or a trailing comma are
    preserved around the converted number.

    Any other token is returned unchanged, including tokens that merely
    contain one of the suffix letters after a digit ("10s", "2nd-hand").
    The input is also returned unchanged when it is not a string, when the
    num2words package cannot be imported, or when the conversion fails.
    """
    if not isinstance(word, str):
        return word
    # Alternation of whole suffixes; a character class such as [(st)(nd)...]
    # would match any single one of those letters.
    match = re.fullmatch(r"(\W*)(\d+)(?:st|nd|rd|th)(\W*)", word, flags=re.IGNORECASE)
    if match is None:
        return word
    # Imported here because the notebook's import cell does not bring
    # num2words into scope; without the package the token is left untouched.
    try:
        from num2words import num2words
    except ImportError:
        return word
    prefix, digits, suffix = match.groups()
    try:
        spelled = num2words(int(digits), lang="en", to="ordinal")
    except (ArithmeticError, NotImplementedError, TypeError, ValueError):
        # Best effort: keep the original token rather than a half-edited one.
        return word
    return f"{prefix}{spelled}{suffix}"


# Shared spaCy pipeline used as the default `nlp` argument of preprocess().
nlp = spacy.load("en_core_web_sm")
# Part-of-speech tags whose tokens preprocess() discards by default.
pos_to_remove = {"NUM", "PUNCT", "SYM", "X", "CCONJ", "DET", "ADP"}
# Merged NLTK + spaCy stop-word set used as the default filter in preprocess().
stop_words = get_stop_words(nlp)


def preprocess(response, stop_words=stop_words, nlp=nlp, pos_to_remove=pos_to_remove, keep_pronouns=False):
    """Main function for preprocessing. See Table 1 and description.

    Steps, in order:
      1. pass the text through unidecode() and lowercase it;
      2. expand contractions word by word with contractions.fix();
      3. run each whitespace-separated token through fix_ordinal_nums();
      4. parse with `nlp` and drop tokens whose text is in `stop_words` or
         whose part-of-speech tag is in `pos_to_remove`;
      5. replace the surviving tokens by their lemmas;
      6. replace every character outside a-z with a space and split on
         whitespace.

    Parameters
    ----------
    response : str
        Raw response text.
    stop_words : set of str
        Words to discard.
    nlp : callable
        spaCy pipeline used for tagging and lemmatizing.
    pos_to_remove : set of str
        Part-of-speech tags to discard.
    keep_pronouns : bool
        When True, tokens tagged "PRON" keep their surface text instead of
        their lemma, and lemmas are not checked against `stop_words`.

    Returns
    -------
    list of str or None
        The cleaned tokens, or None when no token survives.
    """
    response = unidecode(response)
    # A single contraction can expand to several words, hence the inner split.
    expanded = [w for word in response.lower().split() for w in contractions.fix(word).split()]
    doc = nlp(" ".join(fix_ordinal_nums(word) for word in expanded))
    if keep_pronouns:
        lemmas = [
            token.lemma_ if token.pos_ != "PRON" else token.text
            for token in doc
            if token.text not in stop_words and token.pos_ not in pos_to_remove
        ]
    else:
        lemmas = [
            token.lemma_
            for token in doc
            if token.text not in stop_words
            and token.lemma_ not in stop_words
            and token.pos_ not in pos_to_remove
        ]
    cleaned = re.sub(r"[^a-z]", " ", " ".join(lemmas))
    # split() already collapses runs of whitespace. Returning None for an
    # empty token list keeps the "nothing left" result consistent even when
    # the cleaned string consists of spaces only.
    tokens = cleaned.split()
    return tokens or None


def train_ngram_model(corpus_, min_count=5, threshold=100, inc_trigrams=True, verbose=False):
    """Identify ngrams in the corpus and replace them with versions connected by underscores.

    Parameters
    ----------
    corpus_ : pandas.DataFrame
        Must contain a "cleaned" column of space-separated tokens. Rows whose
        "cleaned" value is the empty string are dropped.
    min_count, threshold
        Passed unchanged to gensim's Phrases for both the bigram pass and
        the optional second pass.
    inc_trigrams : bool
        When True, a second Phrases model is trained on the output of the
        first pass so that already-joined pairs can be merged again.
    verbose : bool
        When True, print every document that contains at least one token
        joined with an underscore.

    Returns
    -------
    pandas.DataFrame
        A copy of the filtered rows with an added "ngrams" column holding
        the token list of each document. The caller's frame is not modified.
    """
    # Explicit copy: adding a column to the result of a boolean filter can
    # otherwise raise pandas' SettingWithCopyWarning.
    corpus_ = corpus_[corpus_["cleaned"] != ""].copy()
    preprocessed_ = [p.split() for p in corpus_["cleaned"]]
    bigram_model = Phraser(Phrases(preprocessed_, min_count=min_count, threshold=threshold))
    ngrams = list(bigram_model[preprocessed_])
    if inc_trigrams:
        trigram_model = Phraser(Phrases(ngrams, min_count=min_count, threshold=threshold))
        ngrams = list(trigram_model[ngrams])
    corpus_["ngrams"] = ngrams
    if verbose:
        for n in ngrams:
            # Look inside the document's tokens; testing the outer list of
            # documents for "_" can never succeed.
            if any("_" in token for token in n):
                print(n)
    return corpus_

## Effects of Preprocessing Steps (See Table 1 in Paper)

In [3]:
# Walk a single response through the pipeline one stage at a time.
example = "Told 'pregnancy was a choice...not an illness' so shouldn't be allowed to use sick leave for maternity leave."
print(f"Original: {example}\n")

lowercased = example.lower()
print(f"Lowercase: {lowercased}\n")

# Expand contractions token by token; one contraction may yield several words.
expanded_words = []
for word in lowercased.split():
    expanded_words.extend(contractions.fix(word).split())
example = " ".join(expanded_words)
print(f"Expanded contractions: {example}\n")

fully_cleaned = " ".join(preprocess(example))
print(f"Everything but ngrams: {fully_cleaned}")

Original: Told 'pregnancy was a choice...not an illness' so shouldn't be allowed to use sick leave for maternity leave.

Lowercase: told 'pregnancy was a choice...not an illness' so shouldn't be allowed to use sick leave for maternity leave.

Expanded contractions: told 'pregnancy was a choice...not an illness' so should not be allowed to use sick leave for maternity leave.

Everything but ngrams: tell pregnancy choice illness allow use sick leave maternity leave


## Preprocessing the Corpus

In [None]:
# Read the raw export line by line; the context manager closes the file
# handle as soon as its contents have been read.
with open("PMG 2016 Discrimination Data Excerpts All-original data 1.14.20_utf_8.csv", "r", encoding="utf-8") as csv_file:
    docs = csv_file.read().split("\n")
df = make_df(docs)
responses = df["text"].values
# Strip surrounding whitespace and quote characters from each response.
responses = [response.strip().strip("'").strip('"') for response in responses]

In [None]:
# Clean every response; entry i of `preprocessed` corresponds to responses[i]
# and is either a token list or None.
preprocessed = list(map(preprocess, responses))

assert len(responses) == len(preprocessed)

In [None]:
# NOTE: this cell reads df["doc_id"] and then rebinds `df`, so it must run
# exactly once after the loading cell; re-running it alone would read the
# integer IDs written below instead of the media titles.
media_titles = df["doc_id"].values
IDs = list(range(len(responses)))

# Join each token list back into a string; responses with no tokens become "".
cleaned_texts = []
for tokens in preprocessed:
    cleaned_texts.append(" ".join(tokens) if tokens else "")

d = {
    "media_title": media_titles,
    "doc_id": IDs,
    "original": responses,
    "cleaned": cleaned_texts,
}

# Keep only the first occurrence of each distinct original response.
df = pd.DataFrame(d).drop_duplicates(subset="original", keep="first")

In [None]:
# Detect n-grams across the de-duplicated corpus, then store each document's
# token list as a single space-joined string in "text".
ngrams = train_ngram_model(df, min_count=5, threshold=0.1)
joined_docs = []
for tokens in ngrams["ngrams"].values:
    joined_docs.append(" ".join(tokens))
ngrams["text"] = joined_docs

In [None]:
# Write the n-gram frame to a CSV file in the working directory.
f = "ngrams_df.csv"
ngrams.to_csv(path_or_buf=f)