This notebook contains helper functions for preprocessing the complaints text for the complaint classifier in the main notebook.

In [1]:
import pandas as pd
import numpy as np

#### Get Tokens

In [1]:
# dependencies
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# stopwords list = NLTK's common English stopwords followed by every single
# punctuation character from string.punctuation (one list entry per character)
stopwords_list = list(stopwords.words('english')) + list(string.punctuation)

# define the function
def get_tokens(complaint):
    '''Tokenize a complaint string into lowercase tokens.

    The text is split with nltk's word_tokenize, every token is lowercased, and
    any token found in the module-level stopwords_list (common English stopwords
    plus single punctuation characters) is dropped.

    Returns a list of the remaining lowercase tokens, in their original order.
    '''
    # lowercase each token once, then filter on the lowercased form
    lowered_tokens = (raw_token.lower() for raw_token in word_tokenize(complaint))
    return [token for token in lowered_tokens if token not in stopwords_list]

#### Remove Redacted Items from Tokens

In [4]:
# dependency
import re

# define the function
def remove_redactions(tokens):
    '''Takes in a series of tokens by index and returns a series of tokens by index with
    redacted material removed.

    A token is treated as redacted when it contains two or more consecutive
    lowercase "x" characters (for example "xxxx" or "xx/xx/2019"). Note that this
    also removes ordinary words that happen to contain "xx". Matching is
    case-sensitive, so tokens are expected to be lowercased already.

    Parameters
    ----------
    tokens : pandas.Series
        Each value is a list of string tokens.

    Returns
    -------
    pandas.Series
        Same index as ``tokens`` (order and duplicate labels preserved); each value
        is a new list holding the tokens that are not redacted. The input lists are
        never modified or returned as-is.
    '''
    # raw string so that \S reaches the regex engine instead of being read as an
    # (invalid) string escape sequence
    redaction_pattern = re.compile(r'\S*x+x+\S*')

    def _drop_redacted(token_list):
        # find redacted items in the space-joined text, then keep only the tokens
        # that are not exactly equal to one of those matches
        redacted = set(redaction_pattern.findall(' '.join(token_list)))
        return [token for token in token_list if token not in redacted]

    # work positionally (no label lookups) so duplicate index labels are handled
    cleaned_lists = [_drop_redacted(token_list) for token_list in tokens]

    # return series of clean tokens, aligned with the original index
    return pd.Series(cleaned_lists, index=tokens.index, dtype=object)

#### Lemmatize Clean Tokens

In [1]:
# HELPER FUNCTION
# download wordnet & tagsets if you haven't already
# (these calls run every time the cell executes and print a status line per package)
# NOTE(review): lemmatize() below calls nltk.pos_tag, which presumably needs the
# 'averaged_perceptron_tagger' resource rather than 'tagsets' -- confirm that it, and
# the 'punkt' / 'stopwords' data used by get_tokens, are downloaded somewhere.
import nltk
nltk.download('wordnet')
nltk.download('tagsets')

# dependency for helper function
from nltk.corpus import wordnet

# helper function to get wordnet pos_tags from nltk pos_tags
def get_wordnet_pos(tag):
    '''Translate a POS tag string (as produced by nltk.pos_tag) into the matching
    wordnet POS constant, for use with the WordNet lemmatizer.

    Tags starting with "J" map to adjective, "V" to verb, "N" to noun and "R" to
    adverb; any other tag falls back to noun.
    '''
    # (tag prefix, wordnet constant) pairs, checked in order
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_tag in prefix_table:
        if tag.startswith(prefix):
            return wordnet_tag
    # unrecognised tags are treated as nouns
    return wordnet.NOUN

# MAIN FUNCTION
# dependencies
from nltk.stem.wordnet import WordNetLemmatizer
import nltk

# define function to apply lemmatizer to wordnet-tagged words to get lemmas
def lemmatize(clean_tokens):
    '''Lemmatize every token list in a series.

    Takes in a series whose values are lists of clean tokens and returns a series,
    on the same index, whose values are the corresponding lists of lemmas.
    '''
    # one lemmatizer instance shared by all rows of this call
    wordnet_lemmatizer = WordNetLemmatizer()

    def lemmas_for(token_list):
        # tag the tokens with nltk, convert each tag to its wordnet equivalent,
        # then lemmatize the word using that part of speech
        lemmas = []
        for word, nltk_tag in nltk.pos_tag(token_list):
            lemmas.append(wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(nltk_tag)))
        return lemmas

    return clean_tokens.map(lemmas_for)

# clean_data['lemmas'] = clean_data.clean_tokens.map(
# lambda x: [lemmatizer.lemmatize(*t) for t in\
#           [(pair[0], get_wordnet_pos(pair[1])) for pair in nltk.pos_tag(x)]
#          ])

[nltk_data] Downloading package wordnet to /Users/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package tagsets to /Users/user/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
