In [1]:
import spacy

In [2]:
def batch_preprocess_texts(
    texts,
    nlp=None,
    remove_stopwords=True,
    remove_punct=True,
    use_lemmas=False,
    disable=["ner"],
    batch_size=50,
    n_process=-1,
):
    """Efficiently preprocess a collection of texts using nlp.pipe()

    Args:
        texts (collection of strings): collection of texts to process (e.g. df['text'])
        nlp (spacy pipe), optional): Spacy nlp pipe. Defaults to None; if None, it creates a default 'en_core_web_sm' pipe.
        remove_stopwords (bool, optional): Controls stopword removal. Defaults to True.
        remove_punct (bool, optional): Controls punctuation removal. Defaults to True.
        use_lemmas (bool, optional): lemmatize tokens. Defaults to False.
        disable (list of strings, optional): named pipeline elements to disable. Defaults to ["ner"]: Used with nlp.pipe(disable=disable)
        batch_size (int, optional): Number of texts to process in a batch. Defaults to 50.
        n_process (int, optional): Number of CPU processors to use. Defaults to -1 (meaning all CPU cores).

    Returns:
        list of tokens
    """
    # from tqdm.notebook import tqdm
    from tqdm import tqdm

    if nlp is None:
        nlp = spacy.load("en_core_web_sm")

    processed_texts = []

    for doc in tqdm(nlp.pipe(texts, disable=disable, batch_size=batch_size, n_process=n_process)):
        tokens = []
        for token in doc:
            # Check if should remove stopwords and if token is stopword
            if (remove_stopwords == True) and (token.is_stop == True):
                # Continue the loop with the next token
                continue

            # Check if should remove stopwords and if token is stopword
            if (remove_punct == True) and (token.is_punct == True):
                continue

            # Check if should remove stopwords and if token is stopword
            if (remove_punct == True) and (token.is_space == True):
                continue

            
            ## Determine final form of output list of tokens/lemmas
            if use_lemmas:
                tokens.append(token.lemma_.lower())
            else:
                tokens.append(token.text.lower())

        processed_texts.append(tokens)
    return processed_texts

In [3]:
# Define sample text
sample_text = "While running in Central Park, I noticed that the constant buzzing of flies was annoying. I don't like flies, but I couldn't be too upset as they were likely attracted to the McDonald's food that someone carelessly dropped. I wondered, 'How can they be so uncaring?'"
sample_text

"While running in Central Park, I noticed that the constant buzzing of flies was annoying. I don't like flies, but I couldn't be too upset as they were likely attracted to the McDonald's food that someone carelessly dropped. I wondered, 'How can they be so uncaring?'"

In [4]:
# Do not remove stopwords
tokens_keep_all_stop = batch_preprocess_texts(sample_text, remove_stopwords = False)
print(tokens_keep_all_stop)

266it [00:08, 32.54it/s]

[['w'], ['h'], ['i'], ['l'], ['e'], [], ['r'], ['u'], ['n'], ['n'], ['i'], ['n'], ['g'], [], ['i'], ['n'], [], ['c'], ['e'], ['n'], ['t'], ['r'], ['a'], ['l'], [], ['p'], ['a'], ['r'], ['k'], [], [], ['i'], [], ['n'], ['o'], ['t'], ['i'], ['c'], ['e'], ['d'], [], ['t'], ['h'], ['a'], ['t'], [], ['t'], ['h'], ['e'], [], ['c'], ['o'], ['n'], ['s'], ['t'], ['a'], ['n'], ['t'], [], ['b'], ['u'], ['z'], ['z'], ['i'], ['n'], ['g'], [], ['o'], ['f'], [], ['f'], ['l'], ['i'], ['e'], ['s'], [], ['w'], ['a'], ['s'], [], ['a'], ['n'], ['n'], ['o'], ['y'], ['i'], ['n'], ['g'], [], [], ['i'], [], ['d'], ['o'], ['n'], [], ['t'], [], ['l'], ['i'], ['k'], ['e'], [], ['f'], ['l'], ['i'], ['e'], ['s'], [], [], ['b'], ['u'], ['t'], [], ['i'], [], ['c'], ['o'], ['u'], ['l'], ['d'], ['n'], [], ['t'], [], ['b'], ['e'], [], ['t'], ['o'], ['o'], [], ['u'], ['p'], ['s'], ['e'], ['t'], [], ['a'], ['s'], [], ['t'], ['h'], ['e'], ['y'], [], ['w'], ['e'], ['r'], ['e'], [], ['l'], ['i'], ['k'], ['e'], ['l'], ['y'],




In [5]:
# Remove default stopwords
tokens_remove_default_stop = batch_preprocess_texts(sample_text)
print(tokens_remove_default_stop)

266it [00:08, 32.99it/s]

[['w'], ['h'], [], ['l'], ['e'], [], ['r'], ['u'], ['n'], ['n'], [], ['n'], ['g'], [], [], ['n'], [], ['c'], ['e'], ['n'], ['t'], ['r'], [], ['l'], [], ['p'], [], ['r'], ['k'], [], [], [], [], ['n'], ['o'], ['t'], [], ['c'], ['e'], ['d'], [], ['t'], ['h'], [], ['t'], [], ['t'], ['h'], ['e'], [], ['c'], ['o'], ['n'], ['s'], ['t'], [], ['n'], ['t'], [], ['b'], ['u'], ['z'], ['z'], [], ['n'], ['g'], [], ['o'], ['f'], [], ['f'], ['l'], [], ['e'], ['s'], [], ['w'], [], ['s'], [], [], ['n'], ['n'], ['o'], ['y'], [], ['n'], ['g'], [], [], [], [], ['d'], ['o'], ['n'], [], ['t'], [], ['l'], [], ['k'], ['e'], [], ['f'], ['l'], [], ['e'], ['s'], [], [], ['b'], ['u'], ['t'], [], [], [], ['c'], ['o'], ['u'], ['l'], ['d'], ['n'], [], ['t'], [], ['b'], ['e'], [], ['t'], ['o'], ['o'], [], ['u'], ['p'], ['s'], ['e'], ['t'], [], [], ['s'], [], ['t'], ['h'], ['e'], ['y'], [], ['w'], ['e'], ['r'], ['e'], [], ['l'], [], ['k'], ['e'], ['l'], ['y'], [], [], ['t'], ['t'], ['r'], [], ['c'], ['t'], ['e'], ['d']




In [6]:
# Looop to find words removed
removed_tokens = []
for token in tokens_keep_all_stop:
    if token not in tokens_remove_default_stop:
        removed_tokens.append(token)
removed_tokens

[['i'],
 ['i'],
 ['i'],
 ['a'],
 ['a'],
 ['i'],
 ['i'],
 ['a'],
 ['a'],
 ['i'],
 ['i'],
 ['a'],
 ['a'],
 ['i'],
 ['i'],
 ['i'],
 ['i'],
 ['i'],
 ['a'],
 ['i'],
 ['a'],
 ['a'],
 ['a'],
 ['a'],
 ['a'],
 ['i'],
 ['a'],
 ['a'],
 ['i']]

In [7]:
# Define custom nlp pipeline
custom_nlp = spacy.load('en_core_web_sm')
# Let's start by accessing spaCy's default stopwords
spacy_stopwords = custom_nlp.Defaults.stop_words
spacy_stopwords

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [8]:
# How many default stopwords?
len(spacy_stopwords)

326

In [9]:
# We can include additional stopwords by adding them to the default set
# Add custom stopwords
custom_stopwords = ["food", "likely",'upset','carelessly']
for word in custom_stopwords:
    # Add the word to the list of stopwords (for easily tracking stopwords)
    custom_nlp.Defaults.stop_words.add(word)
    # Set the is_stop attribute for the word in the vocab dict to true. 
    # this is what will actually determine spacy treating the word as a stop word
    custom_nlp.vocab[word].is_stop = True
updated_spacy_stopwords = custom_nlp.Defaults.stop_words
len(updated_spacy_stopwords)

330

In [10]:
# Remove stopwords
remove_stopwords = ["but", "someone"]
for word in remove_stopwords:
    custom_nlp.Defaults.stop_words.discard(word)
    # Ensure the words are not recognized as stopwords
    custom_nlp.vocab[word].is_stop = False
updated_spacy_stopwords = custom_nlp.Defaults.stop_words
len(updated_spacy_stopwords)

328

In [11]:
# Process text with custom nlp pipeline
custom_stopwords_removed = batch_preprocess_texts(sample_text, nlp = custom_nlp)
print(custom_stopwords_removed)

266it [00:08, 32.90it/s]

[['w'], ['h'], [], ['l'], ['e'], [], ['r'], ['u'], ['n'], ['n'], [], ['n'], ['g'], [], [], ['n'], [], ['c'], ['e'], ['n'], ['t'], ['r'], [], ['l'], [], ['p'], [], ['r'], ['k'], [], [], [], [], ['n'], ['o'], ['t'], [], ['c'], ['e'], ['d'], [], ['t'], ['h'], [], ['t'], [], ['t'], ['h'], ['e'], [], ['c'], ['o'], ['n'], ['s'], ['t'], [], ['n'], ['t'], [], ['b'], ['u'], ['z'], ['z'], [], ['n'], ['g'], [], ['o'], ['f'], [], ['f'], ['l'], [], ['e'], ['s'], [], ['w'], [], ['s'], [], [], ['n'], ['n'], ['o'], ['y'], [], ['n'], ['g'], [], [], [], [], ['d'], ['o'], ['n'], [], ['t'], [], ['l'], [], ['k'], ['e'], [], ['f'], ['l'], [], ['e'], ['s'], [], [], ['b'], ['u'], ['t'], [], [], [], ['c'], ['o'], ['u'], ['l'], ['d'], ['n'], [], ['t'], [], ['b'], ['e'], [], ['t'], ['o'], ['o'], [], ['u'], ['p'], ['s'], ['e'], ['t'], [], [], ['s'], [], ['t'], ['h'], ['e'], ['y'], [], ['w'], ['e'], ['r'], ['e'], [], ['l'], [], ['k'], ['e'], ['l'], ['y'], [], [], ['t'], ['t'], ['r'], [], ['c'], ['t'], ['e'], ['d']




In [12]:
# List of contractions to keep as single tokens
contractions = ["don't", "couldn't"]
# Loop through the contractions list and add special cases
for contraction in contractions:
    special_case = [{"ORTH": contraction}]
    custom_nlp.tokenizer.add_special_case(contraction, special_case)
keep_contractions = preprocess_texts(sample_text, nlp = custom_nlp)
print(keep_contractions)

NameError: name 'preprocess_texts' is not defined

In [13]:
def make_custom_nlp(
    disable=["ner"],
    contractions=["don't", "can't", "couldn't", "you'd", "I'll"],
    stopwords_to_add=[],
    stopwords_to_remove=[],
    spacy_model = "en_core_web_sm"
):
    """Returns a custom spacy nlp pipeline.
    
    Args:
        disable (list, optional): Names of pipe components to disable. Defaults to ["ner"].
        contractions (list, optional): List of contractions to add as special cases. Defaults to ["don't", "can't", "couldn't", "you'd", "I'll"].
        stopwords_to_add(list, optional): List of words to set as stopwords (word.is_stop=True)
        stopwords_to_remove(list, optional): List of words to remove from stopwords (word.is_stop=False)
        spacy_model(string, optional): String to select a spacy language model. (Defaults to "en_core_web_sm".)
                            Additional Options:  "en_core_web_md", "en_core_web_lg"; 
                            (Must first download the model by name in the terminal:
                            e.g.  "python -m spacy download en_core_web_lg" )
            
    Returns:
        nlp pipeline: spacy pipeline with special cases and updated nlp.Default.stopwords
    """
    # Load the English NLP model
    nlp = spacy.load(spacy_model, disable=disable)
    
    # Adding Special Cases 
    # Loop through the contractions list and add special cases
    for contraction in contractions:
        special_case = [{"ORTH": contraction}]
        nlp.tokenizer.add_special_case(contraction, special_case)
    
    # Adding stopwords
    for word in stopwords_to_add:
        # Set the is_stop attribute for the word in the vocab dict to true.
        nlp.vocab[
            word
        ].is_stop = True  # this determines spacy's treatmean of the word as a stop word
        # Add the word to the list of stopwords (for easily tracking stopwords)
        nlp.Defaults.stop_words.add(word)
    
    # Removing Stopwords
    for word in stopwords_to_remove:
        
        # Ensure the words are not recognized as stopwords
        nlp.vocab[word].is_stop = False
        nlp.Defaults.stop_words.discard(word)
        
    return nlp

In [14]:
# Customize the nlp pipeline
function_nlp = make_custom_nlp(    
    disable=['ner', 'parser'],
    contractions=["don't"],
    stopwords_to_add=['park'],
    stopwords_to_remove=['while'],
    spacy_model = "en_core_web_sm"
)
# call preprocessing function with custom nlp pipeline
tokens = batch_preprocess_texts([sample_text], nlp = function_nlp)
print(tokens[0])

1it [00:08,  8.15s/it]

['while', 'running', 'central', 'noticed', 'constant', 'buzzing', 'flies', 'annoying', "don't", 'like', 'flies', 'but', 'attracted', 'mcdonald', 'someone', 'dropped', 'wondered', 'uncaring']



