In [3]:
!pip install -U pip setuptools wheel
!pip install -U 'spacy[apple]'
!python -m spacy download en_core_web_sm

Collecting pip
  Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting setuptools
  Using cached setuptools-69.2.0-py3-none-any.whl (821 kB)
Collecting wheel
  Using cached wheel-0.43.0-py3-none-any.whl (65 kB)
Installing collected packages: wheel, setuptools, pip
  Attempting uninstall: wheel
    Found existing installation: wheel 0.38.4
    Uninstalling wheel-0.38.4:
      Successfully uninstalled wheel-0.38.4
  Attempting uninstall: setuptools
    Found existing installation: setuptools 66.0.0
    Uninstalling setuptools-66.0.0:
      Successfully uninstalled setuptools-66.0.0
  Attempting uninstall: pip
    Found existing installation: pip 23.0.1
    Uninstalling pip-23.0.1:
      Successfully uninstalled pip-23.0.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is

In [4]:
# Load the small English spaCy pipeline and display the loaded pipeline object
import spacy
nlp = spacy.load('en_core_web_sm')
nlp

<spacy.lang.en.English at 0x1380794e0>

In [5]:
# Names of the processing steps (components) in the loaded pipeline
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [6]:
# Load a second copy of the pipeline with the 'ner' component disabled
nlp_no_ner = spacy.load("en_core_web_sm", disable=['ner'])
# Show the active components ('ner' is no longer listed)
nlp_no_ner.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer']

In [7]:
# Define the text used for the demonstration.
# The embedded "\n" shows up later as its own whitespace token.
# NOTE(review): "surounded" is misspelled and there is no space after the comma
# before it; the lemma 'surounde' in later outputs reflects this - confirm
# whether the typo is intentional before correcting it.
sample_text = "While running in Central Park, \nI noticed a discarded McDonald's container,surounded by buzzing flies was annoying."
print(sample_text)

While running in Central Park, 
I noticed a discarded McDonald's container,surounded by buzzing flies was annoying.


In [8]:
# Run the sample text through the nlp pipeline to create a doc,
# then confirm the type of the returned object
doc = nlp(sample_text)
type(doc)

spacy.tokens.doc.Doc

In [9]:
# Both printing the doc and displaying it as the cell's last expression
# show the original text
print(doc)
doc

While running in Central Park, 
I noticed a discarded McDonald's container,surounded by buzzing flies was annoying.


While running in Central Park, 
I noticed a discarded McDonald's container,surounded by buzzing flies was annoying.

In [10]:
# Doc objects contain Token objects

In [11]:
# Iterating over the doc yields its tokens; print every token on its own line
for token in doc:
    print(token)

While
running
in
Central
Park
,


I
noticed
a
discarded
McDonald
's
container
,
surounded
by
buzzing
flies
was
annoying
.


In [12]:
# Indexing the doc returns a single token (index 1 is the second token)
token = doc[1]
token

running

In [None]:
# a) token.text: The original form of the word.

In [13]:
# Original text of the token selected above (doc[1])
print(token.text)

running


In [None]:
#b) token.lemma_: The base or root form of the word.

In [14]:
# Lemma (base form) of the selected token
print(token.lemma_)

run


In [None]:
#c) token.pos_: The part-of-speech tag associated with the token.

In [15]:
# Part-of-speech tag of the selected token
print(token.pos_)

VERB


In [None]:
#d) token.is_stop: Boolean flag to check if the token is a stop word.

In [16]:
# Whether the selected token is flagged as a stop word
print(token.is_stop)

False


In [None]:
#e) token.is_punct: Boolean flag to check if the token is punctuation.

In [17]:
# Whether the selected token is flagged as punctuation
print(token.is_punct)

False


In [None]:
# f) token.is_space: Boolean flag to check if the token is a whitespace character (e.g., a new line "\n").

In [18]:
# Whether the selected token is flagged as whitespace
print(token.is_space)

False


In [19]:
import pandas as pd

# Build one record per token holding the attributes we want to inspect
token_data = [
    {
        ".text": token.text,
        ".lemma_": token.lemma_,
        ".pos_": token.pos_,
        ".is_stop": token.is_stop,
        ".is_punct": token.is_punct,
        ".is_space": token.is_space,
    }
    for token in doc
]

# Convert the list of records to a dataframe and preview the first 10 rows
spacy_df = pd.DataFrame(token_data)
spacy_df.head(10)

Unnamed: 0,.text,.lemma_,.pos_,.is_stop,.is_punct,.is_space
0,While,while,SCONJ,True,False,False
1,running,run,VERB,False,False,False
2,in,in,ADP,True,False,False
3,Central,Central,PROPN,False,False,False
4,Park,Park,PROPN,False,False,False
5,",",",",PUNCT,False,True,False
6,\n,\n,SPACE,False,False,True
7,I,I,PRON,True,False,False
8,noticed,notice,VERB,False,False,False
9,a,a,DET,True,False,False


In [20]:
# Preprocessing with SpaCy

# Remove stopwords: keep the lowercased .text of every token
# that is not flagged as a stopword
cleaned_tokens = [token.text.lower() for token in doc if not token.is_stop]
print(cleaned_tokens)

['running', 'central', 'park', ',', '\n', 'noticed', 'discarded', 'mcdonald', 'container', ',', 'surounded', 'buzzing', 'flies', 'annoying', '.']


In [21]:
## Adding onto our preprocessing step
# Remove stopwords, punctuation and whitespace (spaces, new lines, etc.):
# keep the lowercased .text of every token that carries none of those flags
cleaned_tokens = [
    token.text.lower()
    for token in doc
    if not (token.is_stop or token.is_punct or token.is_space)
]

print(cleaned_tokens)

['running', 'central', 'park', 'noticed', 'discarded', 'mcdonald', 'container', 'surounded', 'buzzing', 'flies', 'annoying']


In [22]:
# Obtaining Lemmas

## Same filtering as before (stopwords, punctuation, whitespace removed),
## but keep each remaining token's lowercased .lemma_ instead of its .text
cleaned_lemmas = [
    token.lemma_.lower()
    for token in doc
    if not (token.is_stop or token.is_punct or token.is_space)
]

print(cleaned_lemmas)

['run', 'central', 'park', 'notice', 'discard', 'mcdonald', 'container', 'surounde', 'buzz', 'fly', 'annoying']


In [23]:
# Compare the cleaned token texts with the cleaned lemmas side by side
print("Tokenized words:\n", cleaned_tokens,"\n")
print("Lemmatized words:\n", cleaned_lemmas)

Tokenized words:
 ['running', 'central', 'park', 'noticed', 'discarded', 'mcdonald', 'container', 'surounded', 'buzzing', 'flies', 'annoying'] 

Lemmatized words:
 ['run', 'central', 'park', 'notice', 'discard', 'mcdonald', 'container', 'surounde', 'buzz', 'fly', 'annoying']


In [24]:
# Functionalizing Preprocessing with SpaCy

In [25]:
def preprocess_doc(doc, remove_stopwords=True, remove_punct=True, use_lemmas=False):
    """Convert a processed doc into a list of lowercased token strings.

    Temporary function - for education purposes (we will make something
    better below).

    Args:
        doc: iterable of token objects exposing .text, .lemma_, .is_stop,
            .is_punct and .is_space (e.g. the doc returned by nlp(text)).
        remove_stopwords (bool, optional): drop tokens flagged as stopwords.
            Defaults to True.
        remove_punct (bool, optional): drop tokens flagged as punctuation.
            Whitespace tokens (spaces, new lines, etc.) are dropped by the
            same switch. Defaults to True.
        use_lemmas (bool, optional): return each kept token's lemma instead
            of its original text. Defaults to False.

    Returns:
        list of str: the lowercased text (or lemma) of every kept token,
        in document order.
    """
    tokens = []
    for token in doc:
        # Skip stopwords when stopword removal is requested
        if remove_stopwords and token.is_stop:
            continue

        # Skip punctuation AND whitespace tokens when punctuation removal is
        # requested (both are controlled by remove_punct)
        if remove_punct and (token.is_punct or token.is_space):
            continue

        # Only read .lemma_ when lemmas were actually requested
        kept_form = token.lemma_ if use_lemmas else token.text
        tokens.append(kept_form.lower())
    return tokens

In [26]:
# Convert the text to a doc.
doc = nlp(sample_text)
# Tokenizing, keeping stopwords and punctuation
# (whitespace tokens such as '\n' are kept too, since remove_punct=False)
dirty_tokens = preprocess_doc(doc, remove_stopwords=False,remove_punct=False)
print(dirty_tokens)

['while', 'running', 'in', 'central', 'park', ',', '\n', 'i', 'noticed', 'a', 'discarded', 'mcdonald', "'s", 'container', ',', 'surounded', 'by', 'buzzing', 'flies', 'was', 'annoying', '.']


In [27]:
# Tokenizing, removing stopwords and punctuation (whitespace tokens are removed as well)
cleaned_tokens = preprocess_doc(doc, remove_stopwords=True,remove_punct=True)
print(cleaned_tokens)

['running', 'central', 'park', 'noticed', 'discarded', 'mcdonald', 'container', 'surounded', 'buzzing', 'flies', 'annoying']


In [28]:
# Lemmatizing, removing stopwords and punctuation (whitespace tokens are removed as well)
cleaned_lemmas = preprocess_doc(doc, remove_stopwords=True,remove_punct=True, use_lemmas=True)
print(cleaned_lemmas)

['run', 'central', 'park', 'notice', 'discard', 'mcdonald', 'container', 'surounde', 'buzz', 'fly', 'annoying']


In [29]:
# Batch Preprocessing with SpaCy

# Example Framework (Not runnable: text1, text2 and text3 are placeholders
# for real strings)
list_of_texts = [text1, text2, text3]
processed_texts = []
# nlp.pipe takes the whole collection of texts and yields one doc per text
for doc in nlp.pipe(list_of_texts):
    doc_tokens = []
    for token in doc:
        # ... the same logic from our preprocess docs function.
        doc_tokens.append(token.text.lower())

    # Append the list of tokens for current doc to processed_texts
    processed_texts.append(doc_tokens)

NameError: name 'text1' is not defined

In [30]:
def batch_preprocess_texts(
    texts,
    nlp=None,
    remove_stopwords=True,
    remove_punct=True,
    use_lemmas=False,
    disable=["ner"],
    batch_size=50,
    n_process=-1,
):
    """Efficiently preprocess a collection of texts using nlp.pipe()

    Args:
        texts (collection of strings): collection of texts to process
            (e.g. df['text']). A single string is treated as a collection
            containing that one text.
        nlp (spacy pipe, optional): Spacy nlp pipe. Defaults to None; if None,
            it creates a default 'en_core_web_sm' pipe.
        remove_stopwords (bool, optional): Controls stopword removal.
            Defaults to True.
        remove_punct (bool, optional): Controls punctuation removal; whitespace
            tokens (spaces, new lines, etc.) are removed by the same switch.
            Defaults to True.
        use_lemmas (bool, optional): lemmatize tokens. Defaults to False.
        disable (list of strings, optional): named pipeline elements to
            disable. Defaults to ["ner"]: Used with nlp.pipe(disable=disable).
            The list is only read, never modified.
        batch_size (int, optional): Number of texts to process in a batch.
            Defaults to 50.
        n_process (int, optional): Number of CPU processors to use.
            Defaults to -1 (meaning all CPU cores).
            NOTE(review): starting worker processes has a cost; for a handful
            of short texts n_process=1 may be faster - confirm for your data.

    Returns:
        list of lists of strings: one list of lowercased tokens (or lemmas)
        per input text, in the order the docs are yielded by nlp.pipe().
    """
    # The progress bar is purely cosmetic, so do not fail if tqdm is missing.
    try:
        # from tqdm.notebook import tqdm
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable, **kwargs):
            return iterable

    # A bare string would otherwise be iterated character by character.
    if isinstance(texts, str):
        texts = [texts]

    if nlp is None:
        # Imported here so the function does not depend on a notebook-level import
        import spacy

        nlp = spacy.load("en_core_web_sm")

    # nlp.pipe() is consumed lazily, so tqdm cannot infer a length from it;
    # pass the number of texts explicitly when the input has one.
    try:
        total = len(texts)
    except TypeError:
        total = None

    docs = nlp.pipe(texts, disable=disable, batch_size=batch_size, n_process=n_process)

    processed_texts = []
    for doc in tqdm(docs, total=total):
        tokens = []
        for token in doc:
            # Skip stopwords when stopword removal is requested
            if remove_stopwords and token.is_stop:
                continue
            # Skip punctuation and whitespace tokens when punctuation removal
            # is requested (both are controlled by remove_punct)
            if remove_punct and (token.is_punct or token.is_space):
                continue

            ## Determine final form of output list of tokens/lemmas
            kept_form = token.lemma_ if use_lemmas else token.text
            tokens.append(kept_form.lower())
        processed_texts.append(tokens)
    return processed_texts

In [31]:
# Default args will produce tokens
# The function returns one token list per input text, so take the first
# (and only) entry for our single sample text
tokens = batch_preprocess_texts([sample_text])
tokens = tokens[0]
print(tokens)

1it [00:08,  8.17s/it]

['running', 'central', 'park', 'noticed', 'discarded', 'mcdonald', 'container', 'surounded', 'buzzing', 'flies', 'annoying']





In [32]:
# Setting use_lemmas = True will produce lemmas
# Again take the first (and only) entry of the returned list
lemmas = batch_preprocess_texts([sample_text], use_lemmas=True)
lemmas = lemmas[0]
print(lemmas)

1it [00:07,  7.95s/it]

['run', 'central', 'park', 'notice', 'discard', 'mcdonald', 'container', 'surounde', 'buzz', 'fly', 'annoying']





In [33]:
# New, multi-sentence sample text for the sentence and entity examples.
# NOTE: this rebinds both sample_text and doc, replacing the earlier example.
sample_text = "While running in Central Park, I noticed that the constant buzzing of flies was annoying. However, I couldn't be too upset as they were likely attracted to the McDonald's food that someone carelessly dropped. I wondered, 'How can they be so uncaring?'"
doc = nlp(sample_text)

In [34]:
# Extracting sentences from doc: collect them into a list and count them
sentences = list(doc.sents)
len(sentences)

3

In [35]:
# Display the first sentence (index 0 of the list built from doc.sents)
sentences[0]

While running in Central Park, I noticed that the constant buzzing of flies was annoying.

In [36]:
# Print the text and label of every named entity found in the doc
for ent in doc.ents:
    print(ent.text, ent.label_)

Central Park LOC
McDonald ORG
