# Analyzing and Processing Text With spaCy

# spaCy
spaCy is an open-source natural language processing library for Python. It is designed specifically for production use, and it helps us build applications that process massive volumes of text efficiently.

In [1]:
!pip install spacy
!python -m spacy download en

Collecting spacy
  Downloading spacy-3.2.4-cp39-cp39-win_amd64.whl (11.3 MB)
Collecting spacy-legacy<3.1.0,>=3.0.8
  Downloading spacy_legacy-3.0.9-py2.py3-none-any.whl (20 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.6-cp39-cp39-win_amd64.whl (21 kB)
Collecting thinc<8.1.0,>=8.0.12
  Downloading thinc-8.0.15-cp39-cp39-win_amd64.whl (1.0 MB)
Collecting pathy>=0.3.5
  Downloading pathy-0.6.1-py3-none-any.whl (42 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.7-py3-none-any.whl (17 kB)
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.6-cp39-cp39-win_amd64.whl (36 kB)
Collecting typer<0.5.0,>=0.3.0
  Downloading typer-0.4.1-py3-none-any.whl (27 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.2-py3-none-any.whl (7.2 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.6-cp39-cp39-win_amd64.whl (112 kB)
Collecting blis<0.8.0,>=0.4.0
  Downloading blis-0.7.7-cp39-cp39-win_amd64.whl (6.6 MB)
Collecting lan

Tokenizing the Text

In [3]:
# Word tokenization

from spacy.lang.en import English

# A blank English pipeline: no trained components are loaded here, and
# splitting text into tokens only needs the language's tokenizer rules.
nlp = English()

text = """When learning data science, you shouldn't grt discouraged! challenges and setbacks aren't failures, they're just part of the journey."""

# Calling the pipeline on raw text yields a document that can be iterated
# token by token.
my_doc = nlp(text)

# Collect the surface text of every token in order.
token_list = [tok.text for tok in my_doc]

print(token_list)

['When', 'learning', 'data', 'science', ',', 'you', 'should', "n't", 'grt', 'discouraged', '!', 'challenges', 'and', 'setbacks', 'are', "n't", 'failures', ',', 'they', "'re", 'just', 'part', 'of', 'the', 'journey', '.']


Sentence Tokenization

In [4]:
# Start again from a blank English pipeline.
nlp = English()

# Sentence boundaries are produced by the 'sentencizer' component, so it has
# to be registered on the pipeline before any text is processed.
nlp.add_pipe('sentencizer')

text = """When learning data science, you shouldn't get discouraged! Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

# Run the pipeline; the resulting document exposes its sentences via `.sents`.
doc = nlp(text)

# Gather the text of each detected sentence.
sents_list = [sentence.text for sentence in doc.sents]

print(sents_list)

["When learning data science, you shouldn't get discouraged!", "Challenges and setbacks aren't failures, they're just part of the journey.", "You've got this!"]


# Cleaning Text Data: Removing Stopwords

In [5]:
# Stop words

# spaCy's built-in collection of English stop words.
from spacy.lang.en.stop_words import STOP_WORDS

# Report how many stop words the collection contains.
stop_word_count = len(STOP_WORDS)
print('Number of stop words: %d' % stop_word_count)

# Show twenty of them, taken from the front of the collection once it is
# converted to a list.
stop_word_sample = list(STOP_WORDS)[:20]
print('First 20 stop words: %s' % stop_word_sample)

Number of stop words: 326
First 20 stop words: ['have', 'whither', 'the', 'whoever', 'never', 'while', 'side', 'several', 'via', 'as', 'our', 'behind', 'almost', 'no', 'full', 'up', 'alone', 'mostly', 'upon', 'quite']


Removing Stopwords from Our Data

In [6]:
from spacy.lang.en import English

# Blank English pipeline; the stop-word and punctuation flags used below are
# read straight from each token.
nlp = English()

# Process `text`, which must already have been defined by an earlier cell.
doc = nlp(text)

# Keep only the tokens that are neither stop words nor punctuation.
filtered_tokens = [tok for tok in doc if not (tok.is_stop or tok.is_punct)]

print("Filtered Sentence:",filtered_tokens)

Filtered Sentence: [learning, data, science, discouraged, Challenges, setbacks, failures, journey, got]


Lexicon Normalization: Lexicon normalization is another step in the text data cleaning process. In the big picture, normalization converts high-dimensional features into low-dimensional features that are appropriate for any machine learning model. For our purposes here, we’re only going to look at lemmatization, a way of processing words that reduces them to their base (dictionary) forms, known as lemmas; for example, "runs" and "running" both become "run".

# Lemmatization

In [7]:
# en_core_web_sm is the English model package (vocabulary, syntax & entities);
# it must be installed before this import can succeed.
import en_core_web_sm

# Build the pipeline object from the installed model package.
nlp = en_core_web_sm.load()

# Process several forms of "run" so their lemmas can be compared.
lem = nlp("run runs running runner")

# Print every token next to the lemma assigned to it.
for tok in lem:
    print(tok.text,"==>" ,tok.lemma_)

run ==> run
runs ==> run
running ==> run
runner ==> runner


In [8]:
# en_core_web_sm is the English model package (vocabulary, syntax & entities).
import en_core_web_sm

# Build the pipeline object from the installed model package.
nlp = en_core_web_sm.load()

text = """When learning data science, you shouldn't get discouraged! Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

# Run the full pipeline over the sample text.
doc = nlp(text)

# Step 1: drop stop words and punctuation, keeping the token objects.
filtered_tokens = [tok for tok in doc if not (tok.is_stop or tok.is_punct)]

print("Filtered Tokens:",filtered_tokens)

# Step 2: replace each remaining token with its lemma string.
normalized_tokens = [tok.lemma_ for tok in filtered_tokens]

print("Lemmatized Tokens:",normalized_tokens)

Filtered Tokens: [learning, data, science, discouraged, Challenges, setbacks, failures, journey, got]
Lemmatized Tokens: ['learn', 'datum', 'science', 'discourage', 'challenge', 'setback', 'failure', 'journey', 'get']


# Part of Speech (POS) Tagging

In [9]:
# Part-of-speech tagging

# en_core_web_sm is the English model package (vocabulary, syntax & entities).
import en_core_web_sm

# Build the pipeline object from the installed model package.
nlp = en_core_web_sm.load()

# Process a short sentence in which "well" appears twice.
docs = nlp(u"All is well that ends well.")

# Print each token together with its part-of-speech label.
for tok in docs:
    print(tok.text,tok.pos_)

All PRON
is AUX
well ADJ
that PRON
ends VERB
well ADV
. PUNCT
