spaCy is a free, open-source library for NLP in Python written in Cython. spaCy is designed to make it easy to build systems for information extraction or general-purpose natural language processing.

In [10]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [11]:
import spacy
nlp = spacy.load("en_core_web_sm")
nlp

<spacy.lang.en.English at 0x1fe86d75340>

In [12]:
introduction_doc = nlp(
"To start processing your input, you construct a Doc object. A Doc object is a sequence of Token objects representing a lexical token. Each Token object has information about a particular piece—typically one word—of text. You can instantiate a Doc object by calling the Language object with the input string as an argument")
type(introduction_doc)


[token.text for token in introduction_doc]

['To',
 'start',
 'processing',
 'your',
 'input',
 ',',
 'you',
 'construct',
 'a',
 'Doc',
 'object',
 '.',
 'A',
 'Doc',
 'object',
 'is',
 'a',
 'sequence',
 'of',
 'Token',
 'objects',
 'representing',
 'a',
 'lexical',
 'token',
 '.',
 'Each',
 'Token',
 'object',
 'has',
 'information',
 'about',
 'a',
 'particular',
 'piece',
 '—',
 'typically',
 'one',
 'word',
 '—',
 'of',
 'text',
 '.',
 'You',
 'can',
 'instantiate',
 'a',
 'Doc',
 'object',
 'by',
 'calling',
 'the',
 'Language',
 'object',
 'with',
 'the',
 'input',
 'string',
 'as',
 'an',
 'argument']

In [13]:
import pathlib
file_name = "intro.txt"
introduction_doc = nlp(pathlib.Path(file_name).read_text(encoding="utf-8"))
print ([token.text for token in introduction_doc])

['Lemmatization', 'is', 'the', 'process', 'of', 'reducing', 'inflected', 'forms', 'of', 'a', 'word', 'while', 'still', 'ensuring', 'that', 'the', 'reduced', 'form', 'belongs', 'to', 'the', 'language', '.', 'This', 'reduced', 'form', ',', 'or', 'root', 'word', ',', 'is', 'called', 'a', 'lemma', '.', '\n\n', 'For', 'example', ',', 'organizes', ',', 'organized', 'and', 'organizing', 'are', 'all', 'forms', 'of', 'organize', '.', 'Here', ',', 'organize', 'is', 'the', 'lemma', '.', 'The', 'inflection', 'of', 'a', 'word', 'allows', 'you', 'to', 'express', 'different', 'grammatical', 'categories', ',', 'like', 'tense', '(', 'organized', 'vs', 'organize', ')', ',', 'number', '(', 'trains', 'vs', 'train', ')', ',', 'and', 'so', 'on', '.', 'Lemmatization', 'is', 'necessary', 'because', 'it', 'helps', 'you', 'reduce', 'the', 'inflected', 'forms', 'of', 'a', 'word', 'so', 'that', 'they', 'can', 'be', 'analyzed', 'as', 'a', 'single', 'item', '.', 'It', 'can', 'also', 'help', 'you', 'normalize', 'the

In [14]:
nlp = spacy.load("en_core_web_sm")
about_doc = nlp(introduction_doc)
print([token for token in about_doc if not token.is_stop])


[Lemmatization, process, reducing, inflected, forms, word, ensuring, reduced, form, belongs, language, ., reduced, form, ,, root, word, ,, called, lemma, ., 

, example, ,, organizes, ,, organized, organizing, forms, organize, ., ,, organize, lemma, ., inflection, word, allows, express, different, grammatical, categories, ,, like, tense, (, organized, vs, organize, ), ,, number, (, trains, vs, train, ), ,, ., Lemmatization, necessary, helps, reduce, inflected, forms, word, analyzed, single, item, ., help, normalize, text, .]


In [15]:
for token in about_doc:
    if str(token) != str(token.lemma_):
        print(f"{str(token):>20} : {str(token.lemma_)}")


       Lemmatization : lemmatization
                  is : be
            reducing : reduce
           inflected : inflect
               forms : form
            ensuring : ensure
             reduced : reduce
             belongs : belong
                This : this
                  is : be
              called : call
                 For : for
           organizes : organize
           organized : organize
                 are : be
               forms : form
                Here : here
                  is : be
                 The : the
              allows : allow
          categories : category
           organized : organize
              trains : train
       Lemmatization : lemmatization
                  is : be
               helps : help
               forms : form
            analyzed : analyze
                  It : it


In [18]:
from collections import Counter
words = [
    token.text
    for token in about_doc
    if not token.is_stop and not token.is_punct
]
print(Counter(words).most_common(5))


[('word', 4), ('forms', 3), ('organize', 3), ('Lemmatization', 2), ('inflected', 2)]


In [19]:
Counter(
    [token.text for token in about_doc if not token.is_punct]
).most_common(5)

[('the', 6), ('of', 5), ('a', 5), ('is', 4), ('word', 4)]

In [20]:
for token in about_doc:
    print(
        f"""
TOKEN: {str(token)}
=====
TAG: {str(token.tag_):10} POS: {token.pos_}
EXPLANATION: {spacy.explain(token.tag_)}"""
    )


TOKEN: Lemmatization
=====
TAG: NN         POS: NOUN
EXPLANATION: noun, singular or mass

TOKEN: is
=====
TAG: VBZ        POS: AUX
EXPLANATION: verb, 3rd person singular present

TOKEN: the
=====
TAG: DT         POS: DET
EXPLANATION: determiner

TOKEN: process
=====
TAG: NN         POS: NOUN
EXPLANATION: noun, singular or mass

TOKEN: of
=====
TAG: IN         POS: ADP
EXPLANATION: conjunction, subordinating or preposition

TOKEN: reducing
=====
TAG: VBG        POS: VERB
EXPLANATION: verb, gerund or present participle

TOKEN: inflected
=====
TAG: VBN        POS: VERB
EXPLANATION: verb, past participle

TOKEN: forms
=====
TAG: NNS        POS: NOUN
EXPLANATION: noun, plural

TOKEN: of
=====
TAG: IN         POS: ADP
EXPLANATION: conjunction, subordinating or preposition

TOKEN: a
=====
TAG: DT         POS: DET
EXPLANATION: determiner

TOKEN: word
=====
TAG: NN         POS: NOUN
EXPLANATION: noun, singular or mass

TOKEN: while
=====
TAG: IN         POS: SCONJ
EXPLANATION: conjunction, sub

In [24]:
nouns = []
adjectives = []
for token in about_doc:
    if token.pos_ == "NOUN":
        nouns.append(token)
    if token.pos_ == "ADJ":
        adjectives.append(token)


nouns

adjectives


[reduced, different, grammatical, tense, necessary, inflected, single]

In [25]:
from spacy import displacy

displacy.serve(about_doc, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [13/Aug/2024 11:09:06] "GET / HTTP/1.1" 200 84987
127.0.0.1 - - [13/Aug/2024 11:09:06] "GET /favicon.ico HTTP/1.1" 200 84987


Shutting down server on port 5000.


In [26]:
def is_token_allowed(token):
    return bool(
        token
        and str(token).strip()
        and not token.is_stop
        and not token.is_punct
    )

def preprocess_token(token):
    return token.lemma_.strip().lower()

complete_filtered_tokens = [
    preprocess_token(token)
    for token in about_doc
    if is_token_allowed(token)
]

complete_filtered_tokens

['lemmatization',
 'process',
 'reduce',
 'inflect',
 'form',
 'word',
 'ensure',
 'reduce',
 'form',
 'belong',
 'language',
 'reduced',
 'form',
 'root',
 'word',
 'call',
 'lemma',
 'example',
 'organize',
 'organize',
 'organizing',
 'form',
 'organize',
 'organize',
 'lemma',
 'inflection',
 'word',
 'allow',
 'express',
 'different',
 'grammatical',
 'category',
 'like',
 'tense',
 'organize',
 'vs',
 'organize',
 'number',
 'train',
 'vs',
 'train',
 'lemmatization',
 'necessary',
 'help',
 'reduce',
 'inflected',
 'form',
 'word',
 'analyze',
 'single',
 'item',
 'help',
 'normalize',
 'text']