In [1]:
#Using spacy to load the model
import spacy
nlp = spacy.load("en_core_web_sm")

<spacy.lang.en.English at 0x21dab820730>

In [2]:
# Create a Doc object
introduction_do = nlp(
    "This tutorial is about Natural Language Processing in spaCy."
)
type(introduction_do)


[token.text for token in introduction_do]

['This',
 'tutorial',
 'is',
 'about',
 'Natural',
 'Language',
 'Processing',
 'in',
 'spaCy',
 '.']

In [3]:
# Load a text file and convert it to a spaCy document
import pathlib
file_name = "textfile.txt"
introduction_doc = nlp(pathlib.Path(file_name).read_text(encoding="utf-8"))
print ([token.text for token in introduction_doc])

['The', 'shimmering', 'moonlight', 'danced', 'across', 'the', 'tranquil', 'lake', ',', 'casting', 'ethereal', 'reflections', 'on', 'the', 'water', "'s", 'surface', '.', 'Whispers', 'of', 'the', 'night', 'breeze', 'rustled', 'through', 'the', 'tall', 'pines', ',', 'creating', 'a', 'symphony', 'of', 'nature', "'s", 'lullaby', '.', 'Fireflies', 'flickered', 'like', 'stars', ',', 'weaving', 'a', 'tapestry', 'of', 'light', '.', 'A', 'sense', 'of', 'peace', 'enveloped', 'the', 'forest', ',', 'serene', 'and', 'timeless', 'danced', '.']


In [4]:
# Tokenization
for token in introduction_doc:
    print (token, token.idx)

The 0
shimmering 4
moonlight 15
danced 25
across 32
the 39
tranquil 43
lake 52
, 56
casting 58
ethereal 66
reflections 75
on 87
the 90
water 94
's 99
surface 102
. 109
Whispers 111
of 120
the 123
night 127
breeze 133
rustled 140
through 148
the 156
tall 160
pines 165
, 170
creating 172
a 181
symphony 183
of 192
nature 195
's 201
lullaby 204
. 211
Fireflies 213
flickered 223
like 233
stars 238
, 243
weaving 245
a 253
tapestry 255
of 264
light 267
. 272
A 274
sense 276
of 282
peace 285
enveloped 291
the 301
forest 305
, 311
serene 313
and 320
timeless 324
danced 333
. 339


In [5]:
#Stop Words Removal

spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
print(len(spacy_stopwords))

for stop_word in list(spacy_stopwords)[:10]:
    print(stop_word)

326
hereafter
part
take
however
even
such
neither
across
say
several


In [6]:
#Stop Words Removal
filtered_tokens = [token for token in introduction_doc if not token.is_stop]
print(filtered_tokens)


[shimmering, moonlight, danced, tranquil, lake, ,, casting, ethereal, reflections, water, surface, ., Whispers, night, breeze, rustled, tall, pines, ,, creating, symphony, nature, lullaby, ., Fireflies, flickered, like, stars, ,, weaving, tapestry, light, ., sense, peace, enveloped, forest, ,, serene, timeless, danced, .]


In [7]:
#leminization
for token in filtered_tokens:
    # if str(token) != str(token.lemma_):
    print(f"{str(token):>20} : {str(token.lemma_)}")

          shimmering : shimmering
           moonlight : moonlight
              danced : dance
            tranquil : tranquil
                lake : lake
                   , : ,
             casting : cast
            ethereal : ethereal
         reflections : reflection
               water : water
             surface : surface
                   . : .
            Whispers : whisper
               night : night
              breeze : breeze
             rustled : rustle
                tall : tall
               pines : pine
                   , : ,
            creating : create
            symphony : symphony
              nature : nature
             lullaby : lullaby
                   . : .
           Fireflies : firefly
           flickered : flicker
                like : like
               stars : star
                   , : ,
             weaving : weave
            tapestry : tapestry
               light : light
                   . : .
               sense : sense
    

In [9]:
from typing import Counter


words = [
    token.text
    for token in filtered_tokens
    if not token.is_stop and not token.is_punct
]

print(Counter(words).most_common())

[('danced', 2), ('shimmering', 1), ('moonlight', 1), ('tranquil', 1), ('lake', 1), ('casting', 1), ('ethereal', 1), ('reflections', 1), ('water', 1), ('surface', 1), ('Whispers', 1), ('night', 1), ('breeze', 1), ('rustled', 1), ('tall', 1), ('pines', 1), ('creating', 1), ('symphony', 1), ('nature', 1), ('lullaby', 1), ('Fireflies', 1), ('flickered', 1), ('like', 1), ('stars', 1), ('weaving', 1), ('tapestry', 1), ('light', 1), ('sense', 1), ('peace', 1), ('enveloped', 1), ('forest', 1), ('serene', 1), ('timeless', 1)]
