### spaCy, NLTK, Stanza

### Stemming, chunking, lemmatization, part-of-speech tagging

In [None]:
import spacy

In [None]:
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.chunk import RegexpParser

In [None]:
!pip install git+https://github.com/stanfordnlp/stanza.git

Collecting git+https://github.com/stanfordnlp/stanza.git
  Cloning https://github.com/stanfordnlp/stanza.git to /tmp/pip-req-build-rea67kdr
  Running command git clone --filter=blob:none --quiet https://github.com/stanfordnlp/stanza.git /tmp/pip-req-build-rea67kdr
  Resolved https://github.com/stanfordnlp/stanza.git to commit c530c9af647d521262b56b717bcc38b0cfc5f1b8
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting emoji (from stanza==1.5.0)
  Downloading emoji-2.8.0-py2.py3-none-any.whl (358 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m358.9/358.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: stanza
  Building wheel for stanza (setup.py) ... [?25l[?25hdone
  Created wheel for stanza: filename=stanza-1.5.0-py3-none-any.whl size=801154 sha256=97bcaac0e89ee9709852fe076300ad1e1171d6c2573d8a03200bbfce879409ba
  Stored in directory: /tmp/pip-ephem-wheel-cache-wnwfz0qj/wheels/de/ef/00/57f99c0d83685d3e4565844fda1faa

In [None]:
import stanza

In [None]:
# Single long sample sentence that every library below is run against.
# NOTE(review): "Qunetin" looks like a typo for "Quentin" -- left untouched
# because the recorded outputs below were produced from this exact text.
sentence = "From a little after two o’clock until almost sundown of the long still hot weary dead September afternoon they sat in what Miss Coldfield still called the office because her father had called it that–a dim hot airless room with the blinds all closed and fastened for forty-three summers because when she was a girl someone had believed that light and moving air carried heat and that dark was always cooler, and which (as the sun shone fuller and fuller on that side of the house) became latticed with yellow slashes full of dust motes which Qunetin thought of as being flecks of the dead old dried paint itself blown inward from the scaling blinds as wind might have blown them."

# Spacy

In [None]:
# Load spaCy's "en_core_web_sm" pipeline and run it over the sample sentence.
# `doc` is the parsed result that the following spaCy cells iterate over.
nlp1 = spacy.load("en_core_web_sm")
doc = nlp1(sentence)

In [None]:
# Collect the text of every noun chunk spaCy detects, followed by the text
# of every token whose coarse POS tag is exactly "VERB" (an equality test,
# not a substring test, since `pos_` holds a single tag).
chunks = [span.text for span in doc.noun_chunks]
chunks.extend(token.text for token in doc if token.pos_ == "VERB")
print("Chunks:", chunks)

Chunks: ['two o’clock', 'they', 'what', 'Miss Coldfield', 'the office', 'her father', 'it', 'that–a dim hot airless room', 'the blinds', 'all', 'forty-three summers', 'she', 'a girl', 'someone', 'light and moving air', 'heat', 'that dark', 'which', 'the sun', 'that side', 'the house', 'yellow slashes', 'dust motes', 'which', 'Qunetin', 'flecks', 'the dead old dried paint', 'itself', 'the scaling blinds', 'wind', 'them', 'sundown', 'sat', 'called', 'called', 'closed', 'fastened', 'believed', 'moving', 'carried', 'shone', 'latticed', 'thought', 'dried', 'blown', 'blown']


In [None]:
# Lemma of every token in the spaCy-parsed sentence, in document order
# (punctuation tokens are included).
lemmatized_tokens = [token.lemma_ for token in doc]
print("Lemmatized tokens:", lemmatized_tokens)

Lemmatized tokens: ['from', 'a', 'little', 'after', 'two', 'o’clock', 'until', 'almost', 'sundown', 'of', 'the', 'long', 'still', 'hot', 'weary', 'dead', 'September', 'afternoon', 'they', 'sit', 'in', 'what', 'Miss', 'Coldfield', 'still', 'call', 'the', 'office', 'because', 'her', 'father', 'have', 'call', 'it', 'that', '–', 'a', 'dim', 'hot', 'airless', 'room', 'with', 'the', 'blind', 'all', 'close', 'and', 'fasten', 'for', 'forty', '-', 'three', 'summer', 'because', 'when', 'she', 'be', 'a', 'girl', 'someone', 'have', 'believe', 'that', 'light', 'and', 'move', 'air', 'carry', 'heat', 'and', 'that', 'dark', 'be', 'always', 'cool', ',', 'and', 'which', '(', 'as', 'the', 'sun', 'shine', 'fuller', 'and', 'full', 'on', 'that', 'side', 'of', 'the', 'house', ')', 'became', 'lattice', 'with', 'yellow', 'slash', 'full', 'of', 'dust', 'mote', 'which', 'Qunetin', 'think', 'of', 'as', 'be', 'fleck', 'of', 'the', 'dead', 'old', 'dry', 'paint', 'itself', 'blow', 'inward', 'from', 'the', 'scaling',

In [None]:
# (text, coarse POS tag) pair for every token, read from spaCy's `pos_`
# attribute; the recorded output shows tags such as 'ADP', 'DET', 'NOUN'.
pos_tags = [(token.text, token.pos_) for token in doc]
print("POS tags:", pos_tags)

POS tags: [('From', 'ADP'), ('a', 'DET'), ('little', 'ADJ'), ('after', 'ADP'), ('two', 'NUM'), ('o’clock', 'NOUN'), ('until', 'ADP'), ('almost', 'ADV'), ('sundown', 'VERB'), ('of', 'ADP'), ('the', 'DET'), ('long', 'ADJ'), ('still', 'ADV'), ('hot', 'ADJ'), ('weary', 'ADJ'), ('dead', 'ADJ'), ('September', 'PROPN'), ('afternoon', 'NOUN'), ('they', 'PRON'), ('sat', 'VERB'), ('in', 'ADP'), ('what', 'PRON'), ('Miss', 'PROPN'), ('Coldfield', 'PROPN'), ('still', 'ADV'), ('called', 'VERB'), ('the', 'DET'), ('office', 'NOUN'), ('because', 'SCONJ'), ('her', 'PRON'), ('father', 'NOUN'), ('had', 'AUX'), ('called', 'VERB'), ('it', 'PRON'), ('that', 'DET'), ('–', 'PUNCT'), ('a', 'DET'), ('dim', 'ADJ'), ('hot', 'ADJ'), ('airless', 'NOUN'), ('room', 'NOUN'), ('with', 'ADP'), ('the', 'DET'), ('blinds', 'NOUN'), ('all', 'PRON'), ('closed', 'VERB'), ('and', 'CCONJ'), ('fastened', 'VERB'), ('for', 'ADP'), ('forty', 'NUM'), ('-', 'PUNCT'), ('three', 'NUM'), ('summers', 'NOUN'), ('because', 'SCONJ'), ('when'

### Stemming is not supported by spacy

## NLTK

In [None]:
# Fetch the NLTK data packages used by the cells below.
nltk.download('punkt')  # tokenizer data (tokenizers/punkt)
nltk.download('stopwords')  # stop-word corpus; not used by any cell visible here
nltk.download('wordnet')  # WordNet corpus for WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')  # tagger model for pos_tag

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Tokenize the sentence with NLTK; `tokens` is reused by the later NLTK cells.
tokens = word_tokenize(sentence)
# Reduce each token to its Porter stem (the recorded output shows the stems
# are lower-cased, e.g. 'Coldfield' -> 'coldfield').
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in tokens]

print("Stemmed tokens:", stemmed_tokens)

Stemmed tokens: ['from', 'a', 'littl', 'after', 'two', 'o', '’', 'clock', 'until', 'almost', 'sundown', 'of', 'the', 'long', 'still', 'hot', 'weari', 'dead', 'septemb', 'afternoon', 'they', 'sat', 'in', 'what', 'miss', 'coldfield', 'still', 'call', 'the', 'offic', 'becaus', 'her', 'father', 'had', 'call', 'it', 'that–a', 'dim', 'hot', 'airless', 'room', 'with', 'the', 'blind', 'all', 'close', 'and', 'fasten', 'for', 'forty-thre', 'summer', 'becaus', 'when', 'she', 'wa', 'a', 'girl', 'someon', 'had', 'believ', 'that', 'light', 'and', 'move', 'air', 'carri', 'heat', 'and', 'that', 'dark', 'wa', 'alway', 'cooler', ',', 'and', 'which', '(', 'as', 'the', 'sun', 'shone', 'fuller', 'and', 'fuller', 'on', 'that', 'side', 'of', 'the', 'hous', ')', 'becam', 'lattic', 'with', 'yellow', 'slash', 'full', 'of', 'dust', 'mote', 'which', 'qunetin', 'thought', 'of', 'as', 'be', 'fleck', 'of', 'the', 'dead', 'old', 'dri', 'paint', 'itself', 'blown', 'inward', 'from', 'the', 'scale', 'blind', 'as', 'wind

In [None]:
# Noun-phrase chunking with a regular-expression grammar over POS tags:
# an optional determiner, any number of adjectives, then a singular noun.
#
# The grammar is written in Penn Treebank tags (DT, JJ, NN), so it has to be
# fed NLTK's own tagger output. When the notebook is run top to bottom,
# `pos_tags` still holds spaCy's Universal POS tags (DET, ADJ, NOUN) here,
# which this grammar can never match -- the resulting tree then contains no
# NP chunk at all. Tag the NLTK tokens locally instead.
chunk_grammar = r"""NP: {<DT>?<JJ>*<NN>}"""
chunk_parser = RegexpParser(chunk_grammar)
nltk_tagged_tokens = pos_tag(tokens)
chunks = chunk_parser.parse(nltk_tagged_tokens)

print("Chunks:", chunks)

Chunks: (S
  From/ADP
  a/DET
  little/ADJ
  after/ADP
  two/NUM
  o’clock/NOUN
  until/ADP
  almost/ADV
  sundown/VERB
  of/ADP
  the/DET
  long/ADJ
  still/ADV
  hot/ADJ
  weary/ADJ
  dead/ADJ
  September/PROPN
  afternoon/NOUN
  they/PRON
  sat/VERB
  in/ADP
  what/PRON
  Miss/PROPN
  Coldfield/PROPN
  still/ADV
  called/VERB
  the/DET
  office/NOUN
  because/SCONJ
  her/PRON
  father/NOUN
  had/AUX
  called/VERB
  it/PRON
  that/DET
  –/PUNCT
  a/DET
  dim/ADJ
  hot/ADJ
  airless/NOUN
  room/NOUN
  with/ADP
  the/DET
  blinds/NOUN
  all/PRON
  closed/VERB
  and/CCONJ
  fastened/VERB
  for/ADP
  forty/NUM
  -/PUNCT
  three/NUM
  summers/NOUN
  because/SCONJ
  when/SCONJ
  she/PRON
  was/AUX
  a/DET
  girl/NOUN
  someone/PRON
  had/AUX
  believed/VERB
  that/SCONJ
  light/NOUN
  and/CCONJ
  moving/VERB
  air/NOUN
  carried/VERB
  heat/NOUN
  and/CCONJ
  that/DET
  dark/NOUN
  was/AUX
  always/ADV
  cooler/ADJ
  ,/PUNCT
  and/CCONJ
  which/PRON
  (/PUNCT
  as/SCONJ
  the/DET
  sun/NOU

In [None]:
# WordNetLemmatizer.lemmatize() assumes a noun unless a POS is given, which
# leaves verbs untouched ('called', 'believed') and mangles function words
# ('was' -> 'wa', 'as' -> 'a'). Derive the WordNet POS from NLTK's Penn
# Treebank tag and lemmatize only content words; everything else (pronouns,
# prepositions, punctuation, ...) is passed through unchanged.
penn_prefix_to_wordnet_pos = {
    "NN": "n",  # nouns
    "VB": "v",  # verbs
    "JJ": "a",  # adjectives
    "RB": "r",  # adverbs
}

lemmatizer = WordNetLemmatizer()
lemmatized_tokens = []
for token, penn_tag in pos_tag(tokens):
    wordnet_pos = penn_prefix_to_wordnet_pos.get(penn_tag[:2])
    if wordnet_pos is None:
        lemmatized_tokens.append(token)
    else:
        lemmatized_tokens.append(lemmatizer.lemmatize(token, wordnet_pos))

print("Lemmatized tokens:", lemmatized_tokens)

Lemmatized tokens: ['From', 'a', 'little', 'after', 'two', 'o', '’', 'clock', 'until', 'almost', 'sundown', 'of', 'the', 'long', 'still', 'hot', 'weary', 'dead', 'September', 'afternoon', 'they', 'sat', 'in', 'what', 'Miss', 'Coldfield', 'still', 'called', 'the', 'office', 'because', 'her', 'father', 'had', 'called', 'it', 'that–a', 'dim', 'hot', 'airless', 'room', 'with', 'the', 'blind', 'all', 'closed', 'and', 'fastened', 'for', 'forty-three', 'summer', 'because', 'when', 'she', 'wa', 'a', 'girl', 'someone', 'had', 'believed', 'that', 'light', 'and', 'moving', 'air', 'carried', 'heat', 'and', 'that', 'dark', 'wa', 'always', 'cooler', ',', 'and', 'which', '(', 'a', 'the', 'sun', 'shone', 'fuller', 'and', 'fuller', 'on', 'that', 'side', 'of', 'the', 'house', ')', 'became', 'latticed', 'with', 'yellow', 'slash', 'full', 'of', 'dust', 'mote', 'which', 'Qunetin', 'thought', 'of', 'a', 'being', 'fleck', 'of', 'the', 'dead', 'old', 'dried', 'paint', 'itself', 'blown', 'inward', 'from', 'the

In [None]:
# Tag the NLTK tokens with nltk.pos_tag; the recorded output shows Penn
# Treebank style tags such as 'IN', 'DT', 'JJ'.
# NOTE: this rebinds `pos_tags`, which held the spaCy (text, tag) pairs.
pos_tags = pos_tag(tokens)

print("POS tags:", pos_tags)

POS tags: [('From', 'IN'), ('a', 'DT'), ('little', 'JJ'), ('after', 'IN'), ('two', 'CD'), ('o', 'JJ'), ('’', 'FW'), ('clock', 'NN'), ('until', 'IN'), ('almost', 'RB'), ('sundown', 'VBN'), ('of', 'IN'), ('the', 'DT'), ('long', 'JJ'), ('still', 'RB'), ('hot', 'JJ'), ('weary', 'JJ'), ('dead', 'JJ'), ('September', 'NNP'), ('afternoon', 'NN'), ('they', 'PRP'), ('sat', 'VBD'), ('in', 'IN'), ('what', 'WP'), ('Miss', 'NNP'), ('Coldfield', 'NNP'), ('still', 'RB'), ('called', 'VBD'), ('the', 'DT'), ('office', 'NN'), ('because', 'IN'), ('her', 'PRP$'), ('father', 'NN'), ('had', 'VBD'), ('called', 'VBN'), ('it', 'PRP'), ('that–a', 'JJ'), ('dim', 'JJ'), ('hot', 'JJ'), ('airless', 'NN'), ('room', 'NN'), ('with', 'IN'), ('the', 'DT'), ('blinds', 'NNS'), ('all', 'DT'), ('closed', 'VBD'), ('and', 'CC'), ('fastened', 'VBD'), ('for', 'IN'), ('forty-three', 'JJ'), ('summers', 'NNS'), ('because', 'IN'), ('when', 'WRB'), ('she', 'PRP'), ('was', 'VBD'), ('a', 'DT'), ('girl', 'NN'), ('someone', 'NN'), ('had',

## Stanza

In [None]:
# Download Stanza's default English models.
stanza.download('en')

# Build the default English pipeline; per the log below it loads tokenize,
# pos, lemma, constituency, depparse, sentiment and ner.
nlp3 = stanza.Pipeline('en')

# NOTE: this rebinds `doc`, replacing the spaCy result from the earlier cells.
doc = nlp3(sentence)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.5.0/models/default.zip:   0%|          | 0…

INFO:stanza:Finished downloading models and saved to /root/stanza_resources.
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| constituency | wsj       |
| depparse     | combined  |
| sentiment    | sstplus   |
| ner          | ontonotes |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: constituency
INFO:stanza:Loading: depparse
INFO:stanza:Loading: sentiment
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


In [None]:
# Lemma of every word, flattened across all sentences Stanza split the text into.
lemmatized_tokens = [word.lemma for sent in doc.sentences for word in sent.words]

print("Lemmatized tokens:", lemmatized_tokens)

Lemmatized tokens: ['from', 'a', 'little', 'after', 'two', "o'clock", 'until', 'almost', 'sundown', 'of', 'the', 'long', 'still', 'hot', 'weary', 'dead', 'September', 'afternoon', 'they', 'sit', 'in', 'what', 'Miss', 'Coldfield', 'still', 'call', 'the', 'office', 'because', 'she', 'father', 'have', 'call', 'it', 'that', '–a', 'dim', 'hot', 'airless', 'room', 'with', 'the', 'blind', 'all', 'close', 'and', 'fasten', 'for', 'forty', '-', 'three', 'summer', 'because', 'when', 'she', 'be', 'a', 'girl', 'someone', 'have', 'believe', 'that', 'light', 'and', 'move', 'air', 'carry', 'heat', 'and', 'that', 'dark', 'be', 'always', 'cool', ',', 'and', 'which', '(', 'as', 'the', 'sun', 'shine', 'full', 'and', 'full', 'on', 'that', 'side', 'of', 'the', 'house', ')', 'become', 'latticed', 'with', 'yellow', 'slash', 'full', 'of', 'dust', 'mote', 'which', 'Qunetin', 'think', 'of', 'as', 'be', 'fleck', 'of', 'the', 'dead', 'old', 'dry', 'paint', 'itself', 'blow', 'inward', 'from', 'the', 'scaling', 'bli

In [None]:
# "text/UPOS" string for every word, flattened across all Stanza sentences.
# NOTE: unlike the spaCy and NLTK cells, these are strings, not (text, tag) tuples.
pos_tags = [f"{word.text}/{word.upos}" for sent in doc.sentences for word in sent.words]

print("POS tags:", pos_tags)

POS tags: ['From/ADP', 'a/DET', 'little/ADJ', 'after/ADP', 'two/NUM', 'o’clock/NOUN', 'until/ADP', 'almost/ADV', 'sundown/NOUN', 'of/ADP', 'the/DET', 'long/ADJ', 'still/ADV', 'hot/ADJ', 'weary/ADJ', 'dead/ADJ', 'September/PROPN', 'afternoon/NOUN', 'they/PRON', 'sat/VERB', 'in/ADP', 'what/PRON', 'Miss/PROPN', 'Coldfield/PROPN', 'still/ADV', 'called/VERB', 'the/DET', 'office/NOUN', 'because/SCONJ', 'her/PRON', 'father/NOUN', 'had/AUX', 'called/VERB', 'it/PRON', 'that/SCONJ', '–a/DET', 'dim/ADJ', 'hot/ADJ', 'airless/ADJ', 'room/NOUN', 'with/ADP', 'the/DET', 'blinds/NOUN', 'all/DET', 'closed/VERB', 'and/CCONJ', 'fastened/VERB', 'for/ADP', 'forty/NUM', '-/PUNCT', 'three/NUM', 'summers/NOUN', 'because/SCONJ', 'when/ADV', 'she/PRON', 'was/AUX', 'a/DET', 'girl/NOUN', 'someone/PRON', 'had/AUX', 'believed/VERB', 'that/SCONJ', 'light/NOUN', 'and/CCONJ', 'moving/VERB', 'air/NOUN', 'carried/VERB', 'heat/NOUN', 'and/CCONJ', 'that/DET', 'dark/NOUN', 'was/AUX', 'always/ADV', 'cooler/ADJ', ',/PUNCT', '

### stemming and chunking are not supported by stanza

# In conclusion, spaCy and Stanza can process longer sentences compared to NLTK