In [1]:
# Why do we need stopwords and more importantly what are stopwords?
# Words that carry little to no meaningful information are called stopwords. Eg: is, was, have, has, I, They etc..
# Since these words appear frequently in text but don't contribute much to the actual meaning, many NLP applications, 
# such as search engines or text analysis tools, remove them to focus on more meaningful content.

In [2]:
# the complete works of william shakespeare (https://www.gutenberg.org/)
paragraph = """When I consider everything that grows
            Holds in perfection but a little moment.
            That this huge stage presenteth nought but shows
            Whereon the stars in secret influence comment.
            When I perceive that men as plants increase,
            Cheered and checked even by the self-same sky:
            Vaunt in their youthful sap, at height decrease,
            And wear their brave state out of memory.z
            Then the conceit of this inconstant stay,
            Sets you most rich in youth before my sight,
            Where wasteful Time debateth with Decay
            To change your day of youth to sullied night,
            And all in war with Time for love of you,
            As he takes from you, I engraft you new.
            But wherefore do not you a mightier way
            Make war upon this bloody tyrant Time?
            And fortify yourself in your decay
            With means more blessed than my barren rhyme?
            Now stand you on the top of happy hours,
            And many maiden gardens yet unset,
            With virtuous wish would bear you living flowers,
            Much liker than your painted counterfeit:
            So should the lines of life that life repair
            Which this (Time’s pencil) or my pupil pen
            Neither in inward worth nor outward fair
            Can make you live yourself in eyes of men."""

In [3]:
# import from nltk library
from nltk.corpus import stopwords

In [11]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [7]:
# lets look at the stopwords. we have different stopwords for different languages. 
print("stopwords in english language: \n", stopwords.words("english"))

stopwords in english language: 
 ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only

In [9]:
# german
print("stopwords in english language: \n", stopwords.words("german"))

stopwords in english language: 
 ['aber', 'alle', 'allem', 'allen', 'aller', 'alles', 'als', 'also', 'am', 'an', 'ander', 'andere', 'anderem', 'anderen', 'anderer', 'anderes', 'anderm', 'andern', 'anderr', 'anders', 'auch', 'auf', 'aus', 'bei', 'bin', 'bis', 'bist', 'da', 'damit', 'dann', 'der', 'den', 'des', 'dem', 'die', 'das', 'dass', 'daß', 'derselbe', 'derselben', 'denselben', 'desselben', 'demselben', 'dieselbe', 'dieselben', 'dasselbe', 'dazu', 'dein', 'deine', 'deinem', 'deinen', 'deiner', 'deines', 'denn', 'derer', 'dessen', 'dich', 'dir', 'du', 'dies', 'diese', 'diesem', 'diesen', 'dieser', 'dieses', 'doch', 'dort', 'durch', 'ein', 'eine', 'einem', 'einen', 'einer', 'eines', 'einig', 'einige', 'einigem', 'einigen', 'einiger', 'einiges', 'einmal', 'er', 'ihn', 'ihm', 'es', 'etwas', 'euer', 'eure', 'eurem', 'euren', 'eurer', 'eures', 'für', 'gegen', 'gewesen', 'hab', 'habe', 'haben', 'hat', 'hatte', 'hatten', 'hier', 'hin', 'hinter', 'ich', 'mich', 'mir', 'ihr', 'ihre', 'ihrem'

In [13]:
# consider the paragraph -> filter the stopwords -> perform stemming
# convert paragraph into sentences
sentences = nltk.sent_tokenize(paragraph)
print(sentences)

['When I consider everything that grows\n            Holds in perfection but a little moment.', 'That this huge stage presenteth nought but shows\n            Whereon the stars in secret influence comment.', 'When I perceive that men as plants increase,\n            Cheered and checked even by the self-same sky:\n            Vaunt in their youthful sap, at height decrease,\n            And wear their brave state out of memory.', 'Then the conceit of this inconstant stay,\n            Sets you most rich in youth before my sight,\n            Where wasteful Time debateth with Decay\n            To change your day of youth to sullied night,\n            And all in war with Time for love of you,\n            As he takes from you, I engraft you new.', 'But wherefore do not you a mightier way\n            Make war upon this bloody tyrant Time?', 'And fortify yourself in your decay\n            With means more blessed than my barren rhyme?', 'Now stand you on the top of happy hours,\n        

In [14]:
# import stemmer
from nltk.stem import SnowballStemmer

In [15]:
snowball_stemmer = SnowballStemmer(language="english")

In [17]:
# main task
for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words = [snowball_stemmer.stem(word) for word in words if word not in set(stopwords.words("english"))]
    sentences[i] = ' '.join(words)

print(sentences)
# NOTICE: no captial letters. 

['when i consid everyth grow hold perfect littl moment .', 'that huge stage presenteth nought show whereon star secret influenc comment .', 'when i perceiv men plant increas , cheer check even self-sam sky : vaunt youth sap , height decreas , and wear brave state memori .', 'then conceit inconst stay , set rich youth sight , where wast time debateth decay to chang day youth sulli night , and war time love , as take , i engraft new .', 'but wherefor mightier way make war upon bloodi tyrant time ?', 'and fortifi decay with mean bless barren rhyme ?', 'now stand top happi hour , and mani maiden garden yet unset , with virtuous wish would bear live flower , much liker paint counterfeit : so line life life repair which ( time ’ pencil ) pupil pen neither inward worth outward fair can make live eye men .']


In [18]:
# obviously, there will be words with no meaning due to the stemming approach.
# we can also work with lemmatization
from nltk.stem import WordNetLemmatizer

In [19]:
lemmatizer = WordNetLemmatizer()

In [20]:
# intialise the sentences, once again
sentences = nltk.sent_tokenize(paragraph)

In [22]:
# main task
for i in range(len(sentences)):
#     sentences[i] = sentences[i].lower() # if you want to work with lower case of words
    words = nltk.word_tokenize(sentences[i])
    words = [lemmatizer.lemmatize(word, pos='v') for word in words if word not in set(stopwords.words("english"))]
    sentences[i] = ' '.join(words)

print(sentences)
# NOTICE: i have the captial letters and most importantly all the words make sense.

['When I consider everything grow Holds perfection little moment .', 'That huge stage presenteth nought show Whereon star secret influence comment .', 'When I perceive men plant increase , Cheered check even self-same sky : Vaunt youthful sap , height decrease , And wear brave state memory .', 'Then conceit inconstant stay , Sets rich youth sight , Where wasteful Time debateth Decay To change day youth sully night , And war Time love , As take , I engraft new .', 'But wherefore mightier way Make war upon bloody tyrant Time ?', 'And fortify decay With mean bless barren rhyme ?', 'Now stand top happy hour , And many maiden garden yet unset , With virtuous wish would bear live flower , Much liker paint counterfeit : So line life life repair Which ( Time ’ pencil ) pupil pen Neither inward worth outward fair Can make live eye men .']
