# Stop Words
Stop words are words that do not contribute to the deeper meaning of a phrase. They are the most common words, such as *the*, *a*, and *is*. For some applications, like document classification, it may make sense to remove stop words. NLTK provides a list of commonly agreed-upon stop words for a variety of languages, such as English; spaCy ships its own default list, which is explored first below.

In [None]:
# Perform standard imports:
import spacy
# Load the 'en_core_web_sm' pipeline; `nlp` is used below to inspect and edit stop words.
nlp = spacy.load('en_core_web_sm')

In [None]:
import nltk
# Download the NLTK 'stopwords' corpus; it is read later through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Print spaCy's default stop words. This is a set, so it is unordered and the
# order shown can differ from one run to the next.
print(nlp.Defaults.stop_words)

{'say', "'re", 'various', 'anything', '‘ll', 'well', 'same', 'afterwards', 'ours', 'meanwhile', 'only', 'though', 'him', 'onto', 'there', 'they', 'by', 'least', 'put', 'however', 'ca', 'thereupon', 'nobody', 'every', 'already', 'is', 'now', 'rather', 'top', 'where', 'never', 'fifteen', 'thus', 'mine', 'eight', 'hereby', 'herein', 'eleven', 'if', 'across', 'go', 'move', 'someone', 'although', 'without', 'how', 'out', 'five', 'beside', 'empty', 'will', 'whereby', 'former', 'amount', 'make', 'you', 'these', 'all', 'everything', 'some', 'not', 'seem', 'fifty', 'therefore', 'done', 'alone', 'beforehand', 'himself', 'yours', 'i', 'on', 'yourself', 'call', 'none', 'many', '‘d', 'six', 'another', '’s', 'am', 'four', 'most', 'could', 'again', 'themselves', 'were', 'than', 'seemed', 'therein', 'along', 'was', 'whereupon', 'with', "n't", 'just', 'n‘t', '‘ve', 'their', 'less', '’ve', 'down', 'when', 'what', 'anyone', 'over', 'around', "'ve", 'except', 'me', 'always', 'against', 'something', 'where

In [None]:
# Count how many words are in spaCy's default stop word set
len(nlp.Defaults.stop_words)

326

## To see if a word is a stop word

In [None]:
# Look the word up in the vocabulary and read the is_stop flag on its lexeme
nlp.vocab['myself'].is_stop

True

In [None]:
# Same check for a word that is not flagged as a stop word by default
nlp.vocab['mystery'].is_stop

False

In [None]:
# Add the word to the set of stop words. Use lowercase!
# This only updates the set; the flag on the lexeme is set separately.
nlp.Defaults.stop_words.add('mystery')

In [None]:
# Set the stop_word tag on the lexeme so that is_stop lookups for 'mystery'
# report it as a stop word
nlp.vocab['mystery'].is_stop = True

In [None]:
# The set should now hold one more entry than before 'mystery' was added
len(nlp.Defaults.stop_words)

327

In [None]:
# Confirm that the lexeme flag now marks 'mystery' as a stop word
nlp.vocab['mystery'].is_stop

True

## To remove a stop word
Alternatively, you may decide that `'beyond'` should not be considered a stop word.

In [None]:
# Remove the word from the set of stop words.
# discard() is used rather than remove() so that re-running this cell, when
# 'beyond' has already been removed, does not raise a KeyError.
nlp.Defaults.stop_words.discard('beyond')

# Remove the stop_word tag from the lexeme
nlp.vocab['beyond'].is_stop = False

In [None]:
# One word was added earlier and one has now been removed, so the count
# should be back to its starting value
len(nlp.Defaults.stop_words)

326

In [None]:
# Confirm that the lexeme flag no longer marks 'beyond' as a stop word
nlp.vocab['beyond'].is_stop

False

In [None]:
import string
import re
import nltk
# Download the 'punkt' tokenizer data (unpacked under tokenizers/ in the log below)
nltk.download('punkt')
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
# Sample sentence used by the cleaning steps that follow
text = 'The Quick brown fox jump over the lazy dog!'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Split the sentence into tokens; punctuation such as '!' becomes its own token
tokens = word_tokenize(text)
print(tokens)

['The', 'Quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '!']


In [None]:
# normalise case by lower-casing every token
tokens = list(map(str.lower, tokens))
print(tokens)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '!']


In [None]:
# build a character class that matches any single character from string.punctuation
re_punc = re.compile('[' + re.escape(string.punctuation) + ']')
print(re_punc)

re.compile('[\\!\\"\\#\\$\\%\\&\\\'\\(\\)\\*\\+\\,\\-\\.\\/\\:\\;\\<\\=\\>\\?\\@\\[\\\\\\]\\^_\\`\\{\\|\\}\\~]')


In [None]:
# delete every punctuation character inside each token
# (a token made only of punctuation becomes an empty string)
stripped = [re_punc.sub('', token) for token in tokens]
print(stripped)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '']


In [None]:
# keep only purely alphabetic tokens (this also drops the empty strings)
words = list(filter(str.isalpha, stripped))
print(words)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog']


In [None]:
# filter out stop words: keep only the words that are NOT in NLTK's English list.
# The list is turned into a set once so each membership test is a fast lookup.
stop_words = set(stopwords.words('english'))
words = [w for w in words if w not in stop_words]
print(words)

['quick', 'brown', 'fox', 'jump', 'lazy', 'dog']


In [None]:
# Cross-check with spaCy: 'dog' survived the NLTK filter above, so see whether
# spaCy flags it as a stop word
nlp.vocab['dog'].is_stop

False