# Stop Words
Stop words are words that do not contribute to the deeper meaning of a phrase. They are typically the most common words in a language, such as "the", "a", and "is". For some applications, like document classification, it may make sense to remove stop words. Both spaCy and NLTK provide lists of commonly agreed-upon stop words; NLTK's lists cover a variety of languages, including English.

In [4]:
# Perform standard imports:
import spacy
# Load the 'en_core_web_sm' English pipeline; its Defaults and vocab expose
# the stop-word set and per-word is_stop flags used in the cells below.
nlp = spacy.load('en_core_web_sm')

In [5]:
import nltk
# Fetch the NLTK stop-word corpus; nltk.corpus.stopwords needs it later
# in this notebook.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
# Print the set of spaCy's default stop words (remember that sets are unordered):
print(nlp.Defaults.stop_words)

{'nine', 'some', 'anything', 'first', 'anyway', 'three', 'along', 'formerly', 'either', 'last', 'those', 'anywhere', 'latter', 'should', 'them', 'ten', 'these', 'towards', 'five', 'back', 'everything', 'under', 'once', 'each', 'together', "'re", 'an', 'same', 'below', 'did', 'all', 'been', 'to', 'would', 'therein', "'s", 'hers', 'part', 'something', 'because', 'none', 'which', 'please', 'say', 'there', 'whenever', 'though', 'this', 'than', 'then', '’ve', 'is', 'other', 'wherever', 'as', 'whoever', 'not', 'became', 'be', 'even', 'make', 'onto', 'everyone', 'per', 'your', 'nobody', 'if', 'being', 'almost', 'seemed', 'perhaps', 'thence', "'m", 'others', 'twenty', 'any', 'eight', 'whereupon', 'eleven', 'ever', 'become', 'n‘t', 'will', 'us', 'off', 'done', 'have', 'yourself', 'throughout', 'since', 'our', 'their', 'bottom', 'quite', '‘m', 'becomes', '’d', 'nothing', 'whereby', 'via', 're', 'n’t', 'hereupon', 'thereafter', 'never', 'elsewhere', 'most', 'the', 'cannot', 'about', 'can', 'enoug

In [7]:
# Number of entries in spaCy's default stop-word set
len(nlp.Defaults.stop_words)

326

## To see if a word is a stop word

In [8]:
# Look up the word in the vocabulary and read its is_stop flag
nlp.vocab['myself'].is_stop

True

In [9]:
# 'mystery' is not in the default stop-word list, so this reports False
nlp.vocab['mystery'].is_stop

False

In [None]:
# To add a stop word

In [10]:
# Add the word to the set of stop words. Use lowercase!
# The lexeme's is_stop flag is set separately in the next cell.
nlp.Defaults.stop_words.add('mystery')

In [11]:
# Set the is_stop flag on the lexeme so that
# nlp.vocab['mystery'].is_stop reports True from now on
nlp.vocab['mystery'].is_stop = True

In [12]:
# The set has grown by one entry after adding 'mystery'
len(nlp.Defaults.stop_words)

327

In [13]:
# Confirm that 'mystery' is now flagged as a stop word
nlp.vocab['mystery'].is_stop

True

## To remove a stop word
Alternatively, you may decide that `'beyond'` should not be considered a stop word.

In [14]:
# Remove the word from the set of stop words.
# discard() is used rather than remove() so that re-running this cell does
# not raise KeyError once 'beyond' has already been taken out of the set.
nlp.Defaults.stop_words.discard('beyond')

# Remove the stop_word tag from the lexeme
nlp.vocab['beyond'].is_stop = False

In [15]:
# The set is back to its previous size after removing 'beyond'
len(nlp.Defaults.stop_words)

326

In [16]:
# Confirm that 'beyond' is no longer flagged as a stop word
nlp.vocab['beyond'].is_stop

False

In [17]:
import string
import re
import nltk
nltk.download('punkt')
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
# Sample sentence to run through the NLTK preprocessing steps
text = 'The Quick brown fox jump over the lazy dog!'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [18]:
# Split the sentence into word tokens; punctuation such as '!' comes out
# as a separate token
tokens = word_tokenize(text)
print(tokens)

['The', 'Quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '!']


In [19]:
# Normalise case so that 'The' and 'the' are treated as the same token
tokens = list(map(str.lower, tokens))
print(tokens)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '!']


In [20]:
# Build a character class that matches any single ASCII punctuation mark.
# The characters are escaped first so that ']', '\\', '^' and '-' are taken
# literally inside the brackets.
escaped_punct = re.escape(string.punctuation)
re_punc = re.compile('[%s]' % escaped_punct)
print(re_punc)

re.compile('[!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}\\~]')


In [21]:
# Delete every punctuation character inside each token; a token made up
# only of punctuation (e.g. '!') becomes an empty string
stripped = [re_punc.sub('', token) for token in tokens]
print(stripped)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '']


In [22]:
# Keep only purely alphabetic tokens; this also drops the empty strings
# left behind by the punctuation stripping
words = list(filter(str.isalpha, stripped))
print(words)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog']


In [25]:
# Filter out stop words: keep only the tokens that are NOT in NLTK's
# English stop-word list
stop_words = set(stopwords.words('english'))  # set for fast membership tests
words = [w for w in words if w not in stop_words]
print(words)

['quick', 'brown', 'fox', 'jump', 'lazy', 'dog']


In [26]:
stop=set(stopwords.words("english"))
words=[w for w in words if not w in stop]
print (words)

['quick', 'brown', 'fox', 'jump', 'lazy', 'dog']


In [24]:
# Cross-check with spaCy: 'dog' survived the NLTK filter and is not a
# spaCy stop word either
nlp.vocab['dog'].is_stop

False