<a href="https://colab.research.google.com/github/ravi-kr/NaturalLanguageProcessing/blob/master/StopWords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stop Words
Stop words are words that contribute little to the deeper meaning of a phrase. They are the most common words, such as *the*, *a*, and *is*. For some applications, like document classification, it may make sense to remove stop words. NLTK provides a list of commonly agreed-upon stop words for a variety of languages, such as English; spaCy ships its own default list, which this notebook inspects first.

In [1]:
# Perform standard imports
import spacy
# Load spaCy's 'en_core_web_sm' pipeline. Later cells read its default
# stop-word set (nlp.Defaults.stop_words) and its vocabulary (nlp.vocab).
nlp = spacy.load('en_core_web_sm')

In [2]:
import nltk
# Download NLTK's 'stopwords' corpus; it is read later in this notebook
# through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Print the set of spaCy's default stop words.
# Sets are unordered, so the order shown can differ from run to run.
print(nlp.Defaults.stop_words)

{'empty', 'own', 'be', 'elsewhere', 'every', 'give', 'somehow', 'of', 'unless', '‘re', 'former', 'take', 'am', 'make', 'behind', 'being', 'seeming', 'we', 'has', 'beside', 'really', 'everyone', 'show', 'that', 'full', 'much', 'it', 'alone', 'he', 'several', 'up', 'anyway', 'part', 'becoming', 'may', 'whole', 'me', 'hereupon', 'there', 'herself', 'many', 'most', 'both', 'seems', 'thereafter', 'anything', 'for', 'thru', 'out', 'eleven', 'three', 'during', 'even', 'first', 'ca', 'six', 'after', 'therefore', 'onto', 'ourselves', 'than', '‘ve', 'call', 'more', 're', '’ve', 'does', 'hence', "'d", 'would', 'thence', 'around', 'enough', 'herein', 'into', 'other', 'our', 'she', 'since', 'two', "'s", 'beforehand', 'everywhere', 'until', 'had', 'here', 'meanwhile', 'ever', 'thereupon', 'whenever', 'yours', 'have', 'now', 'then', 'toward', 'because', 'amount', 'seem', 'itself', 'third', 'was', 'my', 'via', 'nobody', 'were', 'due', 'few', 'no', 'get', 'on', 'should', 'another', 'down', 'themselves'

In [4]:
# Number of entries in spaCy's default stop-word set
len(nlp.Defaults.stop_words)

326

## To see if a word is a stop word

In [5]:
# Look up the lexeme for 'myself' in the vocabulary and read its is_stop flag
nlp.vocab['myself'].is_stop

True

In [6]:
# Same check for 'mystery', before it is added as a stop word further down
nlp.vocab['mystery'].is_stop

False

In [7]:
# Add the word to the set of stop words. Use lowercase!
# The lexeme's is_stop flag is set separately in the next cell.
nlp.Defaults.stop_words.add('mystery')

In [8]:
# Set the stop_word tag on the lexeme so that nlp.vocab['mystery'].is_stop
# reports True (checked two cells below).
nlp.vocab['mystery'].is_stop = True

In [9]:
# Re-count the set: it should be one larger now that 'mystery' was added
len(nlp.Defaults.stop_words)

327

In [10]:
# Confirm the lexeme flag for 'mystery' after it was set above
nlp.vocab['mystery'].is_stop

True

## To remove a stop word
Alternatively, you may decide that `'beyond'` should not be considered a stop word.

In [11]:
# Remove the word from the set of stop words.
# discard() is used instead of remove() because remove() raises KeyError when
# the word is already absent, which would break this cell on a second run.
nlp.Defaults.stop_words.discard('beyond')

# Remove the stop_word tag from the lexeme
nlp.vocab['beyond'].is_stop = False

In [12]:
# Re-count the set: removing 'beyond' should offset the earlier addition of 'mystery'
len(nlp.Defaults.stop_words)

326

In [13]:
# Confirm the lexeme flag for 'beyond' after it was cleared above
nlp.vocab['beyond'].is_stop

False

In [14]:
import string
import re
# NOTE(review): nltk was already imported in an earlier cell; this repeat is redundant but harmless.
import nltk
# Download the 'punkt' tokenizer data before word_tokenize is used in the next cell.
nltk.download('punkt')
# NOTE(review): sent_tokenize is imported here but not used in the cells shown.
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
# Sample sentence used by the tokenizing and filtering cells that follow
text = 'The Quick brown fox jump over the lazy dog!'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [15]:
# Split the sentence into tokens with NLTK's word_tokenize.
# The trailing '!' comes out as its own token (see the printed list).
tokens = word_tokenize(text)
print(tokens)

['The', 'Quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '!']


In [16]:
# Normalize case: lower-case every token
tokens = list(map(str.lower, tokens))
print(tokens)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '!']


In [17]:
# Build a character-class regex that matches any single ASCII punctuation
# character; string.punctuation is escaped so it is safe inside [...].
escaped_punctuation = re.escape(string.punctuation)
re_punc = re.compile('[%s]' % escaped_punctuation)
print(re_punc)

re.compile('[\\!\\"\\#\\$\\%\\&\\\'\\(\\)\\*\\+\\,\\-\\.\\/\\:\\;\\<\\=\\>\\?\\@\\[\\\\\\]\\^_\\`\\{\\|\\}\\~]')


In [18]:
# Delete punctuation characters from every token.
# A token made up only of punctuation ends up as an empty string.
stripped = [
    re_punc.sub('', token)
    for token in tokens
]
print(stripped)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '']


In [19]:
# Keep only the purely alphabetic tokens; this also drops the empty strings
# left behind by the punctuation-stripping step.
words = list(filter(str.isalpha, stripped))
print(words)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog']


In [20]:
# Filter out the stop words, keeping only the non-stop words.
# NLTK's English stop-word list is wrapped in a set so that each membership
# test below is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))
words = [w for w in words if w not in stop_words]
print(words)

['quick', 'brown', 'fox', 'jump', 'lazy', 'dog']


In [21]:
# Check spaCy's is_stop flag for 'dog', one of the words that survived the NLTK filter above
nlp.vocab['dog'].is_stop

False