<a href="https://colab.research.google.com/github/sidharth178/Natural-Language-Processing-Tutorial/blob/master/3_StopWords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stop Words
Stop words are words that do not contribute to the deeper meaning of a phrase. They are the most common words in a language, such as *the*, *a*, and *is*. For some applications, like document classification, it may make sense to remove stop words. Both NLTK and spaCy provide lists of commonly agreed-upon stop words; NLTK ships lists for a variety of languages, including English.

In [None]:
# Perform standard imports and load the 'en_core_web_sm' spaCy model.
# The resulting `nlp` object is what the spaCy cells below query for stop words.
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
import nltk
# Fetch NLTK's 'stopwords' corpus so that nltk.corpus.stopwords can be read in a later cell
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Print the set of spaCy's default stop words
# (remember that sets are unordered, so do not rely on the order shown):
print(nlp.Defaults.stop_words)

{'although', 'thereafter', 'elsewhere', 'among', 'another', 'almost', 'anywhere', 'already', 'sixty', 'own', 'toward', '‘ve', 'whatever', 'whether', 'did', 'of', 'within', 'here', 'using', 'would', '‘m', 'those', 'six', 'formerly', 'full', 'not', 'yet', 'five', 'everywhere', 'thereby', "n't", 'neither', 'except', 'along', 'from', 'latter', 'quite', 'herein', 'used', 'whereby', '’re', 'but', "'m", 'whence', 'hereby', 'next', 'a', 'somehow', 'by', 'too', 'therefore', 'ten', 'to', 'doing', 'whom', 'though', 'ourselves', 'n‘t', 'because', 'sometime', 'see', 'upon', 'becoming', 'mostly', 'could', 'something', 'via', 'none', 'on', 'for', 'back', 'ca', 'nine', 'during', 'most', 'should', 'hereupon', '‘re', 'anything', 'alone', 'whole', 'her', 'eight', 'will', 'such', 'moreover', 'which', 'regarding', 'out', 'at', 'then', 'empty', 'others', 'also', 'it', 'ever', 'they', 'even', 'meanwhile', "'re", 'fifteen', 'besides', 'very', 'him', 'anyone', "'ve", '’s', 'keep', 'thru', 'just', 'hereafter', 

In [None]:
from nltk.corpus import stopwords 
# As the last expression of the cell, the returned list is displayed in full
stopwords.words('english') # all stop words NLTK lists for the English language

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
# Number of entries in spaCy's default stop-word set, before the modifications made further down
len(nlp.Defaults.stop_words)

326

### **Check whether a word is a stop word (using `nlp.vocab`)**

In [None]:
# We can check whether a word is a stop word by indexing nlp.vocab with the
# word's text and reading the `is_stop` flag on the entry that comes back
nlp.vocab['myself'].is_stop # is "myself" flagged as a stop word?

True

In [None]:
# Same lookup for a word that has not been marked as a stop word yet
nlp.vocab['mystery'].is_stop

False

### **Add a word as a stop word**

In [None]:
# step-1: Add the word to the set of stop words. Use lowercase!
# (adding to a set is a no-op when the word is already present, so this cell can be re-run safely)
nlp.Defaults.stop_words.add('mystery')

In [None]:
# step-2: Set the stop_word tag on the lexeme, i.e. the `is_stop` flag
# of the vocabulary entry for 'mystery'
nlp.vocab['mystery'].is_stop = True

In [None]:
# Re-count the stop words after adding 'mystery'
len(nlp.Defaults.stop_words)

327

In [None]:
# Confirm that 'mystery' is now reported as a stop word
nlp.vocab['mystery'].is_stop

True

### **To remove a stop word**
Alternatively, you may decide that `'and'` should not be considered a stop word.

In [None]:
# step-1: Remove the word from the set of stop words.
# discard() is used instead of remove() because remove() raises KeyError when
# the word is already absent, which happens if this cell is run a second time
# on the same kernel.
nlp.Defaults.stop_words.discard('and')

# step-2: Remove the stop_word tag from the lexeme
nlp.vocab['and'].is_stop = False

In [None]:
# Re-count the stop words after removing 'and'
# NOTE(review): starting from 326, one addition and one removal should give 326;
# the recorded output of 325 suggests the saved outputs come from a different
# kernel state -- confirm with Restart & Run All.
len(nlp.Defaults.stop_words)

325

In [None]:
# Confirm that 'and' is no longer reported as a stop word
nlp.vocab['and'].is_stop

False

### **Remove stop words from a sentence**

In [None]:
import string
import re
import nltk
# word_tokenize needs NLTK's Punkt tokenizer data. Older NLTK releases read the
# 'punkt' package, while recent releases look for 'punkt_tab' instead, so both
# are fetched to keep this notebook runnable from a fresh environment.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
# Sample sentence that the following cells tokenize and clean
text = 'The Quick brown fox jump over the lazy dog!'

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SIDHARTH\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [None]:
# Split the sentence into word tokens with NLTK's word_tokenize.
# As the recorded output shows, the trailing '!' comes back as its own token.
tokens = word_tokenize(text)
print(tokens)

['The', 'Quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '!']


In [None]:
# Normalise every token to lower case so that 'The' and 'the' are treated alike
tokens = [token.lower() for token in tokens]
print(tokens)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '!']


In [None]:
# Build a regex character class that matches any single character from
# string.punctuation; re.escape keeps characters such as ']' and '\' literal
punctuation_class = '[%s]' % re.escape(string.punctuation)
re_punc = re.compile(punctuation_class)
print(re_punc)

re.compile('[!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}\\~]')


In [None]:
# Delete punctuation characters from every token; a token made up solely of
# punctuation becomes the empty string
stripped = [re_punc.sub('', token) for token in tokens]
print(stripped)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '']


In [None]:
# Keep only purely alphabetic entries; this also drops the empty strings
# left behind by the punctuation-stripping step
words = [candidate for candidate in stripped if candidate.isalpha()]
print(words)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog']


In [None]:
# Filter out the stop words, keeping only the non-stop words.
# The NLTK list is converted to a set so each membership test is a fast lookup.
stop_words = set(stopwords.words('english'))
words = [w for w in words if w not in stop_words]
print(words)

['quick', 'brown', 'fox', 'jump', 'lazy', 'dog']


In [None]:
# Check whether one of the remaining tokens is a stop word.
# NOTE: this lookup goes through spaCy's vocabulary, whereas the filtering
# in the previous cell used NLTK's English stop-word list.
nlp.vocab['dog'].is_stop

False