# Stopwords
Stopwords are non-informative words that we want to take out of the text before performing analysis on it. Luckily, NLTK has a ready-made list of such words that we can use to preprocess text...but is this enough?

In [1]:
from nltk.corpus import reuters, stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# Take the first document filed under the 'crude' category and load its raw text
crude_fileids = reuters.fileids(categories='crude')
article = reuters.raw(fileids=crude_fileids[0])

In [3]:
# Show the raw text of the selected article
print(article)

JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWARDS
  The Ministry of International Trade and
  Industry (MITI) will revise its long-term energy supply/demand
  outlook by August to meet a forecast downtrend in Japanese
  energy demand, ministry officials said.
      MITI is expected to lower the projection for primary energy
  supplies in the year 2000 to 550 mln kilolitres (kl) from 600
  mln, they said.
      The decision follows the emergence of structural changes in
  Japanese industry following the rise in the value of the yen
  and a decline in domestic electric power demand.
      MITI is planning to work out a revised energy supply/demand
  outlook through deliberations of committee meetings of the
  Agency of Natural Resources and Energy, the officials said.
      They said MITI will also review the breakdown of energy
  supply sources, including oil, nuclear, coal and natural gas.
      Nuclear energy provided the bulk of Japan's electric power
  in the fiscal year ended March

In [4]:
# Split the article into sentences and keep the second one (index 1)
article_sentences = sent_tokenize(article)
sentence = article_sentences[1]
print(sentence)

MITI is expected to lower the projection for primary energy
  supplies in the year 2000 to 550 mln kilolitres (kl) from 600
  mln, they said.


In [5]:
# Break the chosen sentence into word and punctuation tokens
words = word_tokenize(sentence)
print(words)

['MITI', 'is', 'expected', 'to', 'lower', 'the', 'projection', 'for', 'primary', 'energy', 'supplies', 'in', 'the', 'year', '2000', 'to', '550', 'mln', 'kilolitres', '(', 'kl', ')', 'from', '600', 'mln', ',', 'they', 'said', '.']


## NLTK Stopwords

In [7]:
# NLTK's English stopword list, held as a set
english_stopwords = stopwords.words('english')
sw = set(english_stopwords)
sw

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [8]:
# Lowercase every token and keep only those that are not NLTK stopwords
first_result = [token for token in map(str.lower, words) if token not in sw]

In [9]:
# Show the tokens that survived stopword removal
print(first_result)

['miti', 'expected', 'lower', 'projection', 'primary', 'energy', 'supplies', 'year', '2000', '550', 'mln', 'kilolitres', '(', 'kl', ')', '600', 'mln', ',', 'said', '.']


In [11]:
# Custom stopwords to use in addition to the default NLTK list
sw_addons = {
    'said',
    'sent',
    'found',
    'including',
    'today',
    'announced',
    'week',
    'basically',
    'also',
}

In [12]:
# Remove both the NLTK stopwords and our custom additions, then print the result.
# The combined set is built once here rather than once per token inside the comprehension.
all_stopwords = sw.union(sw_addons)
output = [word.lower() for word in words if word.lower() not in all_stopwords]
print(f'{output}')

['miti', 'expected', 'lower', 'projection', 'primary', 'energy', 'supplies', 'year', '2000', '550', 'mln', 'kilolitres', '(', 'kl', ')', '600', 'mln', ',', '.']


## Getting Rid of Non-Alpha Characters: Regex

In [14]:
# Import regular expressions library
import re

In [15]:
# Compile a pattern matching every character that is NOT an ASCII letter or a space;
# substituting matches with '' strips digits, punctuation and line breaks.
# (`re` is already imported in the previous cell, so it is not re-imported here.)
regex = re.compile("[^a-zA-Z ]")

In [22]:
# Strip non-letter characters from the whole article, tokenize, lowercase,
# and drop stopwords (NLTK list plus our custom additions).
re_clean = regex.sub('', article)
# NOTE: the regex has already removed sentence-ending punctuation, so
# sent_tokenize is expected to hand back the whole article as one "sentence";
# taking element 0 therefore covers the full cleaned text.
sentence = sent_tokenize(re_clean)
words = word_tokenize(sentence[0])
# Build the combined stopword set once instead of once per token
all_stopwords = sw.union(sw_addons)
output = [word.lower() for word in words if word.lower() not in all_stopwords]
# Print result
print(f'{output}')

['japan', 'revise', 'longterm', 'energy', 'demand', 'downwards', 'ministry', 'international', 'trade', 'industry', 'miti', 'revise', 'longterm', 'energy', 'supplydemand', 'outlook', 'august', 'meet', 'forecast', 'downtrend', 'japanese', 'energy', 'demand', 'ministry', 'officials', 'miti', 'expected', 'lower', 'projection', 'primary', 'energy', 'supplies', 'year', 'mln', 'kilolitres', 'kl', 'mln', 'decision', 'follows', 'emergence', 'structural', 'changes', 'japanese', 'industry', 'following', 'rise', 'value', 'yen', 'decline', 'domestic', 'electric', 'power', 'demand', 'miti', 'planning', 'work', 'revised', 'energy', 'supplydemand', 'outlook', 'deliberations', 'committee', 'meetings', 'agency', 'natural', 'resources', 'energy', 'officials', 'miti', 'review', 'breakdown', 'energy', 'supply', 'sources', 'oil', 'nuclear', 'coal', 'natural', 'gas', 'nuclear', 'energy', 'provided', 'bulk', 'japans', 'electric', 'power', 'fiscal', 'year', 'ended', 'march', 'supplying', 'estimated', 'pct', 'k