In [1]:
# Import the Reuters database from the nltk corpus 
from nltk.corpus import reuters, stopwords
# Import tokenizers and pandas
from nltk.tokenize import sent_tokenize, word_tokenize
# Import regular expressions
import re

# Import nltk and the sentence tokenizer.
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bradleywise/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Get the second article from the crude category of the Reuters library and print out the article.
article_id = reuters.fileids(categories='crude')[1]
crude_article = reuters.raw(article_id)

crude_article[:1000]

'ENERGY/U.S. PETROCHEMICAL INDUSTRY\n  Cheap oil feedstocks, the weakened U.S.\n  dollar and a plant utilization rate approaching 90 pct will\n  propel the streamlined U.S. petrochemical industry to record\n  profits this year, with growth expected through at least 1990,\n  major company executives predicted.\n      This bullish outlook for chemical manufacturing and an\n  industrywide move to shed unrelated businesses has prompted GAF\n  Corp &lt;GAF>, privately-held Cain Chemical Inc, and other firms\n  to aggressively seek acquisitions of petrochemical plants.\n      Oil companies such as Ashland Oil Inc &lt;ASH>, the\n  Kentucky-based oil refiner and marketer, are also shopping for\n  money-making petrochemical businesses to buy.\n      "I see us poised at the threshold of a golden period," said\n  Paul Oreffice, chairman of giant Dow Chemical Co &lt;DOW>, adding,\n  "There\'s no major plant capacity being added around the world\n  now. The whole game is bringing out new products a

In [9]:
print(len(stopwords.words('english')))
print(len(set(stopwords.words('english'))))

179
179


In [15]:
# Write a function to clean the article using stopwords and regular expressions.
def clean_text(article):
    """
    Preprocesses a given text article by performing the following steps:
    
    1. Removes stopwords (common words in English language).
    2. Uses regular expressions to remove non-alphabet characters (e.g., punctuation).
    3. Tokenizes the cleaned text into words.
    4. Filters out words that are in the stopwords list.
    
    Parameters:
        article (str): The input text article to be processed.

    Returns:
        list of str: A list of preprocessed words from the input article.
    """
    # Get the stopwords
    sw = set(stopwords.words('english'))
    # Use regex to substitute everything that is not a letter with an empty string.
    regex = re.compile("[^a-zA-Z]")
    re_clean = regex.sub(' ', article)
    # Tokenize the words 
    words = word_tokenize(re_clean) # ["the", "dog", ate, food, the, man, ate, food]
    
    # Retrieve only the words that aren't in the stopwords.
    output = [word.lower() for word in words if word.lower() not in sw]
    
    return output 

In [16]:
# Call the function with the article and print out the unique words. 
clean_text(crude_article)

['energy',
 'u',
 'petrochemical',
 'industry',
 'cheap',
 'oil',
 'feedstocks',
 'weakened',
 'u',
 'dollar',
 'plant',
 'utilization',
 'rate',
 'approaching',
 'pct',
 'propel',
 'streamlined',
 'u',
 'petrochemical',
 'industry',
 'record',
 'profits',
 'year',
 'growth',
 'expected',
 'least',
 'major',
 'company',
 'executives',
 'predicted',
 'bullish',
 'outlook',
 'chemical',
 'manufacturing',
 'industrywide',
 'move',
 'shed',
 'unrelated',
 'businesses',
 'prompted',
 'gaf',
 'corp',
 'lt',
 'gaf',
 'privately',
 'held',
 'cain',
 'chemical',
 'inc',
 'firms',
 'aggressively',
 'seek',
 'acquisitions',
 'petrochemical',
 'plants',
 'oil',
 'companies',
 'ashland',
 'oil',
 'inc',
 'lt',
 'ash',
 'kentucky',
 'based',
 'oil',
 'refiner',
 'marketer',
 'also',
 'shopping',
 'money',
 'making',
 'petrochemical',
 'businesses',
 'buy',
 'see',
 'us',
 'poised',
 'threshold',
 'golden',
 'period',
 'said',
 'paul',
 'oreffice',
 'chairman',
 'giant',
 'dow',
 'chemical',
 'co',
 

In [18]:
set(['and', 'and'])

{'and'}

In [21]:
# Write a second function that does the same as the first function, but adds custom stopwords to the NLTK stopwords.
def clean_text_again(article):
    """
    Preprocesses a given text article by performing the following steps:
    
    1. Removes stopwords (common words in English language).
    2. Creates a custom dictionary of stopwords. 
    3. Uses regular expressions to remove non-alphabet characters (e.g., punctuation).
    4. Tokenizes the cleaned text into words.
    5. Filters out words that are not stopwords.
    
    Parameters:
        article (str): The input text article to be processed.

    Returns:
        list of str: A list of preprocessed words from the input article.
    """
    # Get the stopwords
    sw = set(stopwords.words('english'))
    sw_addons = set(['i', 'you', 'and', 'the'])
    # Use regex to substitute everything that is not a letter with an empty string.
    regex = re.compile("[^a-zA-Z]")
    re_clean = regex.sub(' ', article)
    # Tokenize the words 
    words = word_tokenize(re_clean) # ["the", "dog", ate, food, the, man, ate, food]
    
    # Retrieve only the words that aren't in the stopwords.
    output = [word.lower() for word in words if word.lower() not in sw.union(sw_addons)]
    
    return output 

In [22]:
# Call the function with the article and print out the unique words.
print(clean_text_again(crude_article))

['energy', 'u', 'petrochemical', 'industry', 'cheap', 'oil', 'feedstocks', 'weakened', 'u', 'dollar', 'plant', 'utilization', 'rate', 'approaching', 'pct', 'propel', 'streamlined', 'u', 'petrochemical', 'industry', 'record', 'profits', 'year', 'growth', 'expected', 'least', 'major', 'company', 'executives', 'predicted', 'bullish', 'outlook', 'chemical', 'manufacturing', 'industrywide', 'move', 'shed', 'unrelated', 'businesses', 'prompted', 'gaf', 'corp', 'lt', 'gaf', 'privately', 'held', 'cain', 'chemical', 'inc', 'firms', 'aggressively', 'seek', 'acquisitions', 'petrochemical', 'plants', 'oil', 'companies', 'ashland', 'oil', 'inc', 'lt', 'ash', 'kentucky', 'based', 'oil', 'refiner', 'marketer', 'also', 'shopping', 'money', 'making', 'petrochemical', 'businesses', 'buy', 'see', 'us', 'poised', 'threshold', 'golden', 'period', 'said', 'paul', 'oreffice', 'chairman', 'giant', 'dow', 'chemical', 'co', 'lt', 'dow', 'adding', 'major', 'plant', 'capacity', 'added', 'around', 'world', 'whole'