In [1]:
# Import the Reuters database from the nltk corpus 
from nltk.corpus import reuters, stopwords
# Import tokenizers and pandas
from nltk.tokenize import sent_tokenize, word_tokenize
# Import regular expressions
import re

# Import nltk the sentence tokenizer.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bradleywise/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Get the second article from the crude category of the Reuters library and print out the article.
crude_article = reuters.raw(fileids=reuters.fileids(categories='crude')[2])
print(crude_article)

TURKEY CALLS FOR DIALOGUE TO SOLVE DISPUTE
  Turkey said today its disputes with
  Greece, including rights on the continental shelf in the Aegean
  Sea, should be solved through negotiations.
      A Foreign Ministry statement said the latest crisis between
  the two NATO members stemmed from the continental shelf dispute
  and an agreement on this issue would effect the security,
  economy and other rights of both countries.
      "As the issue is basicly political, a solution can only be
  found by bilateral negotiations," the statement said. Greece has
  repeatedly said the issue was legal and could be solved at the
  International Court of Justice.
      The two countries approached armed confrontation last month
  after Greece announced it planned oil exploration work in the
  Aegean and Turkey said it would also search for oil.
      A face-off was averted when Turkey confined its research to
  territorrial waters. "The latest crises created an historic
  opportunity to solve th

In [3]:
# Write a function to clean the article using stopwords and regular expressions.
def clean_text(article):
    """
    Preprocesses a given text article by performing the following steps:
    
    1. Removes stopwords (common words in English language).
    2. Uses regular expressions to remove non-alphabet characters (e.g., punctuation).
    3. Tokenizes the cleaned text into words.
    4. Filters out words that are in the stopwords list.
    
    Parameters:
        article (str): The input text article to be processed.

    Returns:
        list of str: A list of preprocessed words from the input article.
    """
    # Get the stopwords
    sw = set(stopwords.words('english'))
    # Use regex to substitute everything that is not a letter with an empty string.
    regex = re.compile("[^a-zA-Z ]") 
    re_clean = regex.sub(' ', article)
    # Tokenize the words 
    words = word_tokenize(re_clean)
    # Retrieve only the words that aren't in the stopwords.
    output = [word.lower() for word in words if word.lower() not in sw]
    return output

In [4]:
# Call the function with the article and print out the unique words. 
result = clean_text(crude_article)
print(set(result))

{'basicly', 'month', 'confrontation', 'members', 'turkey', 'would', 'message', 'dispute', 'aegean', 'search', 'approached', 'created', 'contents', 'solved', 'historic', 'shelf', 'averted', 'research', 'said', 'today', 'negotiations', 'economy', 'found', 'exploration', 'planned', 'minister', 'two', 'disputes', 'bilateral', 'continental', 'effect', 'opportunity', 'sea', 'disclosed', 'international', 'ministry', 'foreign', 'reply', 'issue', 'ambassador', 'dialogue', 'repeatedly', 'nazmi', 'papandreou', 'latest', 'week', 'armed', 'greece', 'justice', 'akiman', 'crises', 'legal', 'confined', 'oil', 'statement', 'due', 'also', 'turkish', 'greek', 'face', 'stemmed', 'security', 'sent', 'last', 'athens', 'solution', 'political', 'ozal', 'crisis', 'waters', 'countries', 'solve', 'nato', 'meet', 'agreement', 'could', 'including', 'territorrial', 'calls', 'court', 'work', 'prime', 'turgut', 'rights', 'andreas', 'announced'}


In [5]:
# Write a second function that does the same as the first function, but adds custom stopwords to the NLTK stopwords.
def clean_text_again(article):
    """
    Preprocesses a given text article by performing the following steps:
    
    1. Removes stopwords (common words in English language).
    2. Creates a custom dictionary of stopwords. 
    3. Uses regular expressions to remove non-alphabet characters (e.g., punctuation).
    4. Tokenizes the cleaned text into words.
    5. Filters out words that are not stopwords.
    
    Parameters:
        article (str): The input text article to be processed.

    Returns:
        list of str: A list of preprocessed words from the input article.
    """
    # Get the stopwords
    sw = set(stopwords.words('english'))
    # Create a custom dictionary of stopwords. 
    sw_addons = {'said', 'sent', 'found', 'including', 'today', 'announced', 'week', 'basicly','also'}
    # Use regex to substitute everything that is not a letter with an empty string.
    regex = re.compile("[^a-zA-Z ]")
    re_clean = regex.sub('', article)
    # Tokenize the words 
    words = word_tokenize(re_clean)
    # Retrieve only the words not in the stopwords. Create a union of the sw and sw_addons.
    output = [word.lower() for word in words if word.lower() not in sw.union(sw_addons)]
    return output

In [6]:
# Call the function with the article and print out the unique words.
result2 = clean_text_again(crude_article)
print(set(result2))

{'month', 'confrontation', 'members', 'turkey', 'would', 'message', 'dispute', 'aegean', 'search', 'approached', 'created', 'contents', 'solved', 'historic', 'shelf', 'averted', 'research', 'negotiations', 'economy', 'exploration', 'planned', 'minister', 'two', 'disputes', 'bilateral', 'continental', 'effect', 'opportunity', 'sea', 'disclosed', 'international', 'ministry', 'foreign', 'reply', 'issue', 'ambassador', 'dialogue', 'turkeys', 'repeatedly', 'nazmi', 'papandreou', 'latest', 'armed', 'greece', 'justice', 'akiman', 'crises', 'legal', 'confined', 'oil', 'statement', 'due', 'turkish', 'greek', 'stemmed', 'security', 'last', 'athens', 'solution', 'political', 'ozal', 'crisis', 'waters', 'faceoff', 'countries', 'solve', 'nato', 'meet', 'agreement', 'could', 'territorrial', 'calls', 'court', 'work', 'prime', 'turgut', 'rights', 'andreas'}
