In [6]:
# Import nltk, numpy and pandas.
import nltk
import numpy as np
import pandas as pd
# Import Reuters
from nltk.corpus import reuters
# Import TfidfVectorizer from sklearn
from sklearn.feature_extraction.text import  TfidfVectorizer

# Download the Reuters corpus into the local nltk_data directory so that
# `nltk.corpus.reuters` can read it. NLTK reports the package as up to date
# when it has already been fetched.
nltk.download("reuters")

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/bradleywise/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

## Get all the Articles About Money

In [7]:
# List every category label available in the Reuters corpus, to pick the
# money-related ones used in the next cell.
print(reuters.categories())

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [8]:
# Get all the "fileids" in the "money-fx" and "money-supply" categories.
categories = ["money-fx", "money-supply"]
docs_id = reuters.fileids()

# Keep every file ID that is tagged with at least one of the target categories.
# The membership test runs against the whole `categories` list (rather than
# indexing categories[0] / categories[1]), so adding or removing entries above
# is honoured without touching this filter.
money_news_ids = [
    doc
    for doc in docs_id
    if any(category in categories for category in reuters.categories(doc))
]

# Print the total number of news articles about money.
print(f"Total number of news articles about money: {len(money_news_ids)}")

Total number of news articles about money: 883


In [9]:
# Load the raw text of every money-related article, in the same order as
# money_news_ids.
money_news = list(map(reuters.raw, money_news_ids))

# Show one article as a sanity check.
print(money_news[78])

U.S. BANKS LIKELY TO LIFT PRIME RATES AGAIN SOON
  Major U.S. banks may lift prime
  lending rates again within days due to recent increases in
  their borrowing costs and speculation the Federal Reserve is
  nudging up interest rates to help the dollar, economists said.
      In what was the first prime rate boost since mid-1984, most
  banks in early April lifted their rates a quarter point to
  7-3/4 pct, citing a reduced gap between the prime and their own
  cost of money. That spread has narrowed again.
      "A prime rate increase could happen as soon as tonight,"
  said Robert Brusca of Nikko Securities Co International Inc.
      Brusca said a quarter-point prime rate rise to eight pct is
  justified because the spread between banks' cost of funds and
  the prime rate has narrowed to less than three quarters of a
  percentage point.
      He said that spread had averaged around 1.4 percentage
  points since last October until it fell below one point and
  triggered the April pr

## Calculate the TF-IDF Weights

In [10]:
# Create a TfidfVectorizer configured with scikit-learn's built-in English
# stop-word list, so those words are excluded from the vocabulary.
vectorizer = TfidfVectorizer(stop_words="english")
# Fit the vectorizer on the money articles and convert them to TF-IDF
# features in a single step; X holds one row per article in money_news.
X = vectorizer.fit_transform(money_news)

In [17]:
# Dimensions of the TF-IDF matrix: (number of articles, number of vocabulary terms).
X.shape

(883, 7356)

In [18]:
# Total number of cells in the TF-IDF matrix (rows x columns). Computed from
# X itself rather than from hand-copied numbers, so it stays correct if the
# corpus or the vocabulary changes.
X.shape[0] * X.shape[1]

6495348

In [16]:
# Convert the TF-IDF matrix to a dense NumPy array for visual inspection.
# NOTE(review): this materialises every cell of X; on a larger corpus consider
# densifying only a slice (e.g. the first few rows) - confirm intent.
X.toarray()

array([[0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ],
       [0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ],
       [0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ],
       ...,
       [0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ],
       [0.      , 0.065751, 0.      , ..., 0.      , 0.      , 0.      ],
       [0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ]])

In [11]:
# Vocabulary terms, in the column order used by the TF-IDF matrix.
words = [*vectorizer.get_feature_names_out()]
# Per-term totals: sum each column of X over all articles and flatten the
# resulting single-row result into a plain list.
frequency = [*np.ravel(X.sum(axis=0))]

In [12]:
# Pair each vocabulary term with its summed TF-IDF weight across the corpus.
money_news_df = pd.DataFrame(dict(Word=words, Frequency=frequency))

# Preview the first ten rows.
money_news_df.head(10)

Unnamed: 0,Word,Frequency
0,0,3.867438
1,0,1.956415
2,0,0.101317
3,2,0.076203
4,3,0.171025
5,4,0.188415
6,6913,0.055155
7,6916,0.076118
8,7050,0.076118
9,7100,0.055155


In [13]:
# Preview the last ten rows of the term/weight table.
money_news_df.tail(10)

Unnamed: 0,Word,Frequency
7346,zero,0.381634
7347,zhejiang,0.11776
7348,zimbabwe,0.610475
7349,zimoil,0.119876
7350,zoete,0.05555
7351,zone,0.440092
7352,zones,1.75615
7353,zurich,0.43158
7354,zwermann,0.057754
7355,üside,0.060045


In [14]:
# Rank the terms from highest to lowest summed TF-IDF weight and renumber the
# rows 0..n-1 so the index reflects the ranking.
money_news_df = money_news_df.sort_values(
    by=["Frequency"],
    ascending=False,
    ignore_index=True,
)

# Show the ten highest-weighted terms.
money_news_df.head(10)

Unnamed: 0,Word,Frequency
0,said,54.918051
1,mln,51.533825
2,bank,49.568996
3,stg,47.236863
4,billion,43.544274
5,pct,41.917193
6,dollar,37.17879
7,fed,36.860352
8,dlrs,36.273205
9,market,35.086673


In [21]:
# Alternative: build the term/weight table directly from (term, weight) pairs
# and rank it in one chained expression.
money_news_df = (
    pd.DataFrame(
        list(zip(vectorizer.get_feature_names_out(), np.ravel(X.sum(axis=0)))),
        columns=["Word", "Frequency"],
    )
    # Highest summed TF-IDF weight first.
    .sort_values(by=["Frequency"], ascending=False)
    # Renumber the rows so the index matches the ranking.
    .reset_index(drop=True)
)

# Show the ten highest-weighted terms.
money_news_df.head(10)

Unnamed: 0,Word,Frequency
0,said,54.918051
1,mln,51.533825
2,bank,49.568996
3,stg,47.236863
4,billion,43.544274
5,pct,41.917193
6,dollar,37.17879
7,fed,36.860352
8,dlrs,36.273205
9,market,35.086673


## How many documents contain a specific word or group of words?

In [22]:
# Define `retrieve_docs(terms)`, which searches the money-news articles and
# returns the IDs of the articles that contain a given word or group of words.
def retrieve_docs(terms, doc_ids=None, corpus=None):
    """
    Return the IDs of the documents that contain at least one of the given terms.

    Each document is tokenized with ``corpus.words(doc_id)`` and every token is
    lowercased before comparison. A term matches a token when it occurs anywhere
    inside that token (substring match), so ``"japan"`` also matches
    ``"japanese"``. Terms are lowercased as well, which makes the search
    case-insensitive. Scanning of a document stops at its first matching token.

    Parameters:
    terms (str or iterable of str): The term(s) to search for. A single string
        is treated as one term. Empty strings are ignored, because an empty
        term would otherwise match every token.
    doc_ids (iterable of str, optional): The document IDs to search. Defaults
        to the notebook-level ``money_news_ids`` list.
    corpus (optional): Any object exposing ``words(doc_id)`` that returns the
        tokens of a document. Defaults to the NLTK ``reuters`` corpus.

    Returns:
    list of str: The matching document IDs, in the order they appear in
        ``doc_ids``. The list is empty when no usable term is supplied.

    Example:
    >>> len(retrieve_docs(["japan", "banks"]))  # number of matching articles
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(terms, str):
        terms = [terms]
    # Tokens are compared in lowercase, so the terms must be lowercased too.
    needles = [term.lower() for term in terms if term]
    if not needles:
        return []

    # Fall back to the notebook-level working corpus when none is supplied.
    if doc_ids is None:
        doc_ids = money_news_ids
    if corpus is None:
        corpus = reuters

    def _mentions_any_term(doc_id):
        """Return True as soon as one token of the document contains a term."""
        for token in corpus.words(doc_id):
            lowered = token.lower()
            if any(needle in lowered for needle in needles):
                return True
        return False

    return [doc_id for doc_id in doc_ids if _mentions_any_term(doc_id)]

 ### Question 1: How many articles talk about Yen?

In [24]:
# List the IDs of the money articles that have a token containing "yen".
retrieve_docs(["yen"])

['test/14890',
 'test/14913',
 'test/14987',
 'test/15431',
 'test/15442',
 'test/15450',
 'test/15453',
 'test/15460',
 'test/15625',
 'test/15677',
 'test/15689',
 'test/16053',
 'test/16066',
 'test/16067',
 'test/16068',
 'test/16069',
 'test/16072',
 'test/16106',
 'test/16111',
 'test/16177',
 'test/16189',
 'test/16190',
 'test/16565',
 'test/16744',
 'test/16754',
 'test/16755',
 'test/16779',
 'test/17041',
 'test/17044',
 'test/17047',
 'test/17871',
 'test/17980',
 'test/18360',
 'test/18363',
 'test/18370',
 'test/18674',
 'test/19061',
 'test/20001',
 'test/20862',
 'test/21202',
 'test/21542',
 'test/21573',
 'training/10263',
 'training/10308',
 'training/10323',
 'training/10337',
 'training/10357',
 'training/10358',
 'training/10359',
 'training/10364',
 'training/10370',
 'training/10382',
 'training/10475',
 'training/10546',
 'training/10617',
 'training/10650',
 'training/10651',
 'training/10652',
 'training/10654',
 'training/10659',
 'training/10660',
 'trainin

In [11]:
# Count the money articles that have a token containing "yen".
len(retrieve_docs(["yen"]))

182

### Question 2: How many articles talk about Japan or Banks?

In [12]:
# Count the money articles that have a token containing "japan" or "banks".
len(retrieve_docs(["japan", "banks"]))

326

 ### Question 3: How many articles talk about England or Dealers?

In [13]:
# Count the money articles that have a token containing "england" or "dealers".
len(retrieve_docs(["england", "dealers"]))

299