In [23]:
# Import nltk, numpy and pandas.
import nltk
import numpy as np
import pandas as pd
# Import Reuters
from nltk.corpus import reuters
# Import CountVectorizer, TfidfVectorizer from sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Download the Reuters dataset if you didn't install it.
# nltk.download("reuters")

In [24]:
# List every topic category label available in the Reuters corpus.
print(reuters.categories())

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [25]:
# Retrieve the file ID of every document in the collection.
doc_ids = reuters.fileids()
# The number of file IDs is the number of documents in the corpus.
print(f"Total number of docs in the corpus: {len(doc_ids)}")

Total number of docs in the corpus: 10788


## Count the occurrences of each word in a single document.

In [26]:
# Select a single document (the one at index 1 of doc_ids) and print its raw text.
doc_text = reuters.raw(doc_ids[1])
print(doc_text)

CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STOCKS
  A survey of 19 provinces and seven cities
  showed vermin consume between seven and 12 pct of China's grain
  stocks, the China Daily said.
      It also said that each year 1.575 mln tonnes, or 25 pct, of
  China's fruit output are left to rot, and 2.1 mln tonnes, or up
  to 30 pct, of its vegetables. The paper blamed the waste on
  inadequate storage and bad preservation methods.
      It said the government had launched a national programme to
  reduce waste, calling for improved technology in storage and
  preservation, and greater production of additives. The paper
  gave no further details.
  




In [27]:
# Create a CountVectorizer that ignores English stop words.
vectorizer = CountVectorizer(stop_words='english')

# Learn the vocabulary from the single document and count each term's occurrences.
# The document is wrapped in a list because fit_transform takes an iterable of documents.
X = vectorizer.fit_transform([doc_text])

In [28]:
# X holds the count of each vocabulary term in the document.
# Each printed entry reads "(row, column)  value":
#   - row is the document number; there is only one document, so it is always 0.
#   - column is the index of the word in the vocabulary created by fit_transform.
#   - value is how many times that word appears in the document.
print(X)

  (0, 9)	4
  (0, 12)	2
  (0, 37)	1
  (0, 46)	2
  (0, 14)	1
  (0, 0)	2
  (0, 29)	4
  (0, 18)	2
  (0, 40)	2
  (0, 42)	1
  (0, 1)	1
  (0, 33)	1
  (0, 38)	2
  (0, 10)	1
  (0, 39)	1
  (0, 11)	1
  (0, 36)	3
  (0, 48)	1
  (0, 4)	1
  (0, 25)	2
  (0, 44)	2
  (0, 2)	1
  (0, 15)	1
  (0, 27)	1
  (0, 23)	1
  (0, 35)	1
  (0, 3)	1
  (0, 45)	1
  (0, 28)	2
  (0, 7)	1
  (0, 47)	2
  (0, 21)	1
  (0, 41)	2
  (0, 6)	1
  (0, 30)	2
  (0, 24)	1
  (0, 17)	1
  (0, 22)	1
  (0, 26)	1
  (0, 32)	1
  (0, 34)	1
  (0, 8)	1
  (0, 20)	1
  (0, 43)	1
  (0, 19)	1
  (0, 31)	1
  (0, 5)	1
  (0, 16)	1
  (0, 13)	1


In [29]:
# Retrieve the vocabulary: the unique words (features) learned by the vectorizer.
words = vectorizer.get_feature_names_out()
print(words)

['12' '19' '25' '30' '575' 'additives' 'bad' 'blamed' 'calling' 'china'
 'cities' 'consume' 'daily' 'details' 'eat' 'fruit' 'gave' 'government'
 'grain' 'greater' 'improved' 'inadequate' 'launched' 'left' 'methods'
 'mln' 'national' 'output' 'paper' 'pct' 'preservation' 'production'
 'programme' 'provinces' 'reduce' 'rot' 'said' 'says' 'seven' 'showed'
 'stocks' 'storage' 'survey' 'technology' 'tonnes' 'vegetables' 'vermin'
 'waste' 'year']


In [30]:
# Print the vocabulary size (number of unique words), then look up the word stored at index 20.
print(len(words))
print(words[20])

49
improved


In [31]:
# Inspect the type of the object returned by fit_transform.
type(X)

scipy.sparse._csr.csr_matrix

In [32]:
# X.toarray()

In [33]:
# Convert X to a dense array and show row 0: the count of each vocabulary word in the document.
X.toarray()[0]

array([2, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 2, 1, 1, 2, 4, 2, 1, 1, 1, 1, 1, 3, 1, 2, 1, 2, 2, 1, 1,
       2, 1, 2, 2, 1])

In [34]:
# Build the Bag-of-Words table from the dense counts, labelling each column with its vocabulary word.
bow_df = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
print(bow_df.shape)

# Show only the first 20 columns of the DataFrame.
bow_df.iloc[:, 0:20]

(1, 49)


Unnamed: 0,12,19,25,30,575,additives,bad,blamed,calling,china,cities,consume,daily,details,eat,fruit,gave,government,grain,greater
0,2,1,1,1,1,1,1,1,1,4,1,1,2,1,1,1,1,1,2,1


In [35]:
# Reshape the Bag-of-Words DataFrame from wide to long: one row per word with its count.
melted_bow = pd.melt(bow_df, var_name="Word", value_name="Word_Counts")
melted_bow.head(5)

Unnamed: 0,Word,Word_Counts
0,12,2
1,19,1
2,25,1
3,30,1
4,575,1


In [36]:
# Order the words from most to least frequent and renumber the rows from 0.
sorted_bow = (
    melted_bow
    .sort_values(by="Word_Counts", ascending=False)
    .reset_index(drop=True)
)
sorted_bow.head(5)

Unnamed: 0,Word,Word_Counts
0,china,4
1,pct,4
2,said,3
3,12,2
4,paper,2


In [37]:
# Alternative approach: build the word-count table directly from the vectorizer output.
# NOTE: this relies on `vectorizer` and `X` still being the CountVectorizer results;
# a later cell rebinds `vectorizer` to a TfidfVectorizer, so run the cells in order.
# The vocabulary words, as a plain list.
words = list(vectorizer.get_feature_names_out())
# Summing X over axis=0 gives one total per word; np.ravel flattens the result to 1-D.
frequency = list(np.ravel(X.sum(axis=0)))

# Create a DataFrame of the word counts (not TF-IDF weights) for each word in the document.
words_df = pd.DataFrame({"Word": words, "Word_Count": frequency})

# Alternatively you can use:
# words_df = pd.DataFrame(list(zip(words, np.ravel(X.sum(axis=0)))), columns=["Word", "Word_Count"])

# Sort on Word_Count in descending order and reset the index.
sorted_words = (
    words_df
    .sort_values(by=["Word_Count"], ascending=False)
    .reset_index(drop=True)
)
sorted_words.head(5)

Unnamed: 0,Word,Word_Count
0,china,4
1,pct,4
2,said,3
3,12,2
4,paper,2


## Calculate TF-IDF scores for a corpus of documents.

In [39]:
# Take the first 1,000 Reuters file IDs and load the raw text of each article.
corpus_id = doc_ids[:1000]
corpus = list(map(reuters.raw, corpus_id))

# Print a sample document.
print(corpus[50])

NICKEL PRICES UNLIKELY TO RISE MUCH - SHEARSON
  Nickel prices are unlikely to rise
  significantly from current levels unless further steps are
  taken to reduce production, Shearson Lehman Brothers said in
  its quarterly nickel market report.
      The market had recovered slightly to around 1.72 dlrs a lb
  yesterday from its four year low of 1.55 dlrs in early January,
  due to the absence of Soviet nickel cathode deliveries, but
  Shearson sees Soviet shipments soon returning to last year's
  buoyant levels, which should ease current tightness.
      Output reductions by producers will take effect later this
  year but are likely to be offset by increases elsewhere.
      Shearson said the nickel market will be virtually in
  balance during 1987, with total non-Socialist world demand at
  556,000 tonnes, compared with an estimated 544,000 tonnes in
  1986, production at 505,000 tonnes (504,000) and imports from
  Socialist countries at 47,000 tonnes (50,000).
      It forecast pr

In [40]:
# Create a TfidfVectorizer that ignores English stop words.
# NOTE: this rebinds `vectorizer`, which earlier cells bound to a CountVectorizer,
# so re-running those earlier cells after this one would use the TF-IDF vectorizer instead.
vectorizer = TfidfVectorizer(stop_words="english")

# Learn the vocabulary from the 1,000 articles and compute their TF-IDF features.
X_corpus = vectorizer.fit_transform(corpus)

In [41]:
# Print the sparse matrix of the transformed data.
# Each printed entry reads "(row, column)  value":
#   - row is the document number (there are 1,000 documents).
#   - column is the index of the word in the vocabulary created by fit_transform.
#   - value is the TF-IDF score of that word in that document.
print(X_corpus)

  (0, 3653)	0.02349374154800046
  (0, 3572)	0.03401531227145892
  (0, 9282)	0.023258637913775613
  (0, 9251)	0.02895058701961889
  (0, 5736)	0.031788665087932794
  (0, 5876)	0.04269766365167542
  (0, 5004)	0.02382046223442031
  (0, 5281)	0.04421016230849752
  (0, 5595)	0.04421016230849752
  (0, 8051)	0.03951191235503726
  (0, 5798)	0.037999413698215155
  (0, 7395)	0.04041700708351447
  (0, 3186)	0.07352722462649618
  (0, 6900)	0.03152282152841889
  (0, 7263)	0.03871356222491918
  (0, 4079)	0.031267012229669745
  (0, 1746)	0.04890841226195779
  (0, 6009)	0.04890841226195779
  (0, 9449)	0.04890841226195779
  (0, 6839)	0.03571875713005421
  (0, 3204)	0.029906837778906972
  (0, 8301)	0.036221071481879155
  (0, 5719)	0.03951191235503726
  (0, 3623)	0.04146186226670836
  (0, 8287)	0.04890841226195779
  :	:
  (999, 6787)	0.1736525446693858
  (999, 7495)	0.23299949141320167
  (999, 7225)	0.11157842602334765
  (999, 6428)	0.09972089581992193
  (999, 2491)	0.3605906417869579
  (999, 7187)	0.0868

In [43]:
# Report the matrix dimensions: shape[0] is the number of documents (rows)
# and shape[1] is the number of unique vocabulary words (columns).
print(f"Matrix shape: {X_corpus.shape}")
print(f"Total number of documents: {X_corpus.shape[0]}")
print(f"Total number of unique words (tokens): {X_corpus.shape[1]}")

Matrix shape: (1000, 9489)
Total number of documents: 1000
Total number of unique words (tokens): 9489


In [45]:
# Inspect the type of the object returned by fit_transform.
type(X_corpus)

scipy.sparse._csr.csr_matrix

In [44]:
# Convert the sparse TF-IDF matrix to a dense NumPy array for inspection
# (one row per document, one column per vocabulary word).
X_corpus.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.26172577, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [46]:
# Retrieve the vocabulary (unique words) learned from the 1,000-article corpus.
words_corpus = vectorizer.get_feature_names_out()
print(words_corpus)

['00' '000' '0000' ... 'zones' 'zurich' 'zy']


In [None]:
# Display X_corpus as the cell's output value.
X_corpus

In [47]:
# Pair each vocabulary word with its TF-IDF score averaged over all documents
# (mean along axis=0, flattened to 1-D with np.ravel) and store the pairs in a DataFrame.
words_corpus_df = pd.DataFrame(
    data=list(zip(words_corpus, np.ravel(X_corpus.mean(axis=0)))),
    columns=["Word", "TF-IDF"],
)

# Order the words from highest to lowest mean TF-IDF and renumber the rows from 0.
sorted_words_corpus = (
    words_corpus_df
    .sort_values(by=["TF-IDF"], ascending=False)
    .reset_index(drop=True)
)

# Highest 10 mean TF-IDF scores
sorted_words_corpus.head(n=10)

Unnamed: 0,Word,TF-IDF
0,vs,0.079701
1,mln,0.06146
2,cts,0.051221
3,000,0.047185
4,said,0.045466
5,net,0.038892
6,dlrs,0.038615
7,pct,0.028682
8,shr,0.027749
9,lt,0.0271


In [48]:
# Lowest 10 TF-IDF scores: the last rows of the descending-sorted DataFrame.
sorted_words_corpus.tail(10)

Unnamed: 0,Word,TF-IDF
9479,denotes,5e-06
9480,modification,5e-06
9481,bulgur,5e-06
9482,cracked,5e-06
9483,019,5e-06
9484,302,5e-06
9485,893,5e-06
9486,927,5e-06
9487,076,5e-06
9488,rolled,5e-06
