# Instructor Do: Terms Relevance (Understanding TF-IDF)

In [1]:
# Initial imports
import nltk
from nltk.corpus import reuters
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


## Loading Text from the Reuters Dataset

To demonstrate how TF-IDF works, we will use the _Reuters_ dataset that is bundled in NLTK.

In [2]:
# Download the Reuters corpus into the local NLTK data directory.
# When the package is already present, NLTK only reports that it is up-to-date.
nltk.download("reuters")



[nltk_data] Downloading package reuters to
[nltk_data]     /Users/josearturomorasoto/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [3]:
# Count the total number of documents in the collection.
# fileids() lists the ID of every document in the corpus (IDs look like "test/15045").
doc_ids = reuters.fileids()
print(f"Total number of docs in the corpus: {len(doc_ids)}")


Total number of docs in the corpus: 10788


## Getting Bag of Words from a Single Document

We select a single document from the corpus to get its "Bag of Words". The same can be done for multiple documents by passing a list of documents (or document IDs, in this example) to the `CountVectorizer()` object.

In [4]:
# Pick a single document by its file ID and print its original, untokenized text
doc_id = "test/15045"
doc_text = reuters.raw(doc_id)  # raw text of the selected document
print(doc_text)



DUTCH ADJUSTED UNEMPLOYMENT RISES IN MARCH
  Dutch seasonally adjusted unemployment
  rose in the month to end-March to a total 693,000 from 690,600
  at end-February, but was well down from 730,100 at end-March
  1986, Social Affairs Ministry figures show.
      The figure for male jobless rose by 2,000 in the month to
  436,500 compared with 470,700 a year earlier. The figure for
  women was 256,500 at end-March against 256,100 a month earlier
  and 259,400 at end-March 1986.
      On an unadjusted basis total unemployment fell by 16,500 in
  the month to end-March to 692,200. In March 1986 the figure was
  725,000.
      A ministry spokesman said the unadjusted figures showed a
  smaller than usual seasonal decrease for the time of year,
  because of particularly cold weather delaying work in the
  building industry. He said this explained the increase in the
  adjusted statistics.
      Total vacancies available rose by 1,900 to 26,300 at
  end-March. A year earlier the figure was 

In [5]:
# Create the CountVectorizer instance, ignoring scikit-learn's built-in English stop words
vectorizer = CountVectorizer(stop_words="english")

# Tokenize the document and count the occurrences of each term.
# fit_transform expects an iterable of documents, hence the one-element list.
X = vectorizer.fit_transform([doc_text])

# Retrieve the unique words list (position i is the term stored in column i of X).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns a NumPy array, so tolist()
# keeps `words` a plain list of str exactly as before.
words = vectorizer.get_feature_names_out().tolist()
print(words)



['000', '100', '16', '1986', '200', '256', '259', '26', '28', '300', '400', '436', '470', '500', '600', '690', '692', '693', '700', '725', '730', '763', '900', 'adjusted', 'affairs', 'available', 'basis', 'building', 'cold', 'compared', 'decrease', 'delaying', 'dutch', 'earlier', 'end', 'explained', 'february', 'fell', 'figure', 'figures', 'increase', 'industry', 'jobless', 'male', 'march', 'ministry', 'month', 'particularly', 'rises', 'rose', 'said', 'seasonal', 'seasonally', 'showed', 'smaller', 'social', 'spokesman', 'statistics', 'time', 'total', 'unadjusted', 'unemployment', 'usual', 'vacancies', 'weather', 'women', 'work', 'year']


In [6]:
# X is a sparse document-term matrix holding the occurrence count of each term.
# Each printed entry reads "(row, column)  count": row is the document index
# (always 0 here, since a single document was vectorized) and column is the
# term's unique ID, i.e. its position in the `words` list.
print(X)



  (0, 21)	1
  (0, 8)	1
  (0, 9)	1
  (0, 7)	1
  (0, 22)	1
  (0, 25)	1
  (0, 63)	1
  (0, 57)	1
  (0, 40)	1
  (0, 35)	1
  (0, 41)	1
  (0, 27)	1
  (0, 66)	1
  (0, 31)	1
  (0, 64)	1
  (0, 28)	1
  (0, 47)	1
  (0, 58)	1
  (0, 30)	1
  (0, 51)	1
  (0, 62)	1
  (0, 54)	1
  (0, 53)	1
  (0, 50)	2
  (0, 56)	1
  :	:
  (0, 42)	1
  (0, 43)	1
  (0, 38)	4
  (0, 39)	2
  (0, 45)	2
  (0, 24)	1
  (0, 55)	1
  (0, 3)	3
  (0, 1)	2
  (0, 20)	1
  (0, 36)	1
  (0, 14)	1
  (0, 15)	1
  (0, 0)	3
  (0, 17)	1
  (0, 59)	3
  (0, 34)	7
  (0, 46)	4
  (0, 49)	3
  (0, 52)	1
  (0, 44)	8
  (0, 48)	1
  (0, 61)	3
  (0, 23)	3
  (0, 32)	2


In [7]:
# Bag of words as a DataFrame: one row per vocabulary term with its total count.
# Summing X over axis 0 collapses the document rows; np.ravel flattens the result to 1-D.
words_df = pd.DataFrame({"Word": words, "Word_Count": np.ravel(X.sum(axis=0))})
words_df


Unnamed: 0,Word,Word_Count
0,000,3
1,100,2
2,16,1
3,1986,3
4,200,1
5,256,2
6,259,1
7,26,1
8,28,1
9,300,1


## Calculating the TF-IDF from a Corpus

In [8]:
# Build the working corpus: raw text of the first 1000 files in the Reuters dataset
all_docs_id = reuters.fileids()
corpus_id = all_docs_id[:1000]
corpus = list(map(reuters.raw, corpus_id))

# Show one document from the corpus as a sample
print(corpus[50])



NICKEL PRICES UNLIKELY TO RISE MUCH - SHEARSON
  Nickel prices are unlikely to rise
  significantly from current levels unless further steps are
  taken to reduce production, Shearson Lehman Brothers said in
  its quarterly nickel market report.
      The market had recovered slightly to around 1.72 dlrs a lb
  yesterday from its four year low of 1.55 dlrs in early January,
  due to the absence of Soviet nickel cathode deliveries, but
  Shearson sees Soviet shipments soon returning to last year's
  buoyant levels, which should ease current tightness.
      Output reductions by producers will take effect later this
  year but are likely to be offset by increases elsewhere.
      Shearson said the nickel market will be virtually in
  balance during 1987, with total non-Socialist world demand at
  556,000 tonnes, compared with an estimated 544,000 tonnes in
  1986, production at 505,000 tonnes (504,000) and imports from
  Socialist countries at 47,000 tonnes (50,000).
      It forecast pr

In [9]:
# Getting the TF-IDF: fit a TfidfVectorizer (English stop words ignored) on the corpus.
# NOTE: this rebinds `vectorizer`, replacing the CountVectorizer created earlier,
# so later cells that read `vectorizer` see the TF-IDF model.
vectorizer = TfidfVectorizer(stop_words="english")
X_corpus = vectorizer.fit_transform(corpus)  # one row per document, one column per term



In [10]:
# Getting matrix info: the TF-IDF matrix has one row per document (shape[0])
# and one column per unique token in the fitted vocabulary (shape[1])
print(f"Matrix shape: {X_corpus.shape}")
print(f"Total number of documents: {X_corpus.shape[0]}")
print(f"Total number of unique words (tokens): {X_corpus.shape[1]}")



Matrix shape: (1000, 9489)
Total number of documents: 1000
Total number of unique words (tokens): 9489


In [11]:
# Retrieve the words list from the corpus (position i is the term in column i of X_corpus).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns a NumPy array, so tolist()
# keeps `words_corpus` a plain list of str exactly as before.
words_corpus = vectorizer.get_feature_names_out().tolist()
print(words_corpus)





In [12]:
# TF-IDF weight of each word as a DataFrame: average every term's score over
# all documents in the corpus, then rank the terms from highest to lowest.
words_corpus_df = pd.DataFrame(
    {"Word": words_corpus, "TF-IDF": np.ravel(X_corpus.mean(axis=0))}
).sort_values(by="TF-IDF", ascending=False)



In [13]:
# Highest 10 TF-IDF scores (words_corpus_df is already sorted in descending order,
# so the first rows are the terms with the largest mean TF-IDF)
words_corpus_df.head(10)



Unnamed: 0,Word,TF-IDF
9199,vs,0.079701
5885,mln,0.06146
2971,cts,0.051221
1,000,0.047185
7680,said,0.045466
6083,net,0.038892
3391,dlrs,0.038615
6495,pct,0.028682
7953,shr,0.027749
5538,lt,0.0271


In [14]:
# Lowest 10 TF-IDF scores (words_corpus_df is sorted in descending order,
# so the last rows are the terms with the smallest mean TF-IDF)
words_corpus_df.tail(10)


Unnamed: 0,Word,TF-IDF
3160,denotes,5e-06
5906,modification,5e-06
2144,bulgur,5e-06
2914,cracked,5e-06
20,019,5e-06
402,302,5e-06
1024,893,5e-06
1053,927,5e-06
71,076,5e-06
7600,rolled,5e-06
