# Lesson 3. TF‑IDF: from raw counts to meaningful weights

In [4]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter


## 1) Dataset

In [2]:
corpus = ['Finance wants to compare actuals versus budget for Q3.',
          'The analytics team needs clean data before the board meeting.',
          'Please consolidate invoices and send a summary by Friday.',
          'We need headcount approval before posting the new role.',
          'Logistics reported delays at the Rotterdam port this week.',
          'Procurement requested three quotes for the same supplier batch.',
          'The client asked for a revised scope and updated timeline.',
          'We are over budget; include risk notes in the report.',
          'Legal needs a redline of the latest contract draft.',
          'The CFO wants a dashboard of KPIs for monthly close.',
          'The ERP migration requires a full data reconciliation.',
          'Sales says the pipeline slipped due to seasonality.',
          'HR will update the payroll rules for contractors.',
          'The integration team is validating the chart of accounts.',
          'We should create a runbook for the month-end process.',
          'Please archive outdated SOPs and tag the current versions.',
          'Support escalated a P1 incident affecting invoices.',
          'The forecasting model needs recalibration after the merger.',
          'Security flagged a misconfigured S3 bucket in staging.',
          'The PMO wants standardized status updates every Tuesday.',
          'Our vendor portal requires multi-factor authentication next month.',
          'Please normalize currency values to USD before analysis.',
          'We need to document acceptance criteria for the new feature.',
          'Stakeholders asked for scenario analysis on OPEX reduction.',
          'The BI refresh failed due to a broken data source.',
          'We should anonymize PII fields prior to sharing the dataset.',
          'Cash flow projections must include deferred revenue.',
          'The auditor requested evidence for three random samples.',
          'We plan to A/B test the onboarding flow next release.',
          'Please schedule a retrospective to capture learnings.']

# To DO: print the length of the corpus, and the first five sentences


## 2) Why TF-IDF? 

We'll construct TF-IDF in small steps on a list of 30 sentences (called "corpus") so the maths is clear.


### Terminology

- **TF (term frequency)** – how often a word (i.e. a term) appears in a document (i.e. a sentence).  
  We’ll use a **length-normalised** version:  
  $$
  \mathrm{TF}(t, d) = \frac{\text{count of term }t\text{ in document }d}{\text{total number of terms in document }d}
  $$

- **DF (document frequency)** – in how many documents (sentences) a term appears.

- **IDF (inverse document frequency)** – a log penalty for terms common across many docs:  
  $$
  \mathrm{IDF}(t) = \log\!\left(\frac{1 + N}{1 + \mathrm{df}(t)}\right) + 1
  $$
  where **N** is the total number of documents and **df(t)** is the number of documents containing term *t*.  


### Putting it together

The **TF-IDF weight** for a given term *t* in document (sentence) *d* is:

$$
\mathrm{TF\!-\!IDF}(t, d) = \mathrm{TF}(t, d) \times \mathrm{IDF}(t)
$$

Intuitively:
- **TF** measures how important a word is within one document (sentence).
- **IDF** reduces the influence of words that appear in every document.
- Together they highlight words that are **frequent in this document but rare in others**.


**Example intuition:**

| Word | Appears in sentence | Appears across all sentences | TF | IDF | TF×IDF |
|------|----------------|--------------------|----|-----|--------|
| “finance” | 3 times | 20 docs | **TO DO** | low | **TO DO** |
| “overrun” | 2 times | 1 doc  | medium | **TO DO** | **high** |
| “the” | 8 times | 30 docs | **TO DO** | very low | **TO DO** |


## 3) Applying TF-IDF on a very small sample

In [5]:
toy_docs = [
    "the budget overrun in the finance report",
    "finance wants actuals versus budget",
    "logistics report delayed this week"
]



toy_tokens = [doc.lower().split() for doc in toy_docs]# TO DO: tokenise by a simple split on lowercase words
vocab = sorted({w for doc in toy_tokens for w in doc})
idx = {w:i for i,w in enumerate(vocab)}

# Count matrix: rows=docs, cols=terms
X_counts_toy = np.zeros((len(toy_docs), len(vocab)), dtype=int)
for r, doc in enumerate(toy_tokens):
    c = Counter(doc)
    for w, cnt in c.items():
        X_counts_toy[r, idx[w]] = cnt

print("Vocab:", vocab)
print("Counts matrix shape:", X_counts_toy.shape)
X_counts_toy


Vocab: ['actuals', 'budget', 'delayed', 'finance', 'in', 'logistics', 'overrun', 'report', 'the', 'this', 'versus', 'wants', 'week']
Counts matrix shape: (3, 13)


array([[0, 1, 0, 1, 1, 0, 1, 1, 2, 0, 0, 0, 0],
       [1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1]])

In [6]:
# Length-normalised Term frequency (TF): count / total words in the doc
doc_lengths = X_counts_toy.sum(axis=1, keepdims=True)#TO DO: get the count of each document
TF_toy = X_counts_toy / doc_lengths
print("TF for doc 0 (pairs term:tf):")
[(vocab[j], float(TF_toy[0, j])) for j in np.where(TF_toy[0]>0)[0]]


TF for doc 0 (pairs term:tf):


[('budget', 0.14285714285714285),
 ('finance', 0.14285714285714285),
 ('in', 0.14285714285714285),
 ('overrun', 0.14285714285714285),
 ('report', 0.14285714285714285),
 ('the', 0.2857142857142857)]

In [7]:
DF_toy = np.count_nonzero(X_counts_toy > 0, axis=0) # TO DO: count in how many documents each term appears
N = X_counts_toy.shape[0]
IDF_toy = np.log(N / DF_toy) #TO DO: apply inverse doc frequency formula from section 2 (log is the natural log, not base 10)

pairs = list(zip(vocab, DF_toy.tolist(), IDF_toy.tolist()))
print("Term  DF  IDF")
for t, df, idf in pairs:
    print(f"{t:>10} {df:>3} {idf:7.3f}")


Term  DF  IDF
   actuals   1   1.099
    budget   2   0.405
   delayed   1   1.099
   finance   2   0.405
        in   1   1.099
 logistics   1   1.099
   overrun   1   1.099
    report   2   0.405
       the   1   1.099
      this   1   1.099
    versus   1   1.099
     wants   1   1.099
      week   1   1.099


In [8]:
TFIDF_toy = TF_toy * IDF_toy  
doc_id = 0
top = sorted([(vocab[j], TFIDF_toy[doc_id, j]) 
              for j in np.where(TFIDF_toy[doc_id]>0)[0]],
             key=lambda x: x[1], reverse=True)
print("Top terms in doc 0 by TF-IDF (unnormalised):") # TO DO: try to explain why it's called "unnormalised"
top


Top terms in doc 0 by TF-IDF (unnormalised):


[('the', np.float64(0.31388922533374564)),
 ('in', np.float64(0.15694461266687282)),
 ('overrun', np.float64(0.15694461266687282)),
 ('budget', np.float64(0.05792358687259491)),
 ('finance', np.float64(0.05792358687259491)),
 ('report', np.float64(0.05792358687259491))]

## 4) Build CountVectorizer and TfidfVectorizer using the sklearn package

In [9]:
count_vec = CountVectorizer(lowercase=True) # TO DO: build count vectorizer 
tfidf_vec = TfidfVectorizer(lowercase=True)# TO DO: build tf-idf vectorizer 

X_count = count_vec.fit_transform(corpus)
X_tfidf = tfidf_vec.fit_transform(corpus)

X_count.shape, X_tfidf.shape


((30, 178), (30, 178))

## 5) Compare cosine similarities (Count vs TF‑IDF)

In [10]:
sim_count = cosine_similarity(X_count)#TO DO: build cosine similarity on X_count 
sim_tfidf = cosine_similarity(X_tfidf)#TO DO: build cosine similarity on X_tfidf

sim_count.shape, sim_tfidf.shape


((30, 30), (30, 30))

## 6) most_similar(query, top_k)

In [11]:
def most_similar(query, vectorizer, matrix, top_k=3):
    q = vectorizer.transform([query])
    sims = cosine_similarity(q, matrix).flatten()# TO DO: build cosine similarity between new query and existing matrix 
    order = np.argsort(sims)[::-1][:top_k]
    return [(int(i), float(sims[i])) for i in order]

## 7) Try a query

In [12]:
query = "We are over budget, include this in the finance report"
top_count = most_similar(query, count_vec, X_count, top_k=3) # TO DO: apply most_similar function on count vectorizer 
top_tfidf = most_similar(query, tfidf_vec, X_tfidf, top_k=3)# TO DO: apply most_similar function on tf-idf vectorizer
top_count, top_tfidf

([(7, 0.7999999999999999),
  (14, 0.21081851067789195),
  (28, 0.21081851067789195)],
 [(7, 0.7486225529710736), (0, 0.23813860693437672), (4, 0.14037177217188426)])