# **BBC News Classification with Custom BoW & TF-IDF**

**1. Load BBC Dataset**

In [46]:
import pandas as pd
from google.colab import files

uploaded = files.upload()

df = pd.read_csv("bbc-text.csv", encoding = "latin1")

print(df.head())

Saving bbc-text.csv to bbc-text (2).csv
        category                                               text
0           tech  tv future in the hands of viewers with home th...
1       business  worldcom boss  left books alone  former worldc...
2          sport  tigers wary of farrell  gamble  leicester say ...
3          sport  yeading face newcastle in fa cup premiership s...
4  entertainment  ocean s twelve raids box office ocean s twelve...


**2. Data Preprocessing**


*   Clean text
*   Tokenize text



In [47]:
import re
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess(text):
  text = text.translate(str.maketrans('', '', string.punctuation))
  text = text.lower()
  text = re.sub(r"\d+", " ", text)
  tokens = text.split()
  tokens = [word for word in tokens if word not in stop_words]

  return tokens

df['tokens'] = df['text'].apply(preprocess)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [48]:
df

Unnamed: 0,category,text,tokens
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hands, viewers, home, theatre, sy..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, alone, former, w..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tigers, wary, farrell, gamble, leicester, say..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, twelve, raids, box, office, ocean, twe..."
...,...,...,...
2220,business,cars pull down us retail figures us retail sal...,"[cars, pull, us, retail, figures, us, retail, ..."
2221,politics,kilroy unveils immigration policy ex-chatshow ...,"[kilroy, unveils, immigration, policy, exchats..."
2222,entertainment,rem announce new glasgow concert us band rem h...,"[rem, announce, new, glasgow, concert, us, ban..."
2223,politics,how political squabbles snowball it s become c...,"[political, squabbles, snowball, become, commo..."


**3. Bag of Words (BoW)**

*   Build vocabulary from all documents







In [49]:
all_tokens = set()
for tokens in df['tokens']:
    all_tokens.update(tokens)

sorted_tokens = sorted(all_tokens)

vocabulary = {word: idx for idx, word in enumerate(sorted_tokens)}

print(f"Vocabulary size: {len(vocabulary)}")
print("First 20 words:", list(vocabulary.items())[:20])

Vocabulary size: 30171
First 20 words: [('aa', 0), ('aaa', 1), ('aaas', 2), ('aac', 3), ('aadc', 4), ('aaliyah', 5), ('aaltra', 6), ('aamir', 7), ('aan', 8), ('aara', 9), ('aarhus', 10), ('aaron', 11), ('abacus', 12), ('abandon', 13), ('abandoned', 14), ('abandoning', 15), ('abandonment', 16), ('abate', 17), ('abatement', 18), ('abating', 19)]


*   Convert a document to BoW vector






In [56]:
import numpy as np

bag_of_words = []
for tokens in df['tokens']:
    vector = [0] * len(vocabulary)
    for token in tokens:
        if token in vocabulary:
            vector[vocabulary[token]] += 1
    bag_of_words.append(vector)

bag_of_words = np.array(bag_of_words)

df['bow_vector'] = list(bag_of_words)

index_to_word_bow = {index: word for word, index in vocabulary.items()}

first_bow = bag_of_words[0]

first_20_bow = [(index_to_word_bow[i], round(first_bow[i], 4)) for i in range(20)]

print("First 20 words and their counts (first document):")
for word, count in first_20_bow:
    print(f"  {word}: {count}")

First 20 words and their counts (first document):
  aa: 0
  aaa: 0
  aaas: 0
  aac: 0
  aadc: 0
  aaliyah: 0
  aaltra: 0
  aamir: 0
  aan: 0
  aara: 0
  aarhus: 0
  aaron: 0
  abacus: 0
  abandon: 0
  abandoned: 0
  abandoning: 0
  abandonment: 0
  abate: 0
  abatement: 0
  abating: 0


**4. TF-IDF Implementation**


*   Compute Term Frequency (TF)



In [63]:
def compute_tf(tokens, vocabulary):
    tf_vector = np.zeros(len(vocabulary))
    total_words = len(tokens)

    for word in tokens:
        if word in vocabulary:
            tf_vector[vocabulary[word]] += 1

    return tf_vector / total_words

df['tf_vector'] = df['tokens'].apply(lambda x: compute_tf(x, vocabulary))

index_to_word = {idx: word for word, idx in vocabulary.items()}

first_tf = df['tf_vector'].iloc[0]
first_20_tf = [(index_to_word[i], round(first_tf[i], 4)) for i in range(20)]
print("First 20 words and their TF values (first document)")
for word, score in first_20_tf:
    print(f"  {word}: {score:.4f}")

First 20 words and their TF values (first document)
  aa: 0.0000
  aaa: 0.0000
  aaas: 0.0000
  aac: 0.0000
  aadc: 0.0000
  aaliyah: 0.0000
  aaltra: 0.0000
  aamir: 0.0000
  aan: 0.0000
  aara: 0.0000
  aarhus: 0.0000
  aaron: 0.0000
  abacus: 0.0000
  abandon: 0.0000
  abandoned: 0.0000
  abandoning: 0.0000
  abandonment: 0.0000
  abate: 0.0000
  abatement: 0.0000
  abating: 0.0000


*   Compute Inverse Document Frequency (IDF)


In [64]:
import math

def compute_idf(all_data, vocabulary):
    N = len(all_data)
    idf_vector = []
    for word in vocabulary:
        df_count = sum(1 for doc in all_data if word in doc)
        idf = math.log((N + 1) / (df_count + 1)) + 1
        idf_vector.append(idf)
    return np.array(idf_vector)

idf_vector = compute_idf(df['tokens'], vocabulary)

first_20_idf = [(index_to_word[i], round(idf_vector[i], 4)) for i in range(20)]
print("\nFirst 20 words and their IDF values")
for word, score in first_20_idf:
    print(f"  {word}: {score:.4f}")


First 20 words and their IDF values
  aa: 8.0148
  aaa: 6.5107
  aaas: 6.6285
  aac: 7.6093
  aadc: 8.0148
  aaliyah: 8.0148
  aaltra: 8.0148
  aamir: 8.0148
  aan: 8.0148
  aara: 8.0148
  aarhus: 8.0148
  aaron: 6.9162
  abacus: 7.6093
  abandon: 7.3217
  abandoned: 5.8176
  abandoning: 6.6285
  abandonment: 6.9162
  abate: 8.0148
  abatement: 8.0148
  abating: 7.6093




*   Compute TF-IDF vectors for each document


In [65]:
def compute_tfidf(tf_vector, idf_vector):
    return tf_vector * idf_vector

df['tfidf_vector'] = df['tf_vector'].apply(lambda x: compute_tfidf(x, idf_vector))

first_tfidf = df['tfidf_vector'].iloc[0]
first_tfidf = df['tfidf_vector'].iloc[0]

top_indices = np.argsort(first_tfidf)[-20:][::-1]
top_words = [index_to_word[i] for i in top_indices]
top_scores = [first_tfidf[i] for i in top_indices]

print("Top 20 TF-IDF words in first document:")
for w, s in zip(top_words, top_scores):
    print(f"  {w}: {s:.4f}")

Top 20 TF-IDF words in first document:
  tv: 0.0999
  dvr: 0.0748
  hanlon: 0.0591
  highdefinition: 0.0525
  tivo: 0.0489
  want: 0.0472
  watch: 0.0434
  satellite: 0.0425
  content: 0.0404
  brands: 0.0382
  brand: 0.0378
  programmes: 0.0352
  people: 0.0348
  viewers: 0.0346
  us: 0.0341
  schedules: 0.0340
  means: 0.0339
  channel: 0.0338
  technologies: 0.0332
  lcd: 0.0326


**5. Analysis**

*   Top-10 words by average TF-IDF



In [66]:
categories = df['category'].unique()

index_to_word = {idx: word for word, idx in vocabulary.items()}

for category in categories:
    print(f"\nTop TF-IDF Words for category: {category}")

    cat_df = df[df['category'] == category]

    tfidf_matrix = np.stack(cat_df['tfidf_vector'].values)

    avg_tfidf = tfidf_matrix.mean(axis=0)

    sorted_indices = np.argsort(avg_tfidf)[::-1]

    for i in sorted_indices[:10]:
        word = index_to_word[i]
        score = avg_tfidf[i]
        print(f"  {word}: {score:.4f}")


Top TF-IDF Words for category: tech
  people: 0.0174
  said: 0.0169
  users: 0.0149
  software: 0.0147
  mobile: 0.0145
  technology: 0.0133
  microsoft: 0.0121
  net: 0.0114
  digital: 0.0112
  computer: 0.0111

Top TF-IDF Words for category: business
  bn: 0.0243
  said: 0.0206
  us: 0.0164
  growth: 0.0136
  bank: 0.0134
  company: 0.0130
  economy: 0.0125
  year: 0.0125
  market: 0.0125
  sales: 0.0123

Top TF-IDF Words for category: sport
  england: 0.0147
  game: 0.0146
  said: 0.0139
  win: 0.0134
  cup: 0.0124
  match: 0.0115
  club: 0.0109
  injury: 0.0107
  chelsea: 0.0104
  play: 0.0102

Top TF-IDF Words for category: entertainment
  film: 0.0353
  best: 0.0193
  show: 0.0150
  music: 0.0143
  said: 0.0140
  awards: 0.0137
  band: 0.0128
  award: 0.0123
  festival: 0.0117
  album: 0.0116

Top TF-IDF Words for category: politics
  mr: 0.0326
  said: 0.0259
  labour: 0.0239
  blair: 0.0196
  election: 0.0193
  party: 0.0193
  government: 0.0172
  would: 0.0167
  brown: 0.0131



*   high TF/low IDF & low TF/high IDF




In [67]:
X_bow = np.stack(df['bow_vector'].values)
global_tf = np.sum(X_bow, axis=0) / np.sum(X_bow)

index_to_word = {idx: word for word, idx in vocabulary.items()}

tf_idf_df = pd.DataFrame({
    'word': [index_to_word[i] for i in range(len(vocabulary))],
    'global_tf': global_tf,
    'idf': idf_vector
})

high_tf_low_idf = tf_idf_df.sort_values(['global_tf', 'idf'], ascending=[False, True]).head(10)
low_tf_high_idf = tf_idf_df.sort_values(['global_tf', 'idf'], ascending=[True, False]).head(10)

print("High TF & Low IDF words:")
for _, row in high_tf_low_idf.iterrows():
    print(f"  {row['word']}: TF={row['global_tf']:.4f}, IDF={row['idf']:.4f}")

print("\nLow TF & High IDF words:")
for _, row in low_tf_high_idf.iterrows():
    print(f"  {row['word']}: TF={row['global_tf']:.4f}, IDF={row['idf']:.4f}")

High TF & Low IDF words:
  said: TF=0.0151, IDF=1.1642
  mr: TF=0.0062, IDF=2.0347
  would: TF=0.0054, IDF=1.6631
  also: TF=0.0045, IDF=1.5643
  people: TF=0.0043, IDF=2.0259
  new: TF=0.0041, IDF=1.8214
  us: TF=0.0040, IDF=1.9805
  year: TF=0.0039, IDF=1.7992
  one: TF=0.0037, IDF=1.7736
  could: TF=0.0031, IDF=1.9326

Low TF & High IDF words:
  aa: TF=0.0000, IDF=8.0148
  aaltra: TF=0.0000, IDF=8.0148
  aamir: TF=0.0000, IDF=8.0148
  aan: TF=0.0000, IDF=8.0148
  aara: TF=0.0000, IDF=8.0148
  aarhus: TF=0.0000, IDF=8.0148
  abate: TF=0.0000, IDF=8.0148
  abatement: TF=0.0000, IDF=8.0148
  abbot: TF=0.0000, IDF=8.0148
  abbreviated: TF=0.0000, IDF=8.0148


**6. Save cleaned CSV**

In [68]:
output_df = df[['category', 'text', 'tokens']].copy()

high_tf_low_words = [w for w in high_tf_low_idf['word']]
low_tf_high_words = [w for w in low_tf_high_idf['word']]

output_df['High TF/Low IDF'] = ", ".join(high_tf_low_words)
output_df['Low TF/High IDF'] = ", ".join(low_tf_high_words)

output_df.to_csv("cleaned_bbc_text.csv", index=False, encoding='utf-8')