In [10]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter, defaultdict

# Make sure the NLTK 'stopwords' corpus is available locally; it is read later
# through `stopwords.words('english')` during preprocessing.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Load the corpus from bbc-text.csv (path is relative to the notebook's working
# directory). The preview below shows a `category` label and a `text` column.
df = pd.read_csv("bbc-text.csv")

# Preview the first rows to sanity-check the load.
df.head()


Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [12]:
_ENGLISH_STOP_WORDS = None  # filled on first use so the NLTK corpus is read only once


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them at most once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess(text, stop_words=None):
    """Lower-case, strip non-letters, split on whitespace and drop stop words.

    Parameters
    ----------
    text : str or None/NaN
        Raw document text. A missing cell (``None`` or a float NaN, which is
        how pandas represents an empty CSV field) yields an empty token list
        instead of raising ``AttributeError``.
    stop_words : container of str, optional
        Words to remove. Defaults to NLTK's English stop-word list, which is
        loaded once and reused across calls rather than rebuilt per document.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if text is None or (isinstance(text, float) and text != text):
        return []  # missing value: nothing to tokenise
    if stop_words is None:
        stop_words = _english_stop_words()

    text = text.lower()
    # Characters other than a-z and whitespace are deleted (not replaced by a
    # space), so "it's" becomes "its" and digits vanish entirely.
    text = re.sub(r'[^a-z\s]', '', text)
    return [word for word in text.split() if word not in stop_words]

# Tokenise every article; each row of the new `tokens` column holds the list
# returned by `preprocess` for that row's `text`.
df['tokens'] = df['text'].apply(preprocess)
df.head()


Unnamed: 0,category,text,tokens
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hands, viewers, home, theatre, sy..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, alone, former, w..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tigers, wary, farrell, gamble, leicester, say..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, twelve, raids, box, office, ocean, twe..."


In [13]:
def build_vocab(docs):
    """Collect every distinct token across `docs` and return them sorted.

    docs: iterable of token iterables (one per document).
    Returns a list of unique tokens in ascending order.
    """
    unique_tokens = {token for tokens in docs for token in tokens}
    ordered = list(unique_tokens)
    ordered.sort()
    return ordered

def doc_to_bow(doc, vocab):
    """Turn a token list into a bag-of-words count vector aligned with `vocab`.

    doc: iterable of tokens for one document.
    vocab: ordered sequence of words; position i of the result counts vocab[i].
    Returns a list of ints with one entry per vocabulary word (0 if absent).
    """
    counts = Counter(doc)
    # Counter returns 0 for unseen keys, so no membership test is needed.
    return [counts[term] for term in vocab]

# Vocabulary = sorted unique tokens over the whole corpus; its order fixes the
# column order of every vector built from it.
vocab = build_vocab(df['tokens'])

# One count vector of length len(vocab) per document, stored as a Python list
# in each cell of the new `bow` column.
df['bow'] = df['tokens'].apply(lambda x: doc_to_bow(x, vocab))


In [14]:
def compute_tf(doc, vocab):
    """Relative term frequency of each vocabulary word within one document.

    doc: list of tokens for a single document.
    vocab: ordered sequence of words; position i of the result refers to vocab[i].
    Returns a list where entry i is count(vocab[i]) / len(doc). An empty
    document yields a list of integer zeros.
    """
    doc_length = len(doc)
    if doc_length == 0:
        # Nothing to normalise by; report zero frequency everywhere.
        return [0] * len(vocab)

    counts = Counter(doc)
    return [counts[term] / doc_length for term in vocab]

def compute_idf(docs, vocab):
    """Smoothed inverse document frequency for every word in `vocab`.

    Parameters
    ----------
    docs : sized iterable of token iterables
        One token collection per document (must support ``len``).
    vocab : iterable of str
        Words to score; the result follows this order.

    Returns
    -------
    list of float
        ``log((N + 1) / (df + 1)) + 1`` per word, where ``N`` is the number of
        documents and ``df`` the number of documents containing the word. The
        +1 terms keep the value finite for words that occur in no document.
    """
    N = len(docs)

    # Count document frequencies in a single pass over the corpus. Each
    # document is reduced to a set so repeated tokens count once. This avoids
    # scanning every token list once per vocabulary word.
    doc_freq = Counter()
    for doc in docs:
        doc_freq.update(set(doc))

    return [np.log((N + 1) / (doc_freq[word] + 1)) + 1 for word in vocab]

# IDF weights are computed once over the whole corpus, in vocab order.
idf = compute_idf(df['tokens'], vocab)
# Element-wise TF * IDF gives each document's TF-IDF vector; np.multiply
# returns a NumPy array, so every cell of `tfidf` holds one array.
df['tfidf'] = df['tokens'].apply(lambda x: np.multiply(compute_tf(x, vocab), idf))


In [15]:
# Stack the per-document TF-IDF vectors into one 2-D array (a row per document,
# a column per vocabulary word) and wrap it in a frame labelled by word.
tfidf_matrix = np.array(df['tfidf'].tolist())
tfidf_df = pd.DataFrame(tfidf_matrix, columns=vocab)
# NOTE(review): the label is stored alongside the word columns. If `vocab`
# itself contains the token "category", this assignment replaces that word's
# score column with the labels — confirm whether that token is in the corpus.
tfidf_df['category'] = df['category']

def top_words_per_category(category, n=10, scores=None, labels=None, terms=None):
    """Return the `n` words with the highest mean TF-IDF score in one category.

    The ranking is computed from the raw score matrix and a separate label
    vector instead of a frame that stores the label in a column named
    'category'. That keeps a vocabulary word spelled "category" in the
    ranking rather than having its scores overwritten by the labels and then
    dropped.

    Parameters
    ----------
    category : str
        Label whose documents are averaged.
    n : int, default 10
        Number of top-scoring words to return.
    scores : 2-D array-like, optional
        Document-by-term score matrix. Defaults to the notebook's
        `tfidf_matrix`.
    labels : 1-D array-like, optional
        One label per row of `scores`, matched by position. Defaults to
        `df['category']`.
    terms : sequence of str, optional
        Column names for `scores`. Defaults to the notebook's `vocab`.

    Returns
    -------
    pandas.Series
        Mean score per word, sorted descending, truncated to `n` entries.
        A category with no documents yields NaN scores.
    """
    # Fall back to the notebook-level corpus objects when nothing is passed in.
    scores = np.asarray(tfidf_matrix if scores is None else scores)
    labels = df['category'] if labels is None else labels
    terms = vocab if terms is None else terms

    # Boolean row mask, applied by position to the score matrix.
    in_category = np.asarray(labels) == category
    category_scores = pd.DataFrame(scores[in_category], columns=terms)
    return category_scores.mean().sort_values(ascending=False).head(n)

# Show the highest-scoring words for the 'business' category (default n=10).
print(top_words_per_category('business'))


bn         0.024225
said       0.020736
us         0.016451
growth     0.013686
bank       0.013430
company    0.013115
year       0.012602
economy    0.012560
market     0.012559
sales      0.012361
dtype: float64


In [16]:
# Corpus-level view: average each word's term frequency over all documents and
# put it next to that word's (corpus-wide) IDF weight.
avg_tf = np.mean([compute_tf(doc, vocab) for doc in df['tokens']], axis=0)
avg_idf = np.array(idf)

tf_idf_compare = pd.DataFrame({'word': vocab, 'avg_tf': avg_tf, 'idf': avg_idf})

# Sorted primarily by avg_tf (descending); idf is only the tie-breaker, so this
# lists the most frequent words rather than filtering on a low IDF.
print("High TF but Low IDF:")
print(tf_idf_compare.sort_values(by=['avg_tf', 'idf'], ascending=[False, True]).head(10))

# Sorted primarily by idf (descending); avg_tf (ascending) only breaks ties
# among words sharing the same idf.
print("\nHigh IDF but Low TF:")
print(tf_idf_compare.sort_values(by=['idf', 'avg_tf'], ascending=[False, True]).head(10))


High TF but Low IDF:
         word    avg_tf       idf
23260    said  0.015731  1.164159
17528      mr  0.005589  2.034664
29860   would  0.005068  1.663056
821      also  0.004630  1.564344
30017    year  0.004373  1.799207
28567      us  0.004329  1.980530
18005     new  0.004015  1.821430
18693     one  0.003569  1.775514
19689  people  0.003560  2.025853
15043    last  0.003285  1.895616

High IDF but Low TF:
               word        avg_tf       idf
8               aan  2.050357e-07  8.014814
710      alienation  2.050357e-07  8.014814
2162       barbaric  2.050357e-07  8.014814
2874       blockade  2.050357e-07  8.014814
4239      ceausescu  2.050357e-07  8.014814
4435    chargetrial  2.050357e-07  8.014814
5111      collusion  2.050357e-07  8.014814
5280     competence  2.050357e-07  8.014814
6075       craftily  2.050357e-07  8.014814
6957  denunciations  2.050357e-07  8.014814
