## 1.	Data Preprocessing

In [110]:
import pandas as pd

data = pd.read_csv(r"C:\Users\MSI GF66\PycharmProjects\NLP\BBC_text\bbc-text.csv")

print(data.head())

        category                                               text
0           tech  tv future in the hands of viewers with home th...
1       business  worldcom boss  left books alone  former worldc...
2          sport  tigers wary of farrell  gamble  leicester say ...
3          sport  yeading face newcastle in fa cup premiership s...
4  entertainment  ocean s twelve raids box office ocean s twelve...


In [111]:
import nltk
import re

In [112]:
def remove_special_characters(t):
    if isinstance(t, str):
        t = re.sub(r"[.,!@?#$%&*()+=\-_{}\[\];:'\"/\\|<>`~]", "", t)
        return t
    return t

In [113]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

def remove_stop_words(t):
    if isinstance(t, str):
        tokens = t.split()
        new = [token for token in tokens if token not in stopwords]
        return " ".join(new)
    return t

[nltk_data] Downloading package stopwords to C:\Users\MSI
[nltk_data]     GF66\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [114]:
def data_cleaning(dataset):
    dataset = dataset["text"].str.lower()
    dataset = dataset.apply(remove_special_characters)
    dataset = dataset.apply(remove_stop_words)
    return dataset

In [115]:
data = data_cleaning(data)

In [116]:
print(data.head())

0    tv future hands viewers home theatre systems p...
1    worldcom boss left books alone former worldcom...
2    tigers wary farrell gamble leicester say rushe...
3    yeading face newcastle fa cup premiership side...
4    ocean twelve raids box office ocean twelve cri...
Name: text, dtype: object


In [117]:
def tokenize(t):
    if isinstance(t, str):
        return t.split()
    return []

In [118]:
data_tokens = data.apply(tokenize)


In [119]:
print(data_tokens[4])

['ocean', 'twelve', 'raids', 'box', 'office', 'ocean', 'twelve', 'crime', 'caper', 'sequel', 'starring', 'george', 'clooney', 'brad', 'pitt', 'julia', 'roberts', 'gone', 'straight', 'number', 'one', 'us', 'box', 'office', 'chart', 'took', '408m', '£21m', 'weekend', 'ticket', 'sales', 'according', 'studio', 'estimates', 'sequel', 'follows', 'master', 'criminals', 'try', 'pull', 'three', 'major', 'heists', 'across', 'europe', 'knocked', 'last', 'week', 'number', 'one', 'national', 'treasure', 'third', 'place', 'wesley', 'snipes', 'blade', 'trinity', 'second', 'taking', '161m', '£84m', 'rounding', 'top', 'five', 'animated', 'fable', 'polar', 'express', 'starring', 'tom', 'hanks', 'festive', 'comedy', 'christmas', 'kranks', 'ocean', 'twelve', 'box', 'office', 'triumph', 'marks', 'fourthbiggest', 'opening', 'december', 'release', 'us', 'three', 'films', 'lord', 'rings', 'trilogy', 'sequel', 'narrowly', 'beat', '2001', 'predecessor', 'ocean', 'eleven', 'took', '381m', '£198m', 'opening', 'we

## 2.	Implement Bag of Words

In [120]:
def build_vocab(tokenized_data):

    vocab = set()
    for tokens in tokenized_data:
        for token in tokens:
            if token.isalpha():
                vocab.add(token)
    return sorted(list(vocab))

In [121]:
def document_into_bow(document_tokens, vocabulary):
    bow = [0 for i in range(len(vocabulary))]
    word_tokens = {word: i for i, word in enumerate(vocabulary)}

    for token in document_tokens:
        if token in word_tokens:
            bow[word_tokens[token]] += 1
    return bow

In [122]:
m = int(len(data_tokens) * 0.8)
train, test = data_tokens[:m], data_tokens[m:]

vocabulary = build_vocab(train)

training_bow = []
for doc in train:
    training_bow.append(document_into_bow(doc, vocabulary))

test_bow = []
for doc in test:
    test_bow.append(document_into_bow(doc, vocabulary))

print("Vocabulary size: ", len(vocabulary))

Vocabulary size:  27554


## 3.	Implement TF-IDF

In [123]:
def compute_tf(document_tokens, vocabulary):
    tf_vector = []
    document_len = len(document_tokens)

    word_counts = {}
    for word in document_tokens:
        word_counts[word] = word_counts.get(word, 0) + 1

    for word in vocabulary:
        count = word_counts.get(word, 0)
        tf = count / document_len if document_len > 0 else 0
        tf_vector.append(tf)

    return tf_vector

In [124]:
import math

def compute_idf(all_docs, vocabulary):
    N = len(all_docs)
    idf_vector = []

    for word in vocabulary:
        df = 0
        for doc in all_docs:
            if word in doc:
                df += 1
        idf = math.log(N / (df + 1)) + 1
        idf_vector.append(idf)

    return idf_vector

In [125]:
def compute_tfidf(doc_tokens, vocabulary, idf_vector):
    tf_vector = compute_tf(doc_tokens, vocabulary)
    tf_idf_vector = []

    for tf, idf in zip(tf_vector, idf_vector):
        tf_idf_vector.append(tf * idf)
    return tf_idf_vector

In [126]:
idf_vector = compute_idf(data_tokens, vocabulary)

In [127]:
train_tf_idf = []
for doc in train:
    train_tf_idf.append(compute_tfidf(doc, vocabulary, idf_vector))

test_tf_idf = []
for doc in test:
    test_tf_idf.append(compute_tfidf(doc, vocabulary, idf_vector))


In [128]:
print("Vocabulary size:", len(vocabulary))
print("First 10 words:", vocabulary[:30])
print("First training doc TF-IDF vector sample:", train_tf_idf[0][:50], "...")

Vocabulary size: 27554
First 10 words: ['aa', 'aaa', 'aaas', 'aac', 'aaliyah', 'aaltra', 'aamir', 'aan', 'aara', 'aarhus', 'aaron', 'abacus', 'abandon', 'abandoned', 'abandoning', 'abandonment', 'abate', 'abating', 'abba', 'abbas', 'abbasi', 'abbey', 'abbot', 'abbott', 'abbreviated', 'abc', 'abd', 'abdellatif', 'abdication', 'abdominal']
First training doc TF-IDF vector sample: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.018558292453493247, 0.0, 0.0, 0.0, 0.0, 0.0] ...


## 4.	Analysis


In [129]:
import numpy as np

def top_words_by_category(tfidf_vector, labels, vocabulary, top_n=10):
    categories = set(labels)
    results = {}

    for category in categories:
        indices = [i for i, label in enumerate(labels) if label == category]

        category_vectors = [tfidf_vector[i] for i in indices]

        avg_vector = np.mean(category_vectors, axis=0)

        top_indices = np.argsort(avg_vector)[-top_n:][::-1]
        top_words = [(vocabulary[i], avg_vector[i]) for i in top_indices]

        results[category] = top_words

    return results

In [130]:
data = pd.read_csv(r"C:\Users\MSI GF66\PycharmProjects\NLP\BBC_text\bbc-text.csv")

train_labels = list(data['category'][:len(train_tf_idf)])
category_top_words = top_words_by_category(train_tf_idf, train_labels, vocabulary, top_n=10)

for category, words in category_top_words.items():
    print(f"\nCategory {category}:")
    for word, score in words:
        print(f"{word}: {score:.4f}")


Category business:
said: 0.0192
us: 0.0153
growth: 0.0130
company: 0.0128
economy: 0.0126
bank: 0.0122
year: 0.0120
mr: 0.0116
market: 0.0114
oil: 0.0113

Category entertainment:
film: 0.0348
best: 0.0196
show: 0.0151
music: 0.0144
said: 0.0141
awards: 0.0126
festival: 0.0126
band: 0.0122
award: 0.0119
star: 0.0115

Category politics:
mr: 0.0318
said: 0.0252
labour: 0.0244
election: 0.0195
party: 0.0195
blair: 0.0178
government: 0.0170
would: 0.0167
brown: 0.0136
minister: 0.0120

Category sport:
england: 0.0145
game: 0.0144
said: 0.0137
win: 0.0132
cup: 0.0119
chelsea: 0.0113
match: 0.0112
club: 0.0107
play: 0.0104
injury: 0.0104

Category tech:
said: 0.0165
people: 0.0161
users: 0.0145
software: 0.0137
microsoft: 0.0128
technology: 0.0125
net: 0.0123
mobile: 0.0120
broadband: 0.0118
digital: 0.0113


In [131]:
tf_vector = compute_tf(data_tokens[0], vocabulary)
idf_vector_np = np.array(idf_vector)

tf_np = np.array(tf_vector)

N = 750

top_tf = np.argsort(tf_np)[-N:][::-1]
top_idf = np.argsort(idf_vector_np)[-N:][::-1]

bottom_tf = np.argsort(tf_np)[:N]
bottom_idf = np.argsort(idf_vector_np)[:N]

high_tf_low_idf = [i for i in top_tf if i in bottom_idf]
low_tf_high_idf = [i for i in top_idf if i in bottom_tf]

high_tf_low_idf_words = [vocabulary[i] for i in high_tf_low_idf]
low_tf_high_idf_words = [vocabulary[i] for i in low_tf_high_idf]

print("High TF, Low IDF: ", high_tf_low_idf_words)
print("Low TF, High IDF: ", low_tf_high_idf_words)

High TF, Low IDF:  ['tv', 'people', 'want', 'us', 'content', 'means', 'one', 'new', 'said', 'show', 'bbc', 'launched', 'much', 'companies', 'uk', 'also', 'play', 'like', 'forward', 'choice', 'instead', 'everyone', 'way', 'might', 'consumer', 'future', 'find', 'years', 'technology', 'network', 'video', 'many', 'home', 'europe', 'take', 'mr', 'moment', 'digital', 'time', 'terms', 'used', 'leading', 'website', 'set', 'group', 'everything', 'experience', 'firm', 'even', 'biggest', 'mobile', 'record', 'taking', 'big', 'know', 'news', 'different', 'added', 'although', 'possible', 'growing', 'japan', 'bill', 'system', 'months', 'could', 'business', 'hard', 'help', 'programme', 'work', 'issue', 'issues', 'impact', 'available', 'well', 'today', 'together', 'senior', 'see', 'put', 'announced', 'example', 'hours', 'five', 'told', 'annual', 'called', 'end', 'rather', 'getting', 'important', 'yet', 'personal', 'already', 'suggested', 'lost', 'market', 'service', 'services', 'according', 'control', 