In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize

In [2]:
# Fetch NLTK's 'punkt' tokenizer data up front: it is the resource needed to
# split text into words and sentences. When the package is already installed
# and current, the call just reports that and returns.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tiwar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the BBC news CSV into a DataFrame. The raw-string prefix (r"...") keeps
# every backslash in the Windows path literal instead of starting an escape sequence.
df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\tiwar\Downloads\bbc_text_cls.csv",
)

In [4]:
# Preview the first five rows to check what was loaded.
df.head(n=5)

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [5]:
# Encode every document as a list of integer token ids, growing the
# vocabulary (token -> id) in order of first appearance.
word2idx = {}  # e.g. {'ad': 0, 'sales': 1, 'boost': 2, 'time': 3, ...}
tokenized_docs = []
for text in df['text']:
    token_ids = []
    for token in word_tokenize(text.lower()):
        # setdefault hands out the next free id only the first time a token
        # is seen; afterwards it returns the id already stored.
        token_ids.append(word2idx.setdefault(token, len(word2idx)))
    tokenized_docs.append(token_ids)
# Next unused id; equal to the number of distinct tokens assigned so far.
idx = len(word2idx)

In [6]:
# Invert the vocabulary so a token id can be mapped back to its token.
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

In [7]:
# Corpus size: one document per entry of the 'text' column.
N = df['text'].shape[0]
print(N)

2225


In [8]:
# Vocabulary size: the number of distinct tokens that received an id.
V = len(word2idx.keys())
print(V)

34762


In [9]:
# Allocate the N x V term-frequency matrix (one row per document, one column
# per vocabulary id), initially all zeros. float64 is NumPy's default dtype here.
tf = np.zeros(shape=(N, V), dtype=np.float64)

In [10]:
# Fill the term-frequency matrix: tf[i, j] ends up as the number of times
# token id j occurs in document i.
# The counts are accumulated in place, so clear the matrix first. Without
# this, re-running the cell on its own would add every count a second time.
# On a fresh run the matrix is already all zeros and this changes nothing.
tf.fill(0)
for i, doc_as_int in enumerate(tokenized_docs):
    for j in doc_as_int:
        tf[i,j] += 1
print(tf)

[[1. 4. 1. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 1. 1.]]


In [11]:
# Inverse document frequency.
# document_freq[j] is the number of documents whose count for token j is
# non-zero (not the sum of the counts), e.g. [ 12 204 127 ... ]
document_freq = (tf > 0).sum(axis=0)
# idf[j] = log(N / document_freq[j]); tokens found in fewer documents score higher.
idf = np.log(N / document_freq)
print(document_freq)
print(idf)

[ 12 204 127 ...   1   1   1]
[5.22260554 2.3893922  2.86332511 ... 7.70751219 7.70751219 7.70751219]


In [12]:
# TF-IDF: scale each column of term counts by that token's IDF weight
# (the length-V idf vector is broadcast across all N rows of tf).
tf_idf = np.multiply(tf, idf)
print(tf_idf)

[[5.22260554 9.5575688  2.86332511 ... 0.         0.         0.        ]
 [0.         0.         2.86332511 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 7.70751219 7.70751219 7.70751219]]


In [13]:
# Seed NumPy's global random generator so that it yields the same sequence of
# random numbers on every fresh run. That keeps results reproducible and makes
# them easier to debug; without a seed each run draws different numbers.
np.random.seed(seed=123)

In [15]:
# Sample one document at random and list its five highest-scoring TF-IDF terms.
i = np.random.choice(N)
row = df.iloc[i]  # the sampled document's row (label and text)
print("Label:", row["labels"])
headline = row["text"].partition("\n")[0]  # everything before the first newline
print("Text:", headline)
print("Top 5 terms:")

scores = tf_idf[i]
print(scores)
# argsort orders ascending, so sorting the negated scores yields token ids
# from the highest TF-IDF score down to the lowest.
indices = np.argsort(-scores)
print(indices)

for term_id in indices[:5]:
    print(idx2word[term_id])

Label: politics
Text: Clarke faces ID cards rebellion
Top 5 terms:
[0. 0. 0. ... 0. 0. 0.]
[ 1931  1592 23370 ... 11738 11752 34761]
cards
clarke
rebellion
id
bill
