In [2]:
import numpy as np
import pandas as pd
import re

import nltk
from nltk.tokenize import word_tokenize

# Loading Dataset

In [86]:
df = pd.read_csv('bbc_text_cls.csv')

df_sample = df.iloc[:]

df_sample

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business
...,...,...
2220,BT program to beat dialler scams\n\nBT is intr...,tech
2221,Spam e-mails tempt net shoppers\n\nComputer us...,tech
2222,Be careful how you code\n\nA new European dire...,tech
2223,US cyber security chief resigns\n\nThe man mak...,tech


# Creating Vocabulary Of All Words

In [39]:
nltk.download('punkt')  # Download the necessary dataset

vocab = set()  # Use a set instead of a Series

for row in df_sample['text']:
    tokens_set = word_tokenize(row.lower())
    vocab.update(tokens_set)  # Update the set with unique tokens from each row

vocab = pd.Series(list(vocab))# Convert the set back to a Series if necessary

vocab

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Saeed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0             risen
1             1.8bn
2          rejected
3         greenback
4            target
           ...     
1140    contraction
1141         report
1142          shift
1143           curb
1144             by
Length: 1145, dtype: object

In [None]:
# Old Code For Making Occurrence DataFrame

# # Create a new DataFrame with columns reindexed from the word_series
# occurrence_df_1 = pd.DataFrame(columns=vocab)
# 
# # Iterate through each document and count occurrences of each word
# for column_name in vocab:
#     occurrence_df_1[column_name] = df_sample['text'].apply(lambda row: row.count(column_name))

# Converting Each Word In Documents Into Corresponding Int Number 

In [87]:
# Initialize the index counter for assigning unique integer IDs
idx = 0

# Create an empty dictionary to map words to their corresponding integer IDs
word2idx = {}

# Create an empty list to store tokenized representations of documents
tokenized_docs = []

# Loop through each document in the 'text' column of the DataFrame
for doc in df_sample['text']:
    # Tokenize the document by breaking it into lowercase words
    words = word_tokenize(doc.lower())
    
    # Create an empty list to store integer IDs for words in the document
    doc_as_int = []
    
    # Loop through each word in the tokenized words of the document
    for word in words:
        # Check if the word is not already in the word2idx dictionary
        if word not in word2idx:
            # Assign the current index value to the word and increment the index
            word2idx[word] = idx
            idx += 1
        
        # Append the integer ID of the word to the doc_as_int list
        doc_as_int.append(word2idx[word])
    
    # Append the list of integer IDs for the document to tokenized_docs
    tokenized_docs.append(doc_as_int)

list(word2idx.items())[:10]

[('ad', 0),
 ('sales', 1),
 ('boost', 2),
 ('time', 3),
 ('warner', 4),
 ('profit', 5),
 ('quarterly', 6),
 ('profits', 7),
 ('at', 8),
 ('us', 9)]

# Making Reverse idx2word Map

In [88]:
idx2word = {v:k for k,v in word2idx.items()}

list(idx2word.items())[:10]

[(0, 'ad'),
 (1, 'sales'),
 (2, 'boost'),
 (3, 'time'),
 (4, 'warner'),
 (5, 'profit'),
 (6, 'quarterly'),
 (7, 'profits'),
 (8, 'at'),
 (9, 'us')]

# Generating TF-IDF Matrix from Tokenized Documents 

In [89]:
# Calculate the number of rows in the matrix, which corresponds to the number of documents
N = len(df_sample['text'])

# Calculate the number of columns in the matrix, which corresponds to the vocabulary size
V = len(word2idx)   

# Initialize a Term Frequency-Inverse Document Frequency (TF-IDF) matrix with zeros
tf = np.zeros((N, V))

# Iterate through the tokenized documents and update the TF-IDF matrix
for i, doc_as_int in enumerate(tokenized_docs):
    # For each word index in the current document, increment the corresponding cell in the TF-IDF matrix
    for j in doc_as_int:
        tf[i, j] += 1

In [90]:
tf

array([[1., 4., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 1., 1.]])

# Calculating TF-IDF Weights for Document-Term Matrix

In [91]:
# Calculate the document frequency for each word in the vocabulary
document_freq = np.sum(tf > 0, axis=0)

# Calculate the inverse document frequency (IDF) for each word
idf = np.log(N / document_freq)

# Compute the TF-IDF matrix by element-wise multiplication of TF and IDF
tf_idf = tf * idf

In [92]:
tf_idf

array([[5.22260554, 9.5575688 , 2.86332511, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 2.86332511, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 7.70751219, 7.70751219,
        7.70751219]])

In [0]:
# Set the random seed for reproducibility
np.random.seed(123)

In [99]:
# Choose a random document index using the random seed
i = np.random.choice(N)

# Retrieve the selected row from the DataFrame
row = df_sample.iloc[i]

# Print the label of the selected document
print('Label: ', row['labels'])

# Print the first line of text from the selected document
print('Text: ', row['text'].split('\n', 1)[0])

# Print the top 5 terms for the selected document
print('\nTop 5 terms: ')

# Get the TF-IDF scores for the selected document
scores = tf_idf[i]

# Get the indices of terms in descending order of their scores
indices = (-scores).argsort()

# Loop through the top 5 term indices and print their corresponding words
for j in indices[:5]:
    print(idx2word[j])

Label:  sport
Text:  Hingis hints at playing comeback

Top 5 terms: 
hingis
pattaya
thailand
95th
30th
