In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import math
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import Counter
from nltk.stem.porter import *
import json
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#SAMPLE CODE FOR GENERATING THE UNIGRAMS

# Porter stemmer instance used by get_tokens to reduce words to their stems
stemmer = PorterStemmer()
# translation table for str.translate: every punctuation code point maps to
# None, which makes translate() delete that character
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}
def get_tokens(text):
  """Turn one raw document into its list of dictionary-restricted stems.

  Pipeline: lowercase -> delete punctuation -> nltk.word_tokenize ->
  drop English stop words -> Porter-stem -> keep only stems that are in
  `predefined_unigrams`.

  Args:
    text: the document as a single string.

  Returns:
    list of stems in document order; repeated words stay repeated.

  Note:
    Reads the module-level names `remove_punctuation_map`, `stemmer` and
    `predefined_unigrams`. `predefined_unigrams` is created in a later cell
    and must exist before this function is first called.
  """
  # turn document into lowercase
  lowers = text.lower()
  # remove punctuations
  no_punctuation = lowers.translate(remove_punctuation_map)
  # tokenize document
  tokens = nltk.word_tokenize(no_punctuation)
  # remove stop words: fetch the stop-word list once per call and keep it in a
  # set, instead of re-loading the list and scanning it for every token
  stop_words = set(stopwords.words('english'))
  filtered = [w for w in tokens if w not in stop_words]
  # stemming process: stem each token once and reuse the result
  stemmed = []
  for item in filtered:
    stem = stemmer.stem(item)
    if stem in predefined_unigrams:
      stemmed.append(stem)
  # final unigrams
  return stemmed

In [3]:
# Input file locations. The /content/drive/MyDrive prefix is presumably a
# mounted Google Drive (Colab) -- adjust both paths when running elsewhere.
dict_path = '/content/drive/MyDrive/dictionary.txt'  # Path for Dictionary file (dictionary.txt)
data_path = '/content/drive/MyDrive/24_train_1.csv'  # Path for Dataset file (24_train_1.csv)

In [4]:
#READING THE CSV FILE
# Load the dataset into a DataFrame; the printed preview shows the columns
# ArticleId, Text and Category.
data = pd.read_csv(data_path)
print(data)
# Keep the article identifiers as a separate Series.
output_Doc = data['ArticleId']
# print(output_Doc)

     ArticleId                                               Text  \
0         1429  sfa awaits report over mikoliunas the scottish...   
1         1896  parmalat to return to stockmarket parmalat  th...   
2         1633  edu blasts arsenal arsenal s brazilian midfiel...   
3         2178  henman decides to quit davis cup tim henman ha...   
4          194  french suitor holds lse meeting european stock...   
..         ...                                                ...   
995       1250  blair  damaged  by blunkett row a majority of ...   
996       1639  a november to remember last saturday  one news...   
997        916  highbury tunnel players in clear the football ...   
998       2217  top stars join us tsunami tv show brad pitt  r...   
999        902  eastwood s baby scoops top oscars clint eastwo...   

          Category  
0            sport  
1         business  
2            sport  
3            sport  
4         business  
..             ...  
995       politics  
996

In [5]:
#open the file located in dict_path and read it exactly once.
with open(dict_path, 'r') as file:
    # one dictionary word per line; the list keeps the file's line order
    ordered_words = file.read().splitlines() #split the lines

# set built from the same lines, so both views always agree; used for
# membership tests in get_tokens
predefined_unigrams = set(ordered_words)


In [6]:
# Run get_tokens over every entry of the 'Text' column.
# processed_docs[i] is the unigram list of the i-th article.
processed_docs = list(map(get_tokens, data['Text']))

In [7]:
# One plain {unigram: count} dictionary per article, in the same order as
# processed_docs.
word_dicts = [dict(Counter(tokens)) for tokens in processed_docs]
# print(word_dicts)

In [8]:
# Per-document term counts
def compute_term_frequencies(docs):
    """Count the filtered unigrams of every document.

    Each document is passed through get_tokens and the returned unigrams
    are tallied.

    Args:
        docs: iterable of raw document strings.

    Returns:
        list of collections.Counter objects, one per document, in input order.
    """
    return [Counter(get_tokens(raw_text)) for raw_text in docs]

In [9]:
#Function to calculate TF-IDF
def compute_tf_idf(docs, dictionary):
    """Build the TF-IDF matrix of `docs` over the terms of `dictionary`.

    TF is the term's count divided by the count of the most frequent term
    in the same document. IDF is ln(N / df), where N = len(docs) and df is
    the number of documents that contain the term. A term found in no
    document gets IDF 0 (its TF column is all zeros anyway) instead of
    raising ZeroDivisionError.

    Side effect: prints the TF matrix and the IDF vector.

    Args:
        docs: sized iterable of raw document strings; tokenised through
            compute_term_frequencies.
        dictionary: sequence of terms; column j belongs to dictionary[j].

    Returns:
        numpy.ndarray of float64 with shape (len(docs), len(dictionary)).
    """
    n_docs = len(docs)

    # Initialize matrices
    tf = np.zeros((n_docs, len(dictionary)), dtype=np.float64)
    idf = np.zeros(len(dictionary), dtype=np.float64)

    # Term frequencies
    term_freqs = compute_term_frequencies(docs)

    # term -> column indices; a list so that a term repeated in `dictionary`
    # still fills every one of its columns
    columns = {}
    for j, term in enumerate(dictionary):
        columns.setdefault(term, []).append(j)

    # TF: visit only the terms that actually occur in each document, and
    # collect document frequencies in the same pass
    doc_counts = Counter()
    for i, doc_freqs in enumerate(term_freqs):
        doc_counts.update(doc_freqs.keys())
        max_freq = max(doc_freqs.values(), default=1)  # default guards empty documents
        for term, count in doc_freqs.items():
            for j in columns.get(term, ()):
                tf[i, j] = count / max_freq

    # IDF: computed once per term; unseen terms keep the initial 0.0
    for term, term_columns in columns.items():
        n_containing_term = doc_counts[term]
        if n_containing_term:
            idf[term_columns] = math.log(n_docs / n_containing_term)
    print(tf)
    print(idf)
    # TFIDF matrix
    tfidf = tf * idf
    return tfidf

In [10]:
# Series of raw article texts (the 'Text' column); passed to compute_tf_idf.
texts = data['Text']

In [11]:
# TF-IDF matrix: one row per article in `texts`, one column per word in
# `ordered_words`. compute_tf_idf also prints the TF matrix and the IDF vector.
tfidf_matrix = compute_tf_idf(texts, ordered_words)

[[0.2        0.         0.2        ... 0.2        0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.05882353 0.         0.        ]]
[1.44392347 2.61729584 1.80180981 2.65926004 2.60369019 2.24431618
 2.88240359 3.05760768 2.71810054 3.05760768 2.79688141 2.57702194
 2.28278247 2.81341072 2.71810054 2.67364877 2.02495336 1.69281952
 1.98050159 3.07911388 3.01593498 2.67364877 2.90042209 2.76462055
 2.84731227 2.79688141 2.71810054 2.67364877 3.14655516 2.16282315
 2.79688141 2.61729584 1.84516025 2.44184716 2.70306266 2.88240359
 2.3859667  2.37515579 2.11196473 2.93746337 2.68824757 3.41124772
 1.29098418 3.07911388 3.38139475 2.78062089 2.97592965 3.01593498
 2.70306266 2.796881

In [19]:
# Wrap the TF-IDF matrix in a DataFrame whose columns are labelled with the
# dictionary words (row i corresponds to row i of tfidf_matrix).
tfidf_df = pd.DataFrame(tfidf_matrix, columns=list(ordered_words))
# Per-category results; both start empty and are keyed by category name.
top_freq_words = {}
top_tfidf_words = {}

In [20]:
# For every category: the 3 most frequent unigrams and the 3 words with the
# highest mean TF-IDF score.
for category in data['Category'].unique():
    # Boolean row mask for this category. tfidf_df is aligned with `data` by
    # row position (it was built from the texts in order), so the mask is
    # applied positionally rather than through data's index labels.
    in_category = data['Category'] == category
    category_texts = data.loc[in_category, 'Text']

    # Computing the most frequent words
    word_counts = Counter()
    for text in category_texts:
        word_counts.update(get_tokens(text))
    top_freq_words[category] = word_counts.most_common(3)

    # Computing the highest average TFIDF words
    category_tfidf = tfidf_df.loc[in_category.to_numpy()].mean().sort_values(ascending=False)
    top_tfidf_words[category] = category_tfidf.head(3).to_dict()


In [21]:
# Show both per-category summaries (plain dict representation).
print("Top 3 Most Frequent Words in Each Category:", top_freq_words)
print("Top 3 Highest Average TFIDF Words in Each Category:", top_tfidf_words)

Top 3 Most Frequent Words in Each Category: {'sport': [('said', 428), ('game', 353), ('win', 288)], 'business': [('said', 724), ('us', 377), ('year', 360)], 'tech': [('said', 757), ('use', 459), ('peopl', 427)], 'entertainment': [('film', 450), ('said', 386), ('year', 249)], 'politics': [('said', 996), ('mr', 726), ('would', 495)]}
Top 3 Highest Average TFIDF Words in Each Category: {'sport': {'game': 0.35727414648708805, 'england': 0.31907434737608514, 'win': 0.30741067997068455}, 'business': {'firm': 0.2891252868078186, 'bank': 0.2697288199539767, 'market': 0.2616290834155383}, 'tech': {'mobil': 0.3462714837303001, 'phone': 0.3319065027131584, 'softwar': 0.3152238172837377}, 'entertainment': {'film': 0.7216412939111394, 'award': 0.4106447057087541, 'star': 0.40803563438879187}, 'politics': {'labour': 0.45105036714182084, 'elect': 0.4313731783204545, 'mr': 0.42043597469422206}}


In [22]:
# Save the TF-IDF table as CSV under a relative path (current working
# directory); index=False leaves the row index out of the file.
tfidf_df.to_csv('tfidf_matrix.csv', index=False)

In [23]:
#CONVERTING THE TFIDF MATRIX TO A TEXT FILE ('matrix.txt')
# Destination of the plain-text dump
file_path = '/content/drive/MyDrive/matrix.txt'

# One line per matrix row; the values of a row are rendered with str() and
# separated by commas. No newline after the last row.
matrix_string = "\n".join(
    ",".join(str(value) for value in row)
    for row in tfidf_matrix
)

with open(file_path, 'w') as file:
    file.write(matrix_string)

# bare expression so the notebook echoes the destination path
file_path

'/content/drive/MyDrive/matrix.txt'

In [24]:
#CONVERTING THE MAP TO A JSON FILE - TOP 3 FREQUENT WORDS IN EACH CATEGORY (frequency.json)

# Build {category: {word: count}} from the (word, count) pairs. dict() accepts
# the pairs directly, so top_freq_words is only read here and is not rewritten
# in place after it has already been displayed.
adjusted_map = {category: dict(words) for category, words in top_freq_words.items()}

json_file_path = '/content/drive/MyDrive/frequency.json'

with open(json_file_path, 'w') as json_file:
    json.dump(adjusted_map, json_file, indent=4)

print(f"File saved at {json_file_path}")

File saved at /content/drive/MyDrive/frequency.json


In [25]:
#CONVERTING THE MAP TO A JSON FILE - top 3 highest average TFIDF words by category (scores.json)

# Destination of the TF-IDF summary (rebinds json_file_path).
json_file_path = '/content/drive/MyDrive/scores.json'

# top_tfidf_words maps category -> {word: mean TF-IDF score}; indent=4
# pretty-prints the JSON.
with open(json_file_path, 'w') as json_file:
    json.dump(top_tfidf_words, json_file, indent=4)

print(f"File saved at {json_file_path}")

File saved at /content/drive/MyDrive/scores.json
