In [None]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import math
from collections import Counter

# Vocabulary: one term per line. The list order matters, because a term's
# position in this list is used as its column index in the matrices below.
with open('dictionary.txt', 'r') as dictionary_file:
    dictionary = dictionary_file.read().splitlines()

# Training data; later steps read its 'Text' and 'Category' columns.
df = pd.read_csv('24_train_2.csv', encoding='unicode_escape')

# Porter stemmer from NLTK, shared by every tokenization call.
stemmer = PorterStemmer()
# Translation table for str.translate(): every punctuation code point maps to
# None, which deletes that character.
remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))

def get_tokens(text):
    """Turn a raw document into its list of stemmed dictionary unigrams.

    The text is lower-cased, stripped of the characters in
    ``string.punctuation``, tokenized with NLTK, filtered against NLTK's
    English stop-word list, Porter-stemmed, and finally restricted to stems
    that appear in the module-level ``dictionary``.

    Args:
        text: The document as a string.

    Returns:
        A list of the surviving stems in document order, duplicates included.
    """
    # Build both lookup sets once per call. Previously stopwords.words() was
    # invoked for every token and each stem was searched linearly in the
    # dictionary list; set membership gives the same answers in O(1).
    stop_words = set(stopwords.words('english'))
    vocabulary = set(dictionary)

    # Normalize case, then drop punctuation before tokenizing.
    lowered = text.lower()
    no_punctuation = lowered.translate(remove_punctuation_map)
    tokens = nltk.word_tokenize(no_punctuation)

    # Stop words are removed before stemming, so the comparison is made on the
    # surface form of each token.
    filtered = [token for token in tokens if token not in stop_words]
    stemmed = [stemmer.stem(token) for token in filtered]

    # Final unigrams: only stems that are part of the vocabulary.
    return [stem for stem in stemmed if stem in vocabulary]

# Tokenize every document once; each entry of 'Unigrams' is the list returned
# by get_tokens for that document.
df['Unigrams'] = df['Text'].apply(get_tokens)

# Term-frequency matrix: one row per document, one column per dictionary term.
tf_matrix = np.zeros((len(df), len(dictionary)), dtype=float)

# Resolve each term's column once instead of scanning the dictionary list with
# `in` and `.index()` for every word of every document. setdefault keeps the
# first position of a repeated term, which is what list.index() returns.
word_to_index = {}
for j, term in enumerate(dictionary):
    word_to_index.setdefault(term, j)

for i, unigrams in enumerate(df['Unigrams']):
    word_count = Counter(unigrams)
    if not word_count:
        # Document has no unigrams: its row stays all zeros.
        continue
    # Normalize by the count of the most frequent term in this document, so
    # that term gets tf == 1.0.
    max_freq = max(word_count.values())
    for word, count in word_count.items():
        j = word_to_index.get(word)
        if j is not None:
            tf_matrix[i, j] = count / max_freq

# Inverse document frequency: idf[j] = ln(N / n_j), where N is the number of
# documents and n_j is the number of documents whose tf for term j is positive.
# Terms that appear in no document keep an idf of 0.
n_documents = len(df)

# Count the documents containing each term with a single vectorized reduction
# rather than running Python's built-in sum() over every column.
doc_counts = (tf_matrix > 0).sum(axis=0)

idf_vector = np.array(
    [
        math.log(n_documents / doc_count) if doc_count > 0 else 0.0
        for doc_count in doc_counts
    ],
    dtype=float,
)

# TF-IDF: scale every tf column by its term's idf (idf_vector is broadcast
# across the document rows) and keep four decimal places.
tfidf_matrix = np.round(tf_matrix * idf_vector, 4)

output_file = 'matrix.txt'

# Dump the matrix as text: one document per line, scores comma-separated and
# printed with four decimals.
with open(output_file, 'w') as matrix_file:
    for scores in tfidf_matrix:
        line = ','.join(format(score, '.4f') for score in scores)
        matrix_file.write(line + '\n')

# Per-category accumulators, keyed by each distinct value of 'Category':
#   category_tfidf_sum  - element-wise sum of the tfidf rows of its documents
#   category_word_count - unigram frequencies over its documents
#   category_doc_count  - number of documents in the category
categories = df['Category'].unique()
category_tfidf_sum = {category: np.zeros(len(dictionary)) for category in categories}
category_word_count = {category: Counter() for category in categories}
category_doc_count = dict.fromkeys(categories, 0)

for tfidf_row, unigrams, category in zip(tfidf_matrix, df['Unigrams'], df['Category']):
    category_tfidf_sum[category] += tfidf_row
    category_word_count[category].update(unigrams)
    category_doc_count[category] += 1

# Average tfidf per term within each category: summed vector / document count.
category_avg_tfidf = {
    category: tfidf_total / category_doc_count[category]
    for category, tfidf_total in category_tfidf_sum.items()
}

# For every category, the three terms with the highest average tfidf, as
# (term, average) pairs ordered from highest to lowest.
top_words_by_tfidf = {}
for category, avg_tfidf in category_avg_tfidf.items():
    # argsort is ascending, so reverse it and keep the first three positions.
    best_columns = np.argsort(avg_tfidf)[::-1][:3]
    top_words_by_tfidf[category] = [
        (dictionary[column], avg_tfidf[column]) for column in best_columns
    ]

# For every category, the three most frequent unigrams as (term, count) pairs.
top_words_by_frequency = {
    category: counts.most_common(3)
    for category, counts in category_word_count.items()
}

# Report both rankings. They share the same layout and differ only in the
# heading and in how the value next to each word is rendered.
def _print_rankings(heading, rankings, render_value):
    """Print `heading`, then each category followed by its ranked (word, value) pairs."""
    print(heading)
    for category_name, ranked_words in rankings.items():
        print(f"\nCategory: {category_name}")
        for term, value in ranked_words:
            print(f"  {term}: {render_value(value)}")


_print_rankings(
    "\n3 highest Average tfidf words in each category:",
    top_words_by_tfidf,
    lambda score: f"{score:.4f}",
)

_print_rankings(
    "\n3 most frequent words in each category:",
    top_words_by_frequency,
    lambda count: f"{count}",
)
