# Import required packages and download the NLTK data


In [10]:
import string
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import *
import math
from collections import Counter
import json

# Fetch the NLTK resources this notebook relies on: the tokenizer data used by
# nltk.word_tokenize and the stop-word lists read via stopwords.words('english').
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load the dataset and set up the text-preprocessing tools


In [11]:
# Read the training articles into a DataFrame
df = pd.read_csv('24_train_1.csv')  # Replace '24_train_1.csv' with the actual file path

# Porter stemmer from nltk, used to reduce tokens to their stems
stemmer = PorterStemmer()
# Translation table for str.translate: every punctuation code point maps to
# None, which deletes that character from the text
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}


# Function to preprocess text


In [12]:
def preprocess_text(text):
    """Convert a raw document into a list of stemmed unigrams.

    The text is lower-cased, stripped of punctuation, tokenized with NLTK,
    filtered against the English stop-word list and finally stemmed with the
    module-level Porter stemmer.

    Args:
        text: The document as a string.

    Returns:
        A list of stemmed tokens, in the order they appear in the document.
    """
    # Turn document into lowercase
    lowers = text.lower()
    # Remove punctuations
    no_punctuation = lowers.translate(remove_punctuation_map)
    # Tokenize document
    tokens = nltk.word_tokenize(no_punctuation)
    # Remove stop words. The stop-word list is fetched once and turned into a
    # set: evaluating stopwords.words('english') inside the comprehension
    # would reload the list and scan it linearly for every single token.
    stop_words = set(stopwords.words('english'))
    filtered = [w for w in tokens if w not in stop_words]
    # Stemming process
    stemmed = [stemmer.stem(item) for item in filtered]
    # Final unigrams
    return stemmed

In [13]:

# Apply text preprocessing to each document in the 'Text' column.
# Each cell of the new 'Processed_Text' column holds the list returned by
# preprocess_text for that row.
df['Processed_Text'] = df['Text'].apply(preprocess_text)

# Show the DataFrame, now including the 'Processed_Text' column
print(df)

     ArticleId                                               Text  \
0         1429  sfa awaits report over mikoliunas the scottish...   
1         1896  parmalat to return to stockmarket parmalat  th...   
2         1633  edu blasts arsenal arsenal s brazilian midfiel...   
3         2178  henman decides to quit davis cup tim henman ha...   
4          194  french suitor holds lse meeting european stock...   
..         ...                                                ...   
995       1250  blair  damaged  by blunkett row a majority of ...   
996       1639  a november to remember last saturday  one news...   
997        916  highbury tunnel players in clear the football ...   
998       2217  top stars join us tsunami tv show brad pitt  r...   
999        902  eastwood s baby scoops top oscars clint eastwo...   

          Category                                     Processed_Text  
0            sport  [sfa, await, report, mikoliuna, scottish, foot...  
1         business  [parmal

# Filter each document's unigrams against the predefined dictionary

In [14]:
# Load the dictionary.
# The terms are stored as the keys of a dict rather than in a set: membership
# tests stay O(1), duplicates still collapse, and iterating `dictionary`
# follows the line order of 'dictionary.txt'. Iterating a set of strings has no
# stable order between interpreter runs, which would shuffle any output whose
# columns are produced by looping over `dictionary`.
with open('dictionary.txt', 'r', encoding='utf-8') as file:
    # Strip surrounding whitespace and skip blank lines so that the empty
    # string never becomes a dictionary term.
    dictionary = dict.fromkeys(
        term for term in (line.strip() for line in file) if term
    )

# Keep only the unigrams that are present in the loaded dictionary
def filter_unigrams(unigrams):
    """Return the tokens of `unigrams` found in `dictionary`, order preserved."""
    kept = []
    for token in unigrams:
        if token in dictionary:
            kept.append(token)
    return kept

# Apply the filtering function to each row in the 'Processed_Text' column;
# the result for each row is stored in the new 'Filtered_Unigrams' column
df['Filtered_Unigrams'] = df['Processed_Text'].apply(filter_unigrams)

# Now, df['Filtered_Unigrams'] contains the filtered unigrams for each document
print(df['Filtered_Unigrams'])



0      [report, scottish, footbal, associ, refere, re...
1      [return, compani, went, account, hope, back, s...
2      [arsen, arsen, hit, club, offer, new, contract...
3      [decid, quit, davi, cup, great, britain, davi,...
4      [french, hold, meet, european, stock, market, ...
                             ...                        
995    [blair, damag, row, major, voter, believ, prim...
996    [novemb, last, saturday, one, newspap, england...
997    [player, clear, footbal, associ, said, bring, ...
998    [top, star, join, us, tsunami, tv, show, rober...
999    [top, oscar, million, dollar, beat, martin, av...
Name: Filtered_Unigrams, Length: 1000, dtype: object


In [15]:
# Select the identifier, the filtered unigrams and the label into a new DataFrame
new_df = df.loc[:, ['ArticleId', 'Filtered_Unigrams', 'Category']]

# Write the selection to 'processed_data.csv', leaving out the index column
new_df.to_csv('processed_data.csv', index=False)


#Task 2 TFIDF Matrix Calculation

In [16]:
# Function to calculate Term Frequency (TF) for each document
def calculate_tf(document):
    """Compute max-normalised term frequencies for a single document.

    Args:
        document: Iterable of tokens (for example a list of unigrams).

    Returns:
        A dict mapping each distinct token to ``count / max_count``, where
        ``max_count`` is the number of occurrences of the most frequent token
        in the document. An empty document yields an empty dict.
    """
    word_count = Counter(document)
    if not word_count:
        # A document can end up with no tokens after filtering; max() on an
        # empty sequence would raise ValueError, so return no frequencies.
        return {}
    max_occurrences = max(word_count.values())
    return {word: count / max_occurrences for word, count in word_count.items()}

# Function to calculate Inverse Document Frequency (IDF) for each word
def calculate_idf(documents):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        documents: Sized iterable of documents (it must support ``len``), each
            document being an iterable of hashable tokens.

    Returns:
        A dict mapping each word that occurs in at least one document to
        ``log(N / n_word)`` (natural logarithm), where ``N`` is the number of
        documents and ``n_word`` the number of documents containing the word.
        An empty corpus yields an empty dict.
    """
    document_count = len(documents)
    # Count, in a single pass, how many documents contain each word. set(doc)
    # makes a word count once per document regardless of repetitions. This
    # avoids rescanning every document for every vocabulary word.
    document_frequency = Counter()
    for doc in documents:
        document_frequency.update(set(doc))
    return {
        word: math.log(document_count / containing)
        for word, containing in document_frequency.items()
    }

# Function to calculate TFIDF for each document and word
def calculate_tfidf(documents, idf):
    """Build one TF-IDF dict per document.

    Args:
        documents: Iterable of documents, each passed to ``calculate_tf``.
        idf: Mapping from word to its IDF value; it must contain every word
            that occurs in `documents` (a missing word raises ``KeyError``).

    Returns:
        A list, in document order, of dicts mapping each word of the document
        to its term frequency multiplied by the word's IDF.
    """
    def weigh(document):
        # Scale each term frequency of this document by the term's IDF
        frequencies = calculate_tf(document)
        return {term: weight * idf[term] for term, weight in frequencies.items()}

    return [weigh(document) for document in documents]

# Compute IDF over the 'Filtered_Unigrams' column, then one TF-IDF dict per document
idf_values = calculate_idf(df['Filtered_Unigrams'])
tfidf_matrix = calculate_tfidf(df['Filtered_Unigrams'], idf_values)

# Write the matrix to 'matrix.txt': one comma-separated line per document with
# one value per word obtained by iterating `dictionary`; words that do not
# occur in the document are written as 0.
with open('matrix.txt', 'w', encoding='utf-8') as matrix_file:
    for tfidf_scores in tfidf_matrix:
        cells = [str(tfidf_scores.get(word, 0)) for word in dictionary]
        matrix_file.write(','.join(cells) + '\n')

#Compute Top 3 Most Frequent Words:

In [17]:
# Function to compute the top N (default 3) most frequent words for each category
def top_most_frequent_words_per_category(df, top_n=3, output_path='frequency.json'):
    """Write the most frequent words of every category to a JSON file.

    Args:
        df: DataFrame with a 'Category' column and a 'Filtered_Unigrams'
            column whose cells are iterables of tokens.
        top_n: How many words to keep per category (default 3).
        output_path: Path of the JSON file to write (default 'frequency.json').

    Returns:
        None. The JSON file maps each category to a ``{word: count}`` object
        ordered from most to least frequent.
    """
    top_frequency = {}
    for category in df['Category'].unique():
        in_category = df['Category'] == category
        # Accumulate counts row by row instead of first materialising one big
        # flattened list of every token in the category.
        word_counts = Counter()
        for unigrams in df.loc[in_category, 'Filtered_Unigrams']:
            word_counts.update(unigrams)
        top_frequency[category] = dict(word_counts.most_common(top_n))

    # Save results to the output file
    with open(output_path, 'w') as frequency_file:
        json.dump(top_frequency, frequency_file, indent=4)

# Call the function to calculate the top 3 most frequent words per category
# and save them to 'frequency.json'
top_most_frequent_words_per_category(df)

#Compute Top 3 Highest Average TFIDF Words:


In [18]:
# Function to compute the top N (default 3) highest average TFIDF words for each category
def top_average_tfidf_per_category(df, tfidf_matrix, top_n=3, output_path='scores.json'):
    """Write the words with the highest mean TF-IDF of every category to JSON.

    Args:
        df: DataFrame with a 'Category' column; row ``k`` (by position)
            corresponds to ``tfidf_matrix[k]``.
        tfidf_matrix: Sequence of ``{word: tfidf}`` dicts, one per row of `df`.
        top_n: How many words to keep per category (default 3).
        output_path: Path of the JSON file to write (default 'scores.json').

    Returns:
        None. The JSON file maps each category to a ``{word: average}`` object
        ordered from highest to lowest average. The average is taken over all
        documents of the category; a document that lacks the word counts as 0.

    Raises:
        IndexError: If `tfidf_matrix` has fewer entries than `df` has rows.
    """
    categories = list(df['Category'])
    if len(tfidf_matrix) < len(categories):
        raise IndexError('tfidf_matrix has fewer rows than df')

    top_tfidf = {}
    for category in df['Category'].unique():
        # Pair rows with their TF-IDF dict by position. Using the DataFrame's
        # index labels as list positions is only right for a default
        # 0..n-1 index and breaks after any filtering or re-indexing.
        category_rows = [
            scores for scores, label in zip(tfidf_matrix, categories)
            if label == category
        ]

        # Sum each word's scores in one pass. Only words present in at least
        # one document of the category get an entry, and the dict keeps
        # first-seen order so ties are ranked deterministically.
        totals = {}
        for scores in category_rows:
            for word, value in scores.items():
                totals[word] = totals.get(word, 0) + value

        row_count = len(category_rows)
        average_tfidf = {word: total / row_count for word, total in totals.items()}

        ranked = sorted(average_tfidf.items(), key=lambda item: item[1], reverse=True)
        top_tfidf[category] = dict(ranked[:top_n])

    # Save results to the output file
    with open(output_path, 'w') as scores_file:
        json.dump(top_tfidf, scores_file, indent=4)

# Call the function to calculate the top 3 highest average TFIDF words per
# category and save them to 'scores.json'
top_average_tfidf_per_category(df, tfidf_matrix)
