In [150]:
import json
import math as m

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [151]:
#get words after filtering stop words
def get_filtered_words(headline):
    return [word for word in word_tokenize(headline) if not word in stop_words and not word.isdigit() and word != ',']

# Build a vocab list from train dataset
def build_vocab_list(train_dataset):
    vocab_list = []
    for record in train_dataset:
        filtered_words = get_filtered_words(record['headline'])
        vocab_list.extend(filtered_words)
    return vocab_list

In [152]:
#create count frequency of words on training data
def build_doc_freq_dict(train_dataset,categories):
    word_frequency_dict = {category:{vocab:0 for vocab in vocab_list} for category in categories}
    for record in train_dataset:
        for word in get_filtered_words(record['headline']):
            word_frequency_dict[record['category']][word]+=1
    return word_frequency_dict


In [153]:
# seperate dataset by class
def get_data_group_by_classes(dataset):
    seperated_category_dict = {category:[] for category in categories}
    for record in dataset:
        seperated_category_dict[record['category']].append(record)
    return seperated_category_dict

In [154]:
def get_record_counts_per_category(separated_category_dict):
    record_counts = {}
    for category, records in separated_category_dict.items():
        record_counts[category] = len(records)
    return record_counts

In [155]:
#to acces to word count in respective category
def get_word_count_in_category(word_counts_per_category, desired_category, desired_word):
    if desired_category in word_counts_per_category:
        category_word_counts = word_counts_per_category[desired_category]
        if desired_word in category_word_counts:
            return category_word_counts[desired_word]
    return 0

In [156]:
def get_words_counts_per_category(separated_category_dict):
    word_counts_per_category = {}

  # Iterate through each category and its records
    for category, records in separated_category_dict.items():
        word_counts = {}
        for i, record in enumerate(records):
            # Get filtered words from the headline for the current record
            filtered_words = get_filtered_words(record['headline'])

            #removing duplicate entries in filtered_words
            filtered_words = list(set(filtered_words))

            # Update word counts for this category
            for word in filtered_words:
                if word in word_counts:
                    word_counts[word] += 1
                else:
                    word_counts[word] = 1
        # Add the word counts for this category to the dictionary
        word_counts_per_category[category] = word_counts
    return word_counts_per_category


In [157]:
#checks if smoothing is needed (In case of Multinomial)
def is_smoothing_needed(filterd_words,word_dict,category):
    for word in filterd_words:
        if word not in word_dict[category].keys():
            return True
    return False

# get probability of the text assuming conditional independence(Multinomail naive bayes)
def get_naive_probability(category,text,word_dict,grouped_classes_dict,dataset):
    if category not in word_dict.keys():
        return
    total_word_cond_prob = 1
    filterd_words = get_filtered_words(text)
    smoothing_indicator = is_smoothing_needed(filterd_words,word_dict,category)
    for word in filterd_words:
        if smoothing_indicator :
            word_doc_freq = word_dict[category][word] if word in word_dict[category].keys() else 0
            total_word_cond_prob *= ((word_doc_freq+1)/((len(grouped_classes_dict[category])+(len(vocab_list))))) # Smoothing technique
        else:
            total_word_cond_prob *= (word_dict[category][word]/len(grouped_classes_dict[category]))
    class_probability = (len(grouped_classes_dict[category])/len(dataset))
    return total_word_cond_prob * class_probability

# get probability of the text assuming conditional independence(Multivariate naive bayes)
def get_naive_probability_multivariate(category,text,word_dict,grouped_classes_dict,word_counts_per_category,record_counts,dataset):
    if category not in word_dict.keys():
        return
    total_word_cond_prob = 1
    filterd_words = get_filtered_words(text)
    for word in filterd_words:
            word_occurence = get_word_count_in_category(word_counts_per_category, category, word)
            total_doucument_category = record_counts[category]
            total_word_cond_prob *= ((word_occurence+1)/((total_doucument_category+2))) # Smoothing
    class_probability = (len(grouped_classes_dict[category])/len(dataset))
    return total_word_cond_prob * class_probability


In [158]:
#get the predicted class given text belong to (Multinomial)
def get_predicted_class(categories,str_to_predict,word_frequency_dict,group_classes_dict,train_dataset):
    greatest_prob = 0
    predicted_category = ''
    for category in categories:
        total_word_cond_prob = 1
        prob_of_category_contains_headline =  get_naive_probability(category,str_to_predict,word_frequency_dict,group_classes_dict,train_dataset)
        if prob_of_category_contains_headline > greatest_prob:
            greatest_prob = prob_of_category_contains_headline
            predicted_category = category
        #print(f'''prob_of_category_contains_headline {category} is {prob_of_category_contains_headline}''')
    return predicted_category

#get the predicted class given text belong to (Multivariate)
def get_predicted_class_multivariate(categories,str_to_predict,word_frequency_dict,group_classes_dict,word_counts_per_category,record_counts,train_dataset):
    greatest_prob = 0
    predicted_category = ''
    for category in categories:
        total_word_cond_prob = 1
        prob_of_category_contains_headline =  get_naive_probability_multivariate(category,str_to_predict,word_frequency_dict,group_classes_dict,word_counts_per_category,record_counts,train_dataset)
        if prob_of_category_contains_headline > greatest_prob:
            greatest_prob = prob_of_category_contains_headline
            predicted_category = category
        #print(f'''prob_of_category_contains_headline {category} is {prob_of_category_contains_headline}''')
    return predicted_category

In [159]:
 #calculate the accuracy of the classifier over given dataset (Multinomial):
def get_accuracy(dataset,categories,word_frequency_dict,group_classes_dict):

    category_counts = {}
    for record in dataset:
        category = record['category']
        predicted_category = get_predicted_class(categories,record['headline'],word_frequency_dict,group_classes_dict,dataset)
        #print(f"Predicted Category: {predicted_category}")
        if predicted_category == record['category']:
            if category in category_counts:
               category_counts[category] += 1
            else:
              category_counts[category] = 1

    return category_counts

In [160]:
 #calculate the  accuracy of the classifier over given dataset (Multivariate) :
def get_accuracy_multivariate(dataset,categories,word_frequency_dict,group_classes_dict,word_counts_per_category,record_counts):

    category_counts = {}
    for record in dataset:
        category = record['category']
        predicted_category = get_predicted_class_multivariate(categories,record['headline'],word_frequency_dict,group_classes_dict,word_counts_per_category,record_counts,dataset)
        #print(f"Predicted Category: {predicted_category}")
        if predicted_category == record['category']:
            if category in category_counts:
               category_counts[category] += 1
            else:
              category_counts[category] = 1
    return category_counts

In [161]:
# Load the original dataset into dictionary and  data preprocessing.
dataset = []
categories = ["BUSINESS", "COMEDY", "SPORTS", "CRIME", "RELIGION", "HEALTHY LIVING", "POLITICS"]
stop_words = set(stopwords.words('english'))

with open('/content/news_category_dataset.json') as data:
    for line_num, file_text in enumerate(data, start=0):
        record = json.loads(file_text)

        if record['category'] in categories:
            processed_record = {"category": record['category'], "headline": record['headline'].lower()}
            dataset.append(processed_record)
no_of_records = len(dataset)
print(f""" Number of records in the dataset: {no_of_records}""")
print(f""" Number of different categories in the dataset: {len(categories)}""")


 Number of records in the dataset: 57274
 Number of different categories in the dataset: 7


In [162]:
#Divide the dataset into train and test data(test data set 15% and training dataset 85%)
train_dataset,test_dataset = train_test_split(dataset,test_size=0.15)

print(f""" Number of records in the training dataset: {len(train_dataset)}""")
print(f""" Number of records in the test dataset: {len(test_dataset)}""")
print()
#Group all records with their respective category
group_classes_dict = get_data_group_by_classes(train_dataset)
#get no of records in respective category
record_counts = get_record_counts_per_category(group_classes_dict)
#Get count of records containing particular word in respective category (for multivariate)
record_counts_category = get_words_counts_per_category(group_classes_dict)
#vocabulary
vocab_list = build_vocab_list(train_dataset)
#frequency of each word category wise (For Multinomial)
word_frequency_dict = build_doc_freq_dict(train_dataset,categories)

#In test dataset to get no of records in each category (To calculate category wise accuracy)
group_classes_dict_test = get_data_group_by_classes(test_dataset)
record_counts_test = get_record_counts_per_category(group_classes_dict_test)

#list containing category wise no of records  predicted correctly (Multinomial)
test_accuracy = get_accuracy(test_dataset,categories,word_frequency_dict,group_classes_dict)
#list containing category wise no of records  predicted correctly (Multivariate)
test_accuracy_multivariate = get_accuracy_multivariate(test_dataset,categories,word_frequency_dict,group_classes_dict,record_counts_category,record_counts)


overall_accuracy=0;
print(f''' Category Wise Accuracy in Multinomial Naive classifier-->''')
#calculating category wise accuracy and print it (Multinomial)
for category, count in record_counts_test.items():
  if category in test_accuracy_multivariate:
    print(f"Category: {category}, Accuracy: {round((test_accuracy_multivariate[category]/count)*100,2)}%")
    overall_accuracy+=round((test_accuracy_multivariate[category]/count)*100,2)*count
accuracy_multivariate = round(overall_accuracy/len(test_dataset),2)

print(f'''Overall Accuracy of the Multinomial Naive Baye's Classifier over test dataset is {accuracy_multivariate}% \n''')

overall_accuracy=0

print(f''' Category Wise Accuracy in Multivariate Naive classifier-->''')
#calculating category wise accuracy and print it (Multinomial)

for category, count in record_counts_test.items():
  if category in test_accuracy:
    print(f"Category: {category}, Accuracy: {round((test_accuracy[category]/count)*100,2)}%")
    overall_accuracy+=round((test_accuracy[category]/count)*100,2)*count
accuracy_multinomial = round(overall_accuracy/len(test_dataset),2)

print(f'''Overall Accuracy of the Multivariate Naive Baye's Classifier over test dataset is {accuracy_multinomial}%''')


 Number of records in the training dataset: 48682
 Number of records in the test dataset: 8592

 Category Wise Accuracy in Multinomial Naive classifier-->
Category: BUSINESS, Accuracy: 62.42%
Category: COMEDY, Accuracy: 73.15%
Category: SPORTS, Accuracy: 79.28%
Category: CRIME, Accuracy: 91.59%
Category: RELIGION, Accuracy: 72.92%
Category: HEALTHY LIVING, Accuracy: 63.19%
Category: POLITICS, Accuracy: 71.09%
Overall Accuracy of the Multinomial Naive Baye's Classifier over test dataset is 71.43% 

 Category Wise Accuracy in Multivariate Naive classifier-->
Category: BUSINESS, Accuracy: 28.83%
Category: COMEDY, Accuracy: 27.35%
Category: SPORTS, Accuracy: 33.06%
Category: CRIME, Accuracy: 32.3%
Category: RELIGION, Accuracy: 21.98%
Category: HEALTHY LIVING, Accuracy: 46.64%
Category: POLITICS, Accuracy: 89.97%
Overall Accuracy of the Multivariate Naive Baye's Classifier over test dataset is 66.02%
