In [79]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import copy
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
import string
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
import contractions
import random
from collections import Counter
from sklearn.metrics import accuracy_score, confusion_matrix


[nltk_data] Downloading package stopwords to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
class_names = ["comp.graphics", "sci.med", "talk.politics.misc", "rec.sport.hockey", "sci.space"]
class_labels = [0, 1, 2, 3, 4]
class_name_to_label = {class_names[i]:i for i in range(len(class_names))}
class_label_to_name = {i:class_names[i] for i in range(len(class_names))}
data_folder = "./data/20_newsgroups/20_newsgroups"

In [3]:
def splitData(class_names, data_folder, train_ratio):

    class_wise_data = []
    for i in range(len(class_names)):
        class_dir = data_folder + '/' + class_names[i]
        file_names = os.listdir(class_dir)
        n_docs_class = len(file_names)
        shuffled_docs = random.sample(file_names, n_docs_class)
        n_train = int(train_ratio * n_docs_class)
        n_test = n_docs_class - n_train
        train_docs_class = shuffled_docs[:n_train]
        test_docs_class = shuffled_docs[n_train:]
        class_wise_data.append({'train' : train_docs_class, 'test' : test_docs_class})
    return class_wise_data

In [4]:
# class_wise_data = splitData(class_names, data_folder, 0.8)

In [5]:
def check_alnum(tok):
    '''
        Remove non-alphanumeric characters from a string
    '''

    tok = ''.join(ch for ch in tok if ch.isalnum() == True)
    return tok

def remove_punct(tok):
    '''
        Remove the punctuation in token
    '''
    punctuations = string.punctuation
    tok = ''.join(ch for ch in tok if ch not in punctuations)
    return tok

def remove_blank_space(tok):
    '''
        Remove the spaces in token
    '''
    tok = ''.join(ch for ch in tok if ch != ' ')
    return tok
def preprocess(text):

    text = text.lower()

    text = contractions.fix(text)

    all_tokens = word_tokenize(text)

    all_tokens = [check_alnum(tok) for tok in all_tokens]

    stop_words = list(set(stopwords.words('english')))

    all_tokens = [tok for tok in all_tokens if tok not in stop_words]

    toks_no_punct = []
    for tok in all_tokens:
        ctok = remove_punct(tok)
        if(ctok != ""):
            toks_no_punct.append(ctok)

    cleaned_toks = []
    for tok in toks_no_punct:
        ctok = remove_blank_space(tok)
        if(ctok != ""):
            cleaned_toks.append(ctok)
    
    final_tokens = [tok for tok in cleaned_toks]

    return final_tokens


In [6]:
def process_data(class_wise_data, class_labels):
    class_wise_train_unique_tokens = {}
    class_wise_tokens = {i: {'train' : [], 'test': []} for i in range(5)}
    class_wise_train_tfs = {}
    for label in class_labels:
        print(f"--- Calculating for label = {label} ---")
        class_train_data = class_wise_data[label]['train']
        class_test_data = class_wise_data[label]['test']
        class_train_tokens = []
        print(">> reading through train files and preprocessing")
        for doc in tqdm(class_train_data):
            f = open(data_folder + '/' + class_label_to_name[label] + '/' + doc, encoding='utf-8', errors='ignore')
            ftxt_unproc = f.read()
            doc_toks = preprocess(ftxt_unproc)
            class_train_tokens.append(doc_toks)
        print("--- Done")
        class_wise_tokens[label]['train'] = class_train_tokens
        class_test_tokens = []
        print(">> reading through files and preprocessing")
        for doc in tqdm(class_test_data):
            f = open(data_folder + '/' + class_label_to_name[label] + '/' + doc, encoding='utf-8', errors='ignore')
            ftxt_unproc = f.read()
            doc_toks = preprocess(ftxt_unproc)
            class_test_tokens.append(doc_toks)
        print("--- Done")
        class_wise_tokens[label]['test'] = class_test_tokens

        # x = np.array(class_train_tokens[0])
        # print(x.shape)
        # all_class_train_tokens = list(np.array(class_train_tokens).flatten())
        print(">> Processing all tokens")
        all_class_train_tokens = []
        for doc_toks in class_train_tokens:
            for tok in doc_toks:
                all_class_train_tokens.append(tok)
        print("--- Done")
        # print(all_class_train_tokens.shape)
        print(">> Calculating class-wise TF")
        class_tfs = dict(Counter(all_class_train_tokens))
        class_wise_train_tfs[label] = class_tfs
        print("--- Done")
        class_wise_train_unique_tokens[label] = list(set(all_class_train_tokens))
        print("\n--------------------------\n")
    return class_wise_tokens, class_wise_train_unique_tokens, class_wise_train_tfs

def compute_icf(class_wise_unq_toks):
    all_class_toks = set()
    for label in class_labels:
        for tok in class_wise_unq_toks[label]:
            all_class_toks.add(tok)
        # all_class_toks.add(class_wise_unq_toks[label])
    term_icf = {}
    num_classes = len(class_labels)
    for tok in all_class_toks:
        present = 0
        for label in class_labels:
            if(tok in class_wise_unq_toks[label]):
                present += 1
        term_icf[tok] = np.log10(num_classes / present)
    return term_icf

In [7]:
# class_wise_tokens, class_wise_train_unique_tokens, class_wise_train_tfs = process_data(class_wise_data, class_labels)

--- Calculating for label = 0 ---
>> reading through train files and preprocessing


100%|██████████| 800/800 [00:05<00:00, 155.65it/s]


--- Done
>> reading through files and preprocessing


100%|██████████| 200/200 [00:01<00:00, 166.85it/s]


--- Done
>> Processing all tokens
--- Done
>> Calculating class-wise TF
--- Done

--------------------------

--- Calculating for label = 1 ---
>> reading through train files and preprocessing


100%|██████████| 800/800 [00:05<00:00, 148.96it/s]


--- Done
>> reading through files and preprocessing


100%|██████████| 200/200 [00:01<00:00, 133.46it/s]


--- Done
>> Processing all tokens
--- Done
>> Calculating class-wise TF
--- Done

--------------------------

--- Calculating for label = 2 ---
>> reading through train files and preprocessing


100%|██████████| 800/800 [00:06<00:00, 116.77it/s]


--- Done
>> reading through files and preprocessing


100%|██████████| 200/200 [00:01<00:00, 105.37it/s]


--- Done
>> Processing all tokens
--- Done
>> Calculating class-wise TF
--- Done

--------------------------

--- Calculating for label = 3 ---
>> reading through train files and preprocessing


100%|██████████| 800/800 [00:05<00:00, 157.81it/s]


--- Done
>> reading through files and preprocessing


100%|██████████| 200/200 [00:01<00:00, 144.25it/s]


--- Done
>> Processing all tokens
--- Done
>> Calculating class-wise TF
--- Done

--------------------------

--- Calculating for label = 4 ---
>> reading through train files and preprocessing


100%|██████████| 800/800 [00:05<00:00, 149.89it/s]


--- Done
>> reading through files and preprocessing


100%|██████████| 200/200 [00:01<00:00, 143.45it/s]

--- Done
>> Processing all tokens
--- Done
>> Calculating class-wise TF
--- Done

--------------------------






In [8]:
# term_icfs = compute_icf(class_wise_train_unique_tokens)

In [9]:
def feature_selection(term_icfs, class_wise_train_unique_tokens, class_wise_train_tfs, k):
    class_wise_top_k = {}
    total_feature_set = set()
    for label in class_labels:
        class_unique_toks = class_wise_train_unique_tokens[label]
        class_tfs = class_wise_train_tfs[label]
        tf_icf_score = {}
        for tok in class_unique_toks:
            tf_icf_tok = class_tfs[tok] * term_icfs[tok]
            tf_icf_score[tok] = tf_icf_tok
        sorted_tf_icf = dict(sorted(tf_icf_score.items(), key=lambda item: item[1], reverse=True))
        top_k_class_features = list(sorted_tf_icf.keys())[:k]
        class_wise_top_k[label] = top_k_class_features
        total_feature_set.update(top_k_class_features)
    return class_wise_top_k, total_feature_set
        

In [70]:
# class_wise_top_k, feature_set = feature_selection(term_icfs, class_wise_train_unique_tokens, class_wise_train_tfs, 1)

In [71]:
def featurize_data(class_wise_tokens, feature_set):
    train_x = []
    train_y = []
    test_x = []
    test_y = []

    for label in class_labels:
        
        class_train_doc = class_wise_tokens[label]['train']
        class_test_doc = class_wise_tokens[label]['test']

        for doc_toks in class_train_doc:
            doc_feature = []
            # print(len(doc_toks))
            doc_tfs = dict(Counter(doc_toks))
            # toks_num = 0
            # for k in doc_tfs.keys():
            #     toks_num += doc_tfs[k]
            # print(toks_num)
            # print(doc_tfs)
            # print("-----\n")
            for i, tok in enumerate(feature_set):
                if(tok in doc_tfs.keys()):
                    doc_feature.append(doc_tfs[tok])
                else:
                    doc_feature.append(0)
            train_x.append(doc_feature)
            train_y.append(label)
        
        for doc_toks in class_test_doc:
            doc_feature = []
            doc_tfs = dict(Counter(doc_toks))
            for i, tok in enumerate(feature_set):
                if(tok in doc_tfs.keys()):
                    doc_feature.append(doc_tfs[tok])
                else:
                    doc_feature.append(0)
            test_x.append(doc_feature)
            test_y.append(label)
    
    return train_x, test_x, train_y, test_y
        
        #featurizing class train data


In [72]:
# train_x, test_x, train_y, test_y = featurize_data(class_wise_tokens, feature_set)

In [78]:
def train_naive_bayes(train_x, train_y):
    prior_prob = {}
    conditional_prob = {label:{} for label in class_labels}
    class_feature_cum = {label:{feat:0 for feat in range(len(train_x[0]))} for label in class_labels}
    # print(conditional_prob)
    total_class_samples = len(train_y)
    num_features = len(train_x[0])

    class_wise_count = dict(Counter(train_y))
    for label in class_labels:
        prior_prob[label] = float(class_wise_count[label]) / float(total_class_samples)
    for i in range(total_class_samples):
        sample_label = train_y[i]
        for j in range(num_features):
            class_feature_cum[sample_label][j] += train_x[i][j]
    alpha = 1 #laplace add one smoothing
    for label in class_labels:
        for feature in range(num_features):
            conditional_prob[label][feature] = float(class_feature_cum[label][feature] + alpha ) / float(sum(class_feature_cum[label].values()) + (num_features*alpha))
    
    return prior_prob, conditional_prob

def predict_naive_bayes(test_x, prior_prob, conditional_prob):

    predictions = []
    for sample in test_x:
        posterior_probs = {}
        for label in class_labels:
            probab = np.log10(prior_prob[label])
            for feature in range(len(sample)):
                if(sample[feature] != 0):
                    probab += np.log10(conditional_prob[label][feature])
            posterior_probs[label] = probab
        pred_label = max(posterior_probs, key= lambda x: posterior_probs[x])
        predictions.append(pred_label)
    return predictions

def compute_accuracy(true_y, pred_y):
    correct = 0
    total = len(true_y)
    for i in range(total):
        if(true_y[i] == pred_y[i]):
            correct += 1
    accuracy = float(correct) / float(total)
    return accuracy

def calculate_confusion_matrix(true_y, pred_y):
    conf_matrix = np.zeros((len(class_labels), len(class_labels)))
    for i in range(len(true_y)):
        conf_matrix[true_y[i]][pred_y[i]] += 1
    return conf_matrix

In [74]:
# prior_prob, conditional_prob = train_naive_bayes(train_x, train_y)

In [75]:
# preds = predict_naive_bayes(test_x, prior_prob, conditional_prob)

In [76]:
# print(compute_accuracy(test_y, preds))

0.462


In [90]:
def run_question3():
    train_ratio = float(input("Enter train split ratio [b/w 0 and 1]: "))
    class_wise_data = splitData(class_names, data_folder, train_ratio)
    k = int(input("Enter the value of k : "))
    print(">> Processing Data")
    class_wise_tokens, class_wise_train_unique_tokens, class_wise_train_tfs = process_data(class_wise_data, class_labels)
    print(">> Computing term ICFs")
    term_icfs = compute_icf(class_wise_train_unique_tokens)
    print(">> Performing feature selection")
    class_wise_top_k, feature_set = feature_selection(term_icfs, class_wise_train_unique_tokens, class_wise_train_tfs, k)
    print(">> Featurizing dataset")
    train_x, test_x, train_y, test_y = featurize_data(class_wise_tokens, feature_set)
    print(">> Training Naive Bayes Model")
    prior_prob, conditional_prob = train_naive_bayes(train_x, train_y)
    print(">> Testing Naive Bayes Model")
    predicted_labels = predict_naive_bayes(test_x, prior_prob, conditional_prob)
    accuracy_value = compute_accuracy(test_y, predicted_labels)
    print(f"Accuracy = {accuracy_value * 100}%")
    print(f"SKLEARN ACC = {accuracy_score(test_y, predicted_labels)}")
    conf_matrix = calculate_confusion_matrix(test_y, predicted_labels)
    print(f"Confusion Matrix = {conf_matrix}")
    print(f"SKLEARN CONF MATRIX = {confusion_matrix(test_y, predicted_labels)}")


In [91]:
run_question3()

>> Processing Data
--- Calculating for label = 0 ---
>> reading through train files and preprocessing


100%|██████████| 700/700 [00:04<00:00, 149.73it/s]


--- Done
>> reading through files and preprocessing


100%|██████████| 300/300 [00:02<00:00, 141.22it/s]


--- Done
>> Processing all tokens
--- Done
>> Calculating class-wise TF
--- Done

--------------------------

--- Calculating for label = 1 ---
>> reading through train files and preprocessing


100%|██████████| 700/700 [00:05<00:00, 138.85it/s]


--- Done
>> reading through files and preprocessing


100%|██████████| 300/300 [00:02<00:00, 121.49it/s]


--- Done
>> Processing all tokens
--- Done
>> Calculating class-wise TF
--- Done

--------------------------

--- Calculating for label = 2 ---
>> reading through train files and preprocessing


100%|██████████| 700/700 [00:06<00:00, 101.56it/s]


--- Done
>> reading through files and preprocessing


100%|██████████| 300/300 [00:02<00:00, 105.34it/s]


--- Done
>> Processing all tokens
--- Done
>> Calculating class-wise TF
--- Done

--------------------------

--- Calculating for label = 3 ---
>> reading through train files and preprocessing


100%|██████████| 700/700 [00:04<00:00, 158.57it/s]


--- Done
>> reading through files and preprocessing


100%|██████████| 300/300 [00:02<00:00, 122.02it/s]


--- Done
>> Processing all tokens
--- Done
>> Calculating class-wise TF
--- Done

--------------------------

--- Calculating for label = 4 ---
>> reading through train files and preprocessing


100%|██████████| 700/700 [00:05<00:00, 136.81it/s]


--- Done
>> reading through files and preprocessing


100%|██████████| 300/300 [00:02<00:00, 146.57it/s]


--- Done
>> Processing all tokens
--- Done
>> Calculating class-wise TF
--- Done

--------------------------

>> Computing term ICFs
>> Performing feature selection
>> Featurizing dataset
>> Training Naive Bayes Model
>> Testing Naive Bayes Model
Accuracy = 99.6%
SKLEARN ACC = 0.996
Confusion Matrix = [[299.   1.   0.   0.   0.]
 [  2. 298.   0.   0.   0.]
 [  0.   0. 298.   0.   2.]
 [  0.   0.   0. 300.   0.]
 [  1.   0.   0.   0. 299.]]
SKLEARN CONF MATRIX = [[299   1   0   0   0]
 [  2 298   0   0   0]
 [  0   0 298   0   2]
 [  0   0   0 300   0]
 [  1   0   0   0 299]]
