In [1]:
import os
from collections import Counter
from os import listdir

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Fetch the NLTK resources this notebook asks for, in a fixed order.
for nltk_package in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_package)

# English stop-word list consulted by preprocessing().
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
def preprocessing(text_file):
    """Turn the raw text of one document into a list of normalised tokens.

    Steps, in order: split on runs of word characters (which drops
    punctuation), lower-case, discard English stop words, Porter-stem, then
    WordNet-lemmatize the stem.

    Parameters
    ----------
    text_file : str
        Raw text of a single document.

    Returns
    -------
    list of str
        Processed tokens in document order; empty when nothing survives.
    """
    # A set gives O(1) membership tests; the module-level `stop_words` list
    # would otherwise be scanned once per token.
    stop_set = set(stop_words)
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    tokens = []
    for raw_token in tokenizer.tokenize(text_file):
        word = raw_token.lower()
        if word in stop_set:
            continue
        # The lemmatizer is applied to the stemmed form, not the surface word.
        tokens.append(lemmatizer.lemmatize(stemmer.stem(word)))
    return tokens

# get a list of all files in the directory
def getFilesList(folder):
    """Map each class sub-folder of `folder` to the paths of its entries.

    Parameters
    ----------
    folder : str
        Root directory holding one sub-directory per class. A trailing path
        separator is optional.

    Returns
    -------
    dict[str, list[str]]
        Sub-directory name -> list of paths (root / sub-directory / entry).
        Entries of `folder` that are not directories are skipped.
    """
    files = {}
    for label in listdir(folder):
        label_dir = os.path.join(folder, label)
        if not os.path.isdir(label_dir):
            # A stray file next to the class folders is not a class; listing
            # it as a directory would raise.
            continue
        files[label] = [os.path.join(label_dir, name) for name in listdir(label_dir)]
    return files

def files_read_preprocess(folderpath):
    """Read and tokenise every file under `folderpath` and cache the result.

    Files are discovered with getFilesList() and tokenised with
    preprocessing(). The number of documents and the number of labels are
    printed, and the resulting frame is pickled to "q3_preprocessed" in the
    current working directory.

    Parameters
    ----------
    folderpath : str
        Root directory with one sub-directory per class.

    Returns
    -------
    pandas.DataFrame
        One row per document: column 0 is the token list, column 1 the label
        (the name of the sub-directory the file came from).
    """
    flist = getFilesList(folderpath)
    alltextfromfiles = []
    textlabels = []
    for folder, paths in flist.items():
        for file in paths:
            # The context manager closes each handle as soon as it is read.
            # ISO-8859-1 maps every byte to a character, so decoding cannot fail.
            with open(file, 'r', encoding='ISO-8859-1') as handle:
                text = handle.read()
            alltextfromfiles.append(preprocessing(text))
            textlabels.append(folder)    # label of the document just added

    print(len(alltextfromfiles))
    print(len(textlabels))

    dataset = pd.DataFrame([alltextfromfiles,textlabels]).T
    dataset.to_pickle("q3_preprocessed")
    return dataset

In [3]:
def tficf_feature_select(x_train,top_k):
    """Rank the training tokens by TF-ICF and return the `top_k` best.

    For each distinct token:
      * TF  = occurrences of the token / total number of training tokens
      * ICF = log(len(uniq_labels) / number of classes containing the token)
    and the score is TF * ICF.

    Reads the module-level `class_dict` (label -> list of tokens) and
    `uniq_labels`.

    Parameters
    ----------
    x_train : object
        Not used; kept so existing calls keep working.
    top_k : int
        Maximum number of tokens to return.

    Returns
    -------
    list of str
        Tokens in descending score order. Tokens with equal scores keep the
        order in which they first occur while iterating `class_dict`, so the
        result is reproducible from run to run.
    """
    token_counts = Counter()
    classes_with_token = {}
    for label, class_tokens in class_dict.items():
        token_counts.update(class_tokens)
        for token in set(class_tokens):
            classes_with_token.setdefault(token, set()).add(label)

    total_tokens = sum(token_counts.values())
    n_classes = len(uniq_labels)

    tf_icf = {}
    # Every counted token was registered in `classes_with_token` above, so the
    # lookup cannot miss and needs no exception handling.
    for token, count in token_counts.items():
        tf = count / total_tokens
        icf = np.log(n_classes / len(classes_with_token[token]))
        tf_icf[token] = icf * tf

    # sorted() is stable, so ties keep the insertion order of `tf_icf`.
    ranked = sorted(tf_icf.items(), key=lambda item: item[1], reverse=True)[:top_k]
    return [token for token, _ in ranked]

def getCountFreqTopKFeatures(feature_list,labels):
    """Collect per-class token counts from the module-level `class_dict`.

    Parameters
    ----------
    feature_list : list
        Not consulted: counts cover every token of each class, not only the
        listed ones.
        NOTE(review): the function name suggests counts were meant to be
        restricted to these features - confirm the intended behaviour.
    labels : iterable
        Class labels to collect counts for.

    Returns
    -------
    class_frequency : dict
        (label, token) -> number of occurrences of `token` in that class.
    class_count : dict
        label -> number of *distinct* tokens in that class.
        NOTE(review): multinomial Naive Bayes usually normalises by the total
        token count of the class; confirm the distinct count is intended.
    """
    class_frequency = {}
    class_count = {}
    for label in labels:
        # A label without training tokens contributes zero counts rather than
        # raising KeyError.
        token_counts = Counter(class_dict.get(label, ()))
        class_count[label] = len(token_counts)
        for token, occurrences in token_counts.items():
            class_frequency[label, token] = occurrences
    return class_frequency,class_count

def Naive_Bayes(class_frequency,class_count,x_test,y_test):
    """Predict a label for every test document with add-one smoothed scores.

    For each document and each label in the module-level `uniq_labels`, the
    score is the sum over the document's tokens of

        log((class_frequency[label, token] + 1)
            / (class_count[label] + unique_words_count))

    where a missing (label, token) pair counts as 0 and `unique_words_count`
    is read from module scope. The label with the highest score wins; on a
    tie the earliest label in `uniq_labels` is chosen.

    Parameters
    ----------
    class_frequency : dict
        (label, token) -> occurrence count.
    class_count : dict
        label -> count used in the smoothing denominator.
    x_test : pandas.DataFrame
        Column 0 holds one token list per document; rows are addressed by
        the labels 0..n-1.
    y_test : indexable
        True label of document i at key i.

    Returns
    -------
    tuple of (list, list)
        `actual` and `predicted` labels, aligned by document.
    """
    actual = []
    predicted = []

    for i in range(x_test.shape[0]):
        actual.append(y_test[i])
        doc_tokens = x_test[0][i]

        class_scores = []
        for label in uniq_labels:
            # The denominator depends only on the label, so compute it once
            # per label rather than once per token.
            denominator = class_count[label] + unique_words_count
            log_prob = 0
            for word in doc_tokens:
                # Tokens never seen with this label count as zero occurrences.
                occurrences = class_frequency.get((label, word), 0)
                log_prob += np.log((occurrences + 1) / denominator)
            class_scores.append(log_prob)

        predicted.append(uniq_labels[np.argmax(class_scores)])

    return actual,predicted

def Naive_Bayes2(class_frequency,class_count,x_test,y_test):
    """Placeholder for a second Naive Bayes variant; not implemented yet.

    A function needs a body for the cell to parse, so until the variant is
    written any call fails loudly instead of silently returning None.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("Naive_Bayes2 is not implemented")

In [4]:
folderpath = "20_newsgroups/"
# files_read_preprocess(folderpath)    # uncomment to (re)build the cache from the raw files
# NOTE: unpickling can execute code stored in the file; only load a cache that
# was produced locally.
dataset = pd.read_pickle("q3_preprocessed")

y = dataset.pop(1)    # column 1: labels (removed from `dataset` in place)
X = dataset           # what remains: column 0, the token lists

# Take the label set from the data itself, so the cached pickle is enough to
# run this cell and stray entries in the raw folder cannot become classes.
uniq_labels = sorted(y.unique())

splits = [0.2,0.3,0.5]
features_top_k = 100

for splitsize in splits:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=splitsize, random_state=42)
    # The helpers address rows by 0..n-1, so drop the shuffled original index.
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # label -> all training tokens of that class, concatenated in row order
    class_dict = {}
    for doc_tokens, label in zip(X_train[0], y_train):
        class_dict.setdefault(label, []).extend(doc_tokens)

    # Training vocabulary size, used by Naive_Bayes for smoothing.
    unique_words = set().union(*class_dict.values())
    unique_words_count = len(unique_words)

    sorted_feature_list = tficf_feature_select(X_train,features_top_k)
    class_freq,class_count = getCountFreqTopKFeatures(sorted_feature_list,uniq_labels)
    actual,predicted = Naive_Bayes(class_freq,class_count,X_test,y_test)

    # round() rather than int(): products such as 0.29*100 fall just below the
    # intended integer and would be truncated to the wrong percentage.
    train_pct = round((1-splitsize)*100)
    test_pct = round(splitsize*100)
    print("Train:Test =",str(train_pct)+":"+str(test_pct))
    acc = accuracy_score(actual,predicted)
    print("Accuracy =",acc)
    confusion = confusion_matrix(actual,predicted)
    print("Confusion Matrix")
    print(confusion)
    print()

Train:Test = 80:20
Accuracy = 0.971
Confusion Matrix
[[207   1   1   0   1]
 [  0 215   0   0   1]
 [  5   1 175   4   7]
 [  2   1   1 185   1]
 [  1   1   0   1 189]]

Train:Test = 70:30
Accuracy = 0.9713333333333334
Confusion Matrix
[[295   1   2   1   2]
 [  1 312   0   0   3]
 [  5   1 269   4  11]
 [  3   1   1 300   4]
 [  1   1   0   1 281]]

Train:Test = 50:50
Accuracy = 0.9744
Confusion Matrix
[[475   1   0   1   1]
 [  2 501   1   0   2]
 [ 13   2 462   5  10]
 [ 10   3   1 494   8]
 [  1   1   0   2 504]]



<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=5be6439c-c857-493e-9eef-529c4c49e2bf' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>