In [1]:
from os import listdir
from os.path import isfile, join
import string

In [2]:
my_path = '20_newsgroups'
folders = [f for f in listdir(my_path) if f!='.DS_Store']
# list of folder names
folders

['talk.politics.mideast',
 'rec.autos',
 'comp.sys.mac.hardware',
 'alt.atheism',
 'rec.sport.baseball',
 'comp.os.ms-windows.misc',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.med',
 'talk.politics.misc',
 'rec.motorcycles',
 'comp.windows.x',
 'comp.graphics',
 'comp.sys.ibm.pc.hardware',
 'sci.electronics',
 'talk.politics.guns',
 'sci.space',
 'soc.religion.christian',
 'misc.forsale',
 'talk.religion.misc']

In [3]:
# 2D list to store list of all files in different folders
files = []
for folder_name in folders:
    if folder_name == '.DS_Store':
        continue
    folder_path = join(my_path, folder_name)
    files.append([f for f in listdir(folder_path)])

In [4]:
len(folders)

20

In [5]:
# list of pathnames of all the documents
pathname_list = []
for fo in range(len(folders)):
    for fi in files[fo]:
        pathname_list.append(join(my_path, join(folders[fo], fi)))

In [6]:
pathname_list[0:5]

['20_newsgroups/talk.politics.mideast/75895',
 '20_newsgroups/talk.politics.mideast/76248',
 '20_newsgroups/talk.politics.mideast/76277',
 '20_newsgroups/talk.politics.mideast/76045',
 '20_newsgroups/talk.politics.mideast/77197']

In [7]:
# array containing the classes each of the documents belong to
Y = []
for folder in folders:
    folder_path = join(my_path, folder)
    num_of_files = len(listdir(folder_path))
    for i in range(num_of_files):
        Y.append(folder)
len(Y)

19997

In [8]:
Y[0:10]

['talk.politics.mideast',
 'talk.politics.mideast',
 'talk.politics.mideast',
 'talk.politics.mideast',
 'talk.politics.mideast',
 'talk.politics.mideast',
 'talk.politics.mideast',
 'talk.politics.mideast',
 'talk.politics.mideast',
 'talk.politics.mideast']

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Splitting the data into train and test
x_train, x_test, y_train, y_test = train_test_split(pathname_list, Y, random_state=0, test_size=0.25)

In [11]:
# Listing common stopwords
import string
stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at',
 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 
 'can', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during',
 'each', 'few', 'for', 'from', 'further', 
 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's",
 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's",
 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself',
 "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself',
 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours' 'ourselves', 'out', 'over', 'own',
 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 
 'than', 'that',"that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", 
 "they'll", "they're", "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 
 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where',
 "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's",'will', 'with', "won't", 'would', "wouldn't", 
 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves', 
 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'hundred', 'thousand', '1st', '2nd', '3rd',
 '4th', '5th', '6th', '7th', '8th', '9th', '10th']

In [12]:
# Function for Pre Processing of the data
def pre_process_docs(words):
    table = str.maketrans('', '', '\t') # Removes unnecessary tabs
    words = [word.translate(table) for word in words]
    punctuations = (string.punctuation).replace("'", "") # Removal of words with ' punctuation
    trans_table = str.maketrans('', '', punctuations)
    stripped_words = [word.translate(trans_table) for word in words]
    words = [str for str in stripped_words if str] # Removal of blank words
    p_words = [] #some words are quoted in the documents & as we have not removed ' to maintain the integrity of some stopwords
    for word in words:
        if(word[0] and word[len(word) - 1] == "'"):
            word = word[1: len(word)-1]
        elif(word[0] == "'"):
            word = word[1: len(word)]
        else:
            word = word
        p_words.append(word)
    words = p_words.copy()
    words = [word for word in words if not word.isdigit()] # Removal of words which contains only digits
    words = [word for word in words if not len(word) == 1] # Removal of words with single character
    words = [str for str in stripped_words if str]  # Removal of blank words
    words = [word.lower() for word in words]
    words = [word for word in words if not len(word) < 3] # Removal of words with length less than 3
    words = [str for str in stripped_words if str] # Removal of blank words
    return words

In [13]:
# Function To Remove Stopwords
def remove_stopwords(words):
    words = [word for word in words if not word in stop_words]
    return words

In [14]:
# Function to convert sentence to arry of words
def tokenize_sentence(line):
    words = line[0:len(line)-1].strip().split(" ")
    words = pre_process_docs(words)
    words = remove_stopwords(words)
    return words

In [15]:
# Remove meta data
def remove_metadata(lines):
    start=0
    for i in range(len(lines)):
        if(lines[i] == '\n'):
            start = i+1
            break
    new_lines = lines[start:]
    return new_lines

In [16]:
#function to convert a document into list of words
def tokenize(path):
    f = open(path, 'rb')
    text_lines = [x.decode('latin-1').strip() for x in f.readlines()]
    text_lines = remove_metadata(text_lines) #removing the meta-data at the top of each document
    doc_words = []
    for line in text_lines: #traverse over all the lines and tokenize each one with the help of helper function: tokenize_sentence
        doc_words.append(tokenize_sentence(line))
    return doc_words

In [17]:
# Convert 2d array to 1d array
def flat(list):
    new_list = []
    for i in list:
        for j in i:
            new_list.append(j)
    return new_list

In [18]:
list_of_words = []
for document in x_train:
    list_of_words.append(flat(tokenize(document)))

In [19]:
len(list_of_words)

14997

In [20]:
len(flat(list_of_words))

2923305

In [21]:
import numpy as np
np_list_of_words = np.asarray(flat(list_of_words))

In [22]:
#finding the number of unique words that we have extracted from the documents
words, count = np.unique(np_list_of_words, return_counts=True)

In [23]:
len(words)

266080

In [24]:
#sorting the unique words according to their frequency
freq, words = (list(i) for i in zip(*(sorted(zip(count, words), reverse = True))))

In [25]:
freq_of_word = []
num_of_word = []
for f in sorted(np.unique(freq), reverse=True):
    freq_of_word.append(f)
    num_of_word.append(f)

In [26]:
#deciding the no. of words to use as feature
n = 5555
features = words[0:n]
print(features)



In [27]:
#creating a dictionary that contains each document's vocabulary and ocurence of each word of the vocabulary 
dictionary = {}
doc_num = 1
for doc_words in list_of_words:
    #print(doc_words)
    np_doc_words = np.asarray(doc_words)
    w, c = np.unique(np_doc_words, return_counts=True)
    dictionary[doc_num] = {}
    for i in range(len(w)):
        dictionary[doc_num][w[i]] = c[i]
    doc_num = doc_num + 1

In [28]:
x_train_doc = []
for k in dictionary.keys():
    row=[]
    for f in features:
        if(f in dictionary[k].keys()):
            #if word f is present in the dictionary of the document as a key, its value is copied
            #this gives us no. of occurences
            row.append(dictionary[k][f])
        else:
            row.append(0)
    x_train_doc.append(row)

In [29]:
#we convert the X and Y into np array for concatenation and conversion into dataframe
x_train_doc = np.asarray(x_train_doc)
y_train = np.asarray(y_train)

In [30]:
list_of_words_test = []

for document in x_test:
        list_of_words_test.append(flat(tokenize(document)))

dictionary_test = {}
doc_num = 1
for doc_words in list_of_words_test:
    #print(doc_words)
    np_doc_words = np.asarray(doc_words)
    w, c = np.unique(np_doc_words, return_counts=True)
    dictionary_test[doc_num] = {}
    for i in range(len(w)):
        dictionary_test[doc_num][w[i]] = c[i]
    doc_num = doc_num + 1

x_doc_test = []
for k in dictionary_test.keys():
    row = []
    for f in features:
        if(f in dictionary_test[k].keys()):
            #if word f is present in the dictionary of the document as a key, its value is copied
            #this gives us no. of occurences
            row.append(dictionary_test[k][f]) 
        else:
            #if not present, the no. of occurences is zero
            row.append(0)
    x_doc_test.append(row)

x_doc_test = np.asarray(x_doc_test)
y_test = np.asarray(y_test)

In [31]:
# Implementing inbuilt naive bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(x_train_doc, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
y_predict = clf.predict(x_doc_test)

In [33]:
clf.score(x_doc_test, y_test)

0.865

In [34]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test, y_predict))

                          precision    recall  f1-score   support

             alt.atheism       0.74      0.80      0.77       240
           comp.graphics       0.77      0.78      0.77       244
 comp.os.ms-windows.misc       0.85      0.88      0.86       240
comp.sys.ibm.pc.hardware       0.88      0.89      0.89       256
   comp.sys.mac.hardware       0.87      0.92      0.90       249
          comp.windows.x       0.93      0.85      0.89       233
            misc.forsale       0.75      0.92      0.83       259
               rec.autos       0.91      0.92      0.92       253
         rec.motorcycles       0.91      0.95      0.93       231
      rec.sport.baseball       0.95      0.95      0.95       236
        rec.sport.hockey       0.98      0.96      0.97       261
               sci.crypt       0.96      0.91      0.93       269
         sci.electronics       0.84      0.87      0.86       246
                 sci.med       0.96      0.87      0.92       284
         

In [35]:
y_predict_train = clf.predict(x_train_doc)

In [36]:
clf.score(x_train_doc, y_train)

0.9040474761618991

In [37]:
print(classification_report(y_train, y_predict_train))

                          precision    recall  f1-score   support

             alt.atheism       0.81      0.89      0.85       760
           comp.graphics       0.88      0.86      0.87       756
 comp.os.ms-windows.misc       0.89      0.92      0.91       760
comp.sys.ibm.pc.hardware       0.90      0.92      0.91       744
   comp.sys.mac.hardware       0.93      0.95      0.94       751
          comp.windows.x       0.96      0.91      0.93       767
            misc.forsale       0.77      0.94      0.85       741
               rec.autos       0.91      0.96      0.93       747
         rec.motorcycles       0.93      0.98      0.95       769
      rec.sport.baseball       0.98      0.97      0.97       764
        rec.sport.hockey       0.98      0.97      0.97       739
               sci.crypt       0.98      0.91      0.95       731
         sci.electronics       0.90      0.92      0.91       754
                 sci.med       0.98      0.90      0.93       716
         

### Self Implementations

In [38]:
#function to create a training dictionary out of the text files for training set, consisiting the frequency of
#words in our feature set (vocabulary) in each class or label of the 20 newsgroup
def fit(x_train, y_train):
    result = {}
    classes, counts = np.unique(y_train, return_counts=True)
    for i in range(len(classes)):
        curr_class = classes[i]
        result["Total_Data"] = len(y_train)
        result[curr_class] = {}
        x_train_curr = x_train[y_train == curr_class]
        num_of_features = len(x_train[0])
        for j in range(num_of_features):
            result[curr_class][features[j]] = x_train_curr[:, j].sum()
        result[curr_class]["Total_Count"] = counts[i]
    return result

In [39]:
#function for calculating naive bayesian log probablity for each test document being in a particular class

def log_probab(dictionary_train, x, curr_class):
    output = np.log(dictionary_train[curr_class]["Total_Count"]) - np.log(dictionary_train["Total_Data"])
    num_words = len(x)
    for j in range(num_words):
        if(x[j] in dictionary_train[curr_class].keys()):
            xj = x[j]
            count_curr_class_xj = dictionary_train[curr_class][xj] + 1
            count_curr_class = dictionary_train[curr_class]["Total_Count"] + len(dictionary_train[curr_class].keys())
            curr_xj_probab = np.log(count_curr_class_xj) - np.log(count_curr_class)
            output = output + curr_xj_probab
        else:
            continue
    return output

In [40]:
#helper function for the predict() function that predicts the class or label for one test document at a time

def predictSinglePoint(dictionary, x):
    classes = dictionary.keys()
    best_p = -1000
    best_class = -1
    first_run = True
    for current_class in classes:
        if current_class == "Total_Data":
            continue
        p_current_class = log_probab(dictionary, x, current_class)
        if (first_run or p_current_class > best_p):
            best_p = p_current_class
            best_class = current_class
        first_run = False
    return best_class

In [41]:
#predict function that predicts the class or label of test documents using train dictionary made using the fit() function
def predict(dictionary, X_test):
    Y_pred = []
    for x in X_test:
        x_class = predictSinglePoint(dictionary, x)
        Y_pred.append(x_class)
    return Y_pred

In [42]:
train_dict = fit(x_train_doc, y_train)

In [43]:
x_test = []
for key in dictionary_test.keys():
    x_test.append(list(dictionary_test[key].keys()))

In [44]:
my_pred = predict(train_dict, x_test)

In [45]:
my_pred = np.asarray(my_pred)
accuracy_score(y_test, my_pred)

0.6958

In [46]:
print(classification_report(y_test, my_pred))

                          precision    recall  f1-score   support

             alt.atheism       0.76      0.70      0.73       240
           comp.graphics       0.66      0.80      0.72       244
 comp.os.ms-windows.misc       0.93      0.57      0.70       240
comp.sys.ibm.pc.hardware       0.90      0.67      0.77       256
   comp.sys.mac.hardware       0.95      0.67      0.78       249
          comp.windows.x       0.73      0.89      0.80       233
            misc.forsale       0.96      0.53      0.68       259
               rec.autos       0.95      0.55      0.70       253
         rec.motorcycles       1.00      0.38      0.55       231
      rec.sport.baseball       0.99      0.79      0.88       236
        rec.sport.hockey       0.99      0.86      0.92       261
               sci.crypt       0.74      0.90      0.81       269
         sci.electronics       0.95      0.47      0.63       246
                 sci.med       0.96      0.63      0.76       284
         