## 8. Text Classification Using Multinomial Naive Bayes

In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords

### (c) Calculate Pi, the fraction of documents that belong to each class j

In [2]:
# define function to calculate pi
def fraction_doc_classj(labels):
    """
    Find the class probabilities pi for each class j
    """
    
    # define dictionary for pi_j where j is each class 1,2,..., 20
    pi = {}
    No_Classes = 20
    for j in range(1,No_Classes+1):
        pi[j] = 0

    # count occurrences for a dataframe type variable
    if isinstance(labels, pd.core.series.Series):
        lines = list(labels)
    else:
        lines = labels.readlines()
    
    # count the occurrence of each class j
    for line in lines:
        if isinstance(labels, pd.core.series.Series):
            j_val = line
        else:        
            j_val = int(line.split()[0])
        pi[j_val] += 1

    # divide each class count for the total number of documents
    for j in pi.keys():
        pi[j] /= len(lines)
    return pi

In [3]:
# read labels for train data
train_label = open('/Users/gio/Documents/DSE/2019-rgm001/DSE210/Lecture4/20news-bydate/matlab/train.label', 'r')

pi = fraction_doc_classj(train_label)
print(pi)

{1: 0.04259472890229834, 2: 0.05155736977549028, 3: 0.05075871860857219, 4: 0.05208980388676901, 5: 0.051024935664211554, 6: 0.052533498979501284, 7: 0.051646108794036735, 8: 0.052533498979501284, 9: 0.052888455053687104, 10: 0.0527109770165942, 11: 0.05306593309078002, 12: 0.0527109770165942, 13: 0.05244475996095483, 14: 0.0527109770165942, 15: 0.052622237998047744, 16: 0.05315467210932647, 17: 0.04836276510781791, 18: 0.05004880646020055, 19: 0.04117490460555506, 20: 0.033365870973467035}


### (c) Create dataframe for training data and training labels

In [4]:
# create function to merge both
def merge_data_labels_df(data, labels):
    """
    Create dataframe by increasing size of labels to match data
    """
    # get array for documents and class indexes
    docIdx = train_data['docIdx']
    classIdx = train_labels['classIdx']

    # match data and label size by increasing label length
    new_train_labels = []
    i = 0
    for idx in range(len(docIdx)-1):
        new_train_labels.append(classIdx[i])
        if docIdx[idx] != docIdx[idx+1]:
            i += 1
    new_train_labels.append(classIdx[i])

    # create dataframe with both train and label
    df = train_data
    df['classIdx'] = new_train_labels
    
    return df

In [5]:
# read train data and labels
train_data = pd.read_csv('/Users/gio/Documents/DSE/2019-rgm001/DSE210/Lecture4/20news-bydate/matlab/train.data', 
                         delimiter=' ', names=['docIdx', 'wordIdx', 'count'])
train_labels = pd.read_csv('/Users/gio/Documents/DSE/2019-rgm001/DSE210/Lecture4/20news-bydate/matlab/train.label', 
                           names=['classIdx'])

df = merge_data_labels_df(train_data,train_labels)

df.head()

Unnamed: 0,docIdx,wordIdx,count,classIdx
0,1,1,4,1
1,1,2,2,1
2,1,3,10,1
3,1,4,4,1
4,1,5,2,1


### (c) Use Laplace Smoothing to calculate the probability of each word per class, Pjw

In [6]:
def Prob_word_per_class(df, remove_stopwords, stopwordsIdx):
    """
    Find Pjw and apply Laplace smoothing with alpha parameter
    """
    
    # remove stopwords if wanted
    if remove_stopwords:
        df = df[~df.wordIdx.isin(idx)]
    
    # parameters
    alpha = 0.001
    No_class = 20
    
    # calculate the fraction of the concatenated doccuments occupied by w
    word_wj = df.groupby(['classIdx','wordIdx'])
    word_j = df.groupby(['classIdx'])
    Pr_jw =  (word_wj['count'].sum() + alpha) / (word_j['count'].sum() + 61188 + 1)    
    Pr_jw = Pr_jw.unstack()
                
    # replace missing values with the constant alpha/(count+|V|+1)
    # where 'count' is how often w occurs and |V| is the size of the vocabulary
    for j in range(1,No_class+1):
        Pr_jw.loc[j,:] = Pr_jw.loc[j,:].fillna(alpha/(word_j['count'].sum()[j] + 61188 + 1))

    return Pr_jw.to_dict()

In [7]:
Pr_jw = Prob_word_per_class(df, remove_stopwords=False, stopwordsIdx = [])
len(Pr_jw)

53975

### (d) Routine with Naive Bayes to Classify a New Document

In [8]:
def Multinomial_NB(df, pi, Pr_jw, log_replacement = False):
    """
    Model Function for Multinomial Naive Bayes
    
    Inputs:
    - df: columns = ['docIdx', 'wordIdx', 'count']
    
    Model Equation (in Log scale):
    armax_j  log(pi_j) + Sigma Sum (from w = 1 to |V|) X_i * log (Pr_jw)
    
    where,
    - pi_j = fraction of documents that belong to that class j
    - Pr_jw = fraction of each word for a given class
    - j = class
    - w = each word from vocabulary
    - |V| = lenght of vocabulary
    """
    
    # convert dataframe to dict with format for faster speed
    df_dict = df.to_dict()
    data_dict = {}
    for df_row in df_dict['docIdx'].keys():
        doc_index = df_dict['docIdx'][df_row]
        word_index = df_dict['wordIdx'][df_row]
    
        try: 
            data_dict[doc_index][word_index] = df_dict['count'][df_row]
        except:
            data_dict[df_dict['docIdx'][df_row]] = {}
            data_dict[doc_index][word_index] = df_dict['count'][df_row]

    # use equation to find the score and take arg max as the prediction
    predicted = []
    for doc in data_dict.keys(): # loop over every document
        score_dict = {}     
        No_class = 20
        for class_ in range(1,No_class+1): # calculate a score for each class
            score_dict[class_] = 1 # initialize score value
            for word in data_dict[doc]: # loop over every word in the document
               
                # calculate right-hand side (Sigma sum) of the model equation
                try:
                    # calculate with log replacement log(1+f)
                    if log_replacement:
                        score_dict[class_] += np.log(1+data_dict[doc][word]) * np.log(Pr_jw[word][class_])
                    else: # else do just f
                        score_dict[class_] += data_dict[doc][word] * np.log(Pr_jw[word][class_])
                except:
                    # missing words in the vocabulary yield a zero score, X_i = 0
                    score_dict[class_] += 0     
                    
            # add left-hand side (log pi) of the model equation 
            score_dict[class_] +=  np.log(pi[class_])                          

        # get class with max probability in each document
        max_score = max(score_dict, key=score_dict.get)
        predicted.append(max_score)
        
    return predicted

### (e) Evaluate the performance of the model with test data

In [9]:
def predict_error(prediction, labels):
    """
    Find predicted error rate
    """
    correct = 0
    for i,j in zip(prediction, labels):
        if i == j:
            correct +=1
    
    perc_error = 100*(1-(correct/len(labels)))
    
    return round(perc_error,6)

In [10]:
test_data = pd.read_csv('/Users/gio/Documents/DSE/2019-rgm001/DSE210/Lecture4/20news-bydate/matlab/test.data', 
                        delimiter=' ', names=['docIdx', 'wordIdx', 'count'])
test_labels = pd.read_csv('/Users/gio/Documents/DSE/2019-rgm001/DSE210/Lecture4/20news-bydate/matlab/test.label', 
                          names=['classIdx'])

test_labels = test_labels.classIdx.tolist()

predict = Multinomial_NB(test_data, pi, Pr_jw, log_replacement = False)

In [11]:
predict_error(predict, test_labels)

22.238508

### (f) split data into smaller training set and validation

In [12]:
train_data, val_data, train_labels, val_labels = train_test_split(df[['docIdx','wordIdx','count']], 
                                                                  df['classIdx'], test_size=0.2, random_state=1)

In [13]:
# separate validation set labels to match predicted size
new_val_labels = pd.DataFrame({'docIdx': val_data['docIdx'], 'classIdx':val_labels})
new_val_labels.drop_duplicates(keep = 'first', inplace = True)
new_val_labels = new_val_labels['classIdx']

### (f) implement strategies to improve earlier model
a) Replacing the frequency f of a word in a document by log(1 + f) <br>
b) Removing Stopwords

#### Different Models to evaluate with validation data
    1) frequency f
    2) frequency f and removing stop words
    3) frequency log(1+f)
    4) frequency log(1+f) and removing stop words

In [14]:
# import stopwords list from nltk libraries
# nltk.download()
stopwords_list = stopwords.words('english')

print(stopwords_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [15]:
# function to get the stop word indexes from vocabulary
def get_stopwords_idx(stopwords_list):
    """
    Use vocabulary .txt file from 20 Newsgroups data set
    """
    
    # read current vocabulary
    V = pd.read_csv('/Users/gio/Documents/DSE/2019-rgm001/DSE210/Lecture4/vocabulary.txt', names=['word'])
    
    # find stopwords index from the vocabulary
    stopwords_idx = V[V.word.isin(stopwords_list)].index

    return stopwords_idx

In [16]:
# Train the model
# define pi
pi = fraction_doc_classj(train_labels)

# define dataframe for train data
df_train = train_data
df_train['classIdx'] = train_labels 

# define probability of each word per class
Pr_jw = Prob_word_per_class(df_train, remove_stopwords=False, stopwordsIdx = [])
idx = get_stopwords_idx(stopwords_list)
Pr_jw_wo_stopwords = Prob_word_per_class(df_train, remove_stopwords=True, stopwordsIdx = idx)

In [17]:
# use validation dataset on all 4 models listed above

predict_1 = Multinomial_NB(val_data, pi, Pr_jw, log_replacement = False)
predict_2 = Multinomial_NB(val_data, pi, Pr_jw_wo_stopwords, log_replacement = False)
predict_3 = Multinomial_NB(val_data, pi, Pr_jw, log_replacement = True)
predict_4 = Multinomial_NB(val_data, pi, Pr_jw_wo_stopwords, log_replacement = True)

In [18]:
# print the error for each of the 4 model options above
models = ['frequency f', 'frequency f and removing stop words', 'frequency log(1+f)', 
          'frequency log(1+f) and removing stop words']
predictions = [predict_1, predict_2, predict_3, predict_4]

i = 1
for m, p in zip(models, predictions):
    error = predict_error(p, list(new_val_labels))
    print(str(i) + ') ' + str(m) + '\nError Rate = ' + str(error) + '\n')
    i += 1

1) frequency f
Error Rate = 28.425358

2) frequency f and removing stop words
Error Rate = 28.318663

3) frequency log(1+f)
Error Rate = 28.051925

4) frequency log(1+f) and removing stop words
Error Rate = 28.185294



### (f) Evaluate the final model on the test data

In [19]:
# use model #3 above "frequenct log(1+f)" as it gave the best error rate with the validation set
final_predict = Multinomial_NB(test_data, pi, Pr_jw, log_replacement = True)

In [20]:
# calculate error for final model
predict_error(final_predict, test_labels)

22.918055