In [1]:
#Routine stuff
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
#Importing the training dataset into dataframe
df = pd.read_csv("train.csv",delimiter=",")
df = df.drop('Unnamed: 0',axis=1)
df.CLASSIFICATION = pd.Categorical(df.CLASSIFICATION)
df['CODE'] = df.CLASSIFICATION.cat.codes #Converting classes (names) to codes
df = df.drop('CLASSIFICATION',axis=1)
df.head()

Unnamed: 0,apr,gmt,1993,writes,references,article,sender,people,university,1,...,palestine,spread,purchased,clark,remains,sad,kenneth,propulsion,officer,CODE
0,1,2,2,0,0,0,0,6,3,1,...,0,0,0,0,0,0,0,0,0,0
1,1,2,3,0,0,6,0,21,0,0,...,0,3,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,2,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,1,1,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,1,1,1,1,0,0,2,...,0,0,0,0,0,0,0,0,0,0


In [3]:
data = df.values #training dataframe to numpy array
data

array([[ 1,  2,  2, ...,  0,  0,  0],
       [ 1,  2,  3, ...,  0,  0,  0],
       [ 1,  1,  1, ...,  0,  0,  0],
       ..., 
       [ 1,  1,  1, ...,  0,  0, 19],
       [ 1,  1,  1, ...,  0,  0, 19],
       [ 1,  1,  0, ...,  0,  0, 19]], dtype=int64)

In [4]:
y_train = data[:,-1] #separating ouput
y_train

array([ 0,  0,  0, ..., 19, 19, 19], dtype=int64)

In [5]:
x_train = data[:,:-1] #separating input
x_train.shape

(15997, 2000)

In [6]:
#FUNCTION TO FIT THE TRAINING DATA INTO THE MODEL & RETURN RESULTANT DICTIONARY
def fit (x_train, y_train):
    result = {}
    class_values = set(y_train) #set of distinct class values
    for current_class in class_values: #iterate over each class among all classes
        result[current_class] = {}
        current_class_rows = (y_train == current_class) #all row numbers of the current class considered
        x_train_current = x_train[current_class_rows] #all rows of the current class considered (using row numbers)
        total_count = 0 #count of total words appeared
        num_words = x_train.shape[1]
        for j in range(1, num_words+1): #iterate over all words in the vocab
            result[current_class][j] = x_train_current[:,j-1].sum() #count of number of times word j appears in current class of docs
            total_count += result[current_class][j]
        result[current_class]["total_count"] = total_count
    return result

In [7]:
#FUNCTION TO RETURN PROBABLITY OF THE DOCUMENT BELONGING TO THE CURRENT_CLASS OF DOCUMENTS SELECTED
def probability(dictionary, x, current_class):
    output = 1
    num_words = len(dictionary[current_class].keys())-1 #subtracting 1 for "total_count" attribute
    for j in range(1, num_words+1): #iterate over all words in the vocab
        if(x[j-1]!=0): #multiply (add in log) the probability of the current word when the word is present in the document
            count_current_class_current_word = dictionary[current_class][j] + 1
            count_current_class = dictionary[current_class]["total_count"] + num_words
            current_probability = np.log(count_current_class_current_word) - np.log(count_current_class)
            output = output + current_probability
    #print(output)
    return output

In [8]:
#FUNCTION TO PREDICT THE CLASS OF THE CURRENT DOCUMENT SELECTED
def predictSinglePoint(dictionary, x):
    classes = dictionary.keys()
    first_run = True
    for current_class in classes:
        if (current_class == "total_data"):
            continue
        p_current_class = probability(dictionary, x, current_class) #probability of belonging to the current_class of documents
        if (first_run or p_current_class > best_p):
            #assign/update best_class if running for the 1st time or if best probability yet
            best_p = p_current_class
            best_class = current_class
            first_run = False
    return best_class

In [9]:
#FUNCTION THAT PREDICTS THE CLASS OF THE TEST DATASET
def predict(dictionary, x_test):
    y_pred = []
    for x in x_test:
        x_class = predictSinglePoint(dictionary, x)
        y_pred.append(x_class)
    return y_pred

In [10]:
#importing test dataset just like done for train dataset
df_test = pd.read_csv("test.csv",delimiter=",")
df_test = df_test.drop('Unnamed: 0',axis=1)
df_test.CLASSIFICATION = pd.Categorical(df_test.CLASSIFICATION)
df_test['CODE'] = df_test.CLASSIFICATION.cat.codes
df_test = df_test.drop('CLASSIFICATION',axis=1)
data_test = df_test.values
y_test = data_test[:,-1]
x_test = data_test[:,:-1]

In [11]:
x_test.shape

(4000, 2000)

In [12]:
x_train.shape

(15997, 2000)

In [13]:
dictionary = fit(x_train,y_train)
dictionary

{0: {1: 856,
  2: 733,
  3: 606,
  4: 997,
  5: 768,
  6: 796,
  7: 361,
  8: 824,
  9: 341,
  10: 125,
  11: 381,
  12: 272,
  13: 112,
  14: 200,
  15: 200,
  16: 122,
  17: 23,
  18: 130,
  19: 61,
  20: 909,
  21: 172,
  22: 117,
  23: 95,
  24: 51,
  25: 207,
  26: 135,
  27: 147,
  28: 42,
  29: 217,
  30: 30,
  31: 60,
  32: 239,
  33: 0,
  34: 68,
  35: 39,
  36: 22,
  37: 98,
  38: 22,
  39: 53,
  40: 10,
  41: 9,
  42: 291,
  43: 11,
  44: 88,
  45: 149,
  46: 117,
  47: 69,
  48: 50,
  49: 22,
  50: 104,
  51: 278,
  52: 85,
  53: 104,
  54: 89,
  55: 119,
  56: 30,
  57: 62,
  58: 23,
  59: 24,
  60: 18,
  61: 31,
  62: 60,
  63: 35,
  64: 98,
  65: 274,
  66: 162,
  67: 52,
  68: 29,
  69: 195,
  70: 9,
  71: 80,
  72: 2,
  73: 35,
  74: 208,
  75: 107,
  76: 55,
  77: 198,
  78: 14,
  79: 123,
  80: 69,
  81: 33,
  82: 47,
  83: 21,
  84: 43,
  85: 130,
  86: 2,
  87: 59,
  88: 41,
  89: 17,
  90: 25,
  91: 7,
  92: 342,
  93: 11,
  94: 15,
  95: 155,
  96: 49,
  97: 171,

In [14]:
y_train_pred = predict(dictionary, x_train)
print(classification_report(y_train,y_train_pred))
print(confusion_matrix(y_train,y_train_pred))

             precision    recall  f1-score   support

          0       0.82      0.88      0.85       800
          1       0.88      0.87      0.88       800
          2       0.88      0.93      0.90       800
          3       0.93      0.89      0.91       800
          4       0.93      0.95      0.94       800
          5       0.96      0.91      0.94       800
          6       0.85      0.93      0.89       800
          7       0.91      0.96      0.93       800
          8       0.93      0.98      0.95       800
          9       0.94      0.98      0.96       800
         10       0.99      0.94      0.96       800
         11       0.98      0.92      0.95       800
         12       0.89      0.93      0.91       800
         13       0.95      0.92      0.94       800
         14       0.94      0.94      0.94       800
         15       0.98      1.00      0.99       797
         16       0.81      0.92      0.86       800
         17       0.95      0.89      0.92   

In [15]:
y_pred = predict(dictionary, x_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.71      0.72      0.72       200
          1       0.75      0.78      0.76       200
          2       0.77      0.82      0.80       200
          3       0.92      0.78      0.85       200
          4       0.85      0.91      0.88       200
          5       0.92      0.77      0.84       200
          6       0.81      0.91      0.86       200
          7       0.85      0.94      0.89       200
          8       0.88      0.96      0.92       200
          9       0.93      0.96      0.95       200
         10       0.96      0.90      0.93       200
         11       0.93      0.82      0.88       200
         12       0.71      0.80      0.75       200
         13       0.88      0.85      0.86       200
         14       0.89      0.89      0.89       200
         15       0.94      0.99      0.97       200
         16       0.72      0.85      0.78       200
         17       0.93      0.81      0.86   