<a href="https://colab.research.google.com/github/samibahig/IFT6390/blob/main/kaggleBernoulli.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import numpy as np
import scipy.sparse as sp 
import pandas as pd 

import gc
import re, unicodedata
import string
import tqdm
from collections import defaultdict

# Load the training and test splits from the Colab working directory.
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')
print(train, test)

        Id  ...           Category
0        0  ...           astro-ph
1        1  ...             hep-ph
2        2  ...              cs.LG
3        3  ...            math.CO
4        4  ...  cond-mat.mes-hall
...    ...  ...                ...
7495  7495  ...        astro-ph.CO
7496  7496  ...        astro-ph.CO
7497  7497  ...             hep-th
7498  7498  ...            math.CO
7499  7499  ...             hep-th

[7500 rows x 3 columns]           Id                                           Abstract
0          0    We describe ways to define and calculate $L_...
1          1    The progenitor systems of Type-Ia supernovae...
2          2    OmegaWhite is a wide-field, high cadence, sy...
3          3    Given $n \geq 2$ and $1<p<n$, we consider th...
4          4    The motivation of this work is to improve th...
...      ...                                                ...
14995  14995    We investigate the ability of the Space Infr...
14996  14996    We study theoretically the 

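# Text preprocessing

The next cells lowercase the abstracts and strip newlines, bracketed spans, tag-like spans, non-ASCII characters and punctuation.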

In [None]:
def remove_non_ascii(words):
    """Return ``words`` with every non-ASCII character replaced by a space."""
    cleaned = []
    for char in words:
        # Code points 0..127 are ASCII and kept; anything above becomes ' '.
        cleaned.append(char if ord(char) <= 127 else ' ')
    return ''.join(cleaned)

In [None]:
def process(df, t):
    """Clean the text column ``t`` of ``df`` in place and return ``df``.

    The following steps are applied, in order, to every value of ``df[t]``:

    1. lowercase;
    2. strip leading/trailing whitespace;
    3. replace newlines with a space;
    4. drop bracketed spans such as ``[12]``;
    5. replace ``<...>`` tag-like spans with a space;
    6. replace every non-ASCII character with a space;
    7. drop every character that is neither a word character nor whitespace.

    Missing values (NaN) are propagated unchanged by the ``.str`` accessor.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. The column is overwritten.
    t : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``df[t]`` replaced by the cleaned text.
    """
    # (pattern, replacement) pairs; order matters because the final step
    # would otherwise remove the brackets/angle brackets the earlier ones need.
    substitutions = (
        (r'\n', ' '),
        (r'\[[^]]*\]', ''),
        (r'<.*?>', ' '),
        (r'[^\x00-\x7f]', ' '),
        (r'[^\w\s]', ''),
    )

    text = df[t].str.lower().str.strip()
    for pattern, replacement in substitutions:
        # regex=True must be explicit: since pandas 2.0 the default is False,
        # which would treat these patterns as literal strings.
        text = text.str.replace(pattern, replacement, regex=True)

    df[t] = text
    return df

In [None]:
# Reference to the abstract column taken before cleaning; it is not used
# again in the visible cells.
t = train['Abstract']

# Run the same cleaning routine over the abstracts of both splits.
train, test = (process(frame, 'Abstract') for frame in (train, test))

# Sanity check: number of cleaned training abstracts.
print(train['Abstract'].shape)

(7500,)


In [None]:
class BernoulliVectorizer:
    """Binary bag-of-words vectorizer over space-separated tokens.

    A token enters the vocabulary at the moment its total number of
    occurrences (summed over all documents passed to ``build_vocab``)
    reaches ``min_count``. Documents are then encoded as 0/1 vectors that
    mark which vocabulary words they contain.

    Parameters
    ----------
    min_count : int, default 20
        Number of occurrences a token needs to be added to the vocabulary.
    """

    def __init__(self, min_count=20):
        if min_count < 1:
            raise ValueError("min_count must be >= 1")
        self.min_count = min_count
        # Vocabulary words, in the order in which they reached ``min_count``.
        self.vocab = []
        # Running occurrence count of every token seen by ``build_vocab``.
        self.vocab_counter = {}

    def build_vocab(self, data):
        """Count tokens in ``data`` and extend the vocabulary.

        Parameters
        ----------
        data : iterable of str
            Documents; each one is tokenised with ``str.split(' ')``, so
            consecutive spaces yield empty-string tokens that are counted
            like any other token.
        """
        for document in data:
            for word in document.split(' '):
                count = self.vocab_counter.get(word, 0) + 1
                self.vocab_counter[word] = count
                # Equality (not >=) guarantees each word is appended once.
                if count == self.min_count:
                    self.vocab.append(word)

    def transform(self, data):
        """Encode documents as binary presence vectors.

        Parameters
        ----------
        data : sized iterable of str
            Documents to encode; must support ``len()``.

        Returns
        -------
        numpy.ndarray
            Float array of shape ``(len(data), len(self.vocab))`` where
            entry ``[i, j]`` is 1 if document ``i`` contains ``self.vocab[j]``
            and 0 otherwise.
        """
        # Rebuilt on every call so that ``self.vocab`` stays the single
        # source of truth; lookup is O(1) per token instead of scanning
        # the whole vocabulary for every token.
        column_of = {word: col for col, word in enumerate(self.vocab)}
        answer = np.zeros((len(data), len(self.vocab)))
        for row, document in enumerate(data):
            for token in set(document.split(' ')):
                col = column_of.get(token)
                if col is not None:
                    answer[row, col] = 1
        return answer

    def fit_transform(self, data):
        """Build the vocabulary from ``data``, print its size, then encode ``data``."""
        self.build_vocab(data)
        print(len(self.vocab))
        return self.transform(data)

In [None]:
# Vectorizer instance shared by the later cells, where it is fitted on the
# training abstracts and reused to encode the test abstracts.
BV = BernoulliVectorizer()

In [None]:
class BernoulliNB:
    """Bernoulli naive Bayes classifier for binary (0/1) feature vectors.

    Model, for class C_k and binary feature x_i (word i present or not):

        P(C_k)         = (# documents of class C_k) / (# documents)
        P(x_i=1 | C_k) = (# documents of class C_k containing word i + alpha)
                         / (# documents of class C_k + 2 * alpha)
        P(C_k | x) is proportional to
            P(C_k) * prod_i P(x_i=1|C_k)^x_i * (1 - P(x_i=1|C_k))^(1 - x_i)

    Parameters
    ----------
    alpha : float
        Additive smoothing constant. With ``alpha == 0`` a probability can be
        exactly 0 or 1, which makes the log-likelihood infinite.
    """

    def __init__(self, alpha):
        self.alpha = alpha

    def fit(self, X, y):
        """Estimate class priors and per-class feature probabilities.

        Parameters
        ----------
        X : array-like of shape (n_documents, n_features)
            Binary document/word matrix.
        y : array-like of length n_documents
            Class label of every document.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not describe the same number of documents.
        """
        X = np.asarray(X, dtype=float)
        # classes_ holds the sorted unique labels; y_cat the index of each
        # document's label in classes_.
        self.classes_, y_cat = np.unique(np.asarray(y).ravel(), return_inverse=True)
        if X.shape[0] != len(y_cat):
            raise ValueError("X and y must contain the same number of documents")

        self.n_classes = len(self.classes_)
        n_classes = self.n_classes

        # Class priors P(C_k). The attribute keeps its historical name
        # ``counts`` although it stores relative frequencies.
        class_counts = np.bincount(y_cat, minlength=n_classes).astype(float)
        self.counts = class_counts / len(y_cat)

        # feature_counts[k, i] = number of class-k documents containing word i.
        feature_counts = np.zeros((n_classes, X.shape[1]))
        for k in range(n_classes):
            feature_counts[k] = X[y_cat == k].sum(axis=0)

        # Smoothed Bernoulli estimate: each feature has two outcomes
        # (present/absent), hence 2 * alpha in the denominator.
        self.params = (feature_counts + self.alpha) / (
            class_counts[:, np.newaxis] + 2 * self.alpha
        )

    def predict(self, X):
        """Return the most likely class index for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_documents, n_features)
            Binary document/word matrix.

        Returns
        -------
        numpy.ndarray
            Integer array of length n_documents; each entry is an index into
            the sorted unique labels seen in ``fit`` (``self.classes_``).
        """
        X = np.asarray(X, dtype=float)
        log_present = np.log(self.params)
        log_absent = np.log(1 - self.params)
        # sum_i [x_i log p_i + (1 - x_i) log(1 - p_i)]
        #   = x . (log p - log(1 - p)) + sum_i log(1 - p_i)
        jll = np.dot(X, (log_present - log_absent).T)
        jll += np.log(self.counts) + log_absent.sum(axis=1)
        return np.argmax(jll, axis=1)

In [None]:
# `uni` holds the sorted unique category labels; `cats` gives, for every
# training row, the index of its label inside `uni`.
uni, cats = np.unique(train['Category'], return_inverse=True)

In [None]:
# Smoothing constant handed to the classifier.
alpha = 1
NB = BernoulliNB(alpha=alpha)

# Learn the vocabulary on the training abstracts and encode them
# (fit_transform prints the vocabulary size), then fit the classifier.
X_train = BV.fit_transform(train['Abstract'])
NB.fit(X_train, train['Category'])

print('Test')

# Encode the test abstracts with the same vocabulary and predict class indices.
X_test = BV.transform(test['Abstract'])
Y_test = NB.predict(X_test)

4502
Test


In [None]:
# Writes the encoded test feature matrix X_test as CSV to a file named
# 'Bernoulli4502' (relative path, no extension).
# NOTE(review): the predictions in Y_test are never saved anywhere in the
# visible cells; if this file is meant to be the Kaggle submission it
# presumably should contain Y_test mapped back to labels via `uni` — confirm.
pd.DataFrame(X_test).to_csv('Bernoulli4502')