In [2]:
from collections import Counter
import numpy as np
import pandas as pd
import stop_words

class NLP():
    """
    A NLP class to perform count_vectorizer.
    """
    
    def __init__(self):
        self.vocab = None
    
    def count_vectorizer(self, text, train = True, stop_word=None, view=False):
        """
            TODO:
                * Better preprocessing using regex, remove numbers.
            Inputs:
                text: Input data as list of Text.
                stop_words: List or array of stop words. If none, default used.

            Outputs:
                Dataframe of count_vector

            Steps:
                * Lowercase applied
                * Punctuation removed
                * Removed stop words
                * Performed bag of words
                * Frequency of words
                * Dataframe of frequency of words
        
        """


        lower_case_documents = []
        documents=text
        for i in documents:
            lower_case_documents.append(i.lower())
        
        if view:
            print('Step: Applying Lower Case.... Done\n')
    #     print(lower_case_documents)
        sans_punctuation_documents = []
        
        import string

        for i in lower_case_documents:
            punctuation = string.punctuation

            k = ""
            for j in i:
                if j not in punctuation:
                    k+=j
                    
            sans_punctuation_documents.append(k)
        
        if view:
            print('Step: Removed Punctuation....\n')
    #     print(sans_punctuation_documents)
        
        if stop_word == None:
            stop_word = list(stop_words.ENGLISH_STOP_WORDS)
        
        preprocessed_documents = []
        for i in sans_punctuation_documents:
            sentence = []
            for word in i.split():
                if word not in stop_word:
                    sentence.append(word)
            preprocessed_documents.append(sentence)
        
        if train != True:
            return preprocessed_documents
        
        if view:
            print('Step: Bag of Words... Done\n')
    #     print(preprocessed_documents)

        frequency_list = []
        from collections import Counter

        for i in preprocessed_documents:
            frequency_list.append(dict(Counter(i)))
        
        if view:
            print('Step: Frequency of words... Done\n')
        
        # often called as vocabulary
        all_words = list(set([j for i in preprocessed_documents for j in i]))

        for doc in frequency_list:
            for word in all_words:
                if word not in list(doc.keys()):
                    doc[word] = 0
        df = pd.DataFrame(frequency_list)
        df = df[sorted(list(df.columns))]
        
        self.vocab = df.columns.to_list()
        
        if view:
            print('Step: Count vectorizer... Done\n')
#         print(df.head())
        return df

nlp = NLP()
documents = ['hello there, we won again. Cool.', 'I will be available there', 'and again we won']
count_vector = nlp.count_vectorizer(documents)
count_vector

Unnamed: 0,available,cool,hello,won
0,0,1,1,1
1,1,0,0,0
2,0,0,0,1


In [8]:
import pandas as pd
# Dataset from - https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
df = pd.read_table('SMSSpamCollection', names=['label', 'sms_message'])

df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
import numpy as np
df['label'] = np.array(df['label']=='spam', dtype=np.int32)
df.head()


Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['sms_message'], 
                                                    df['label'], 
                                                    random_state=1)


In [12]:
nlp.count_vectorizer(list(X_test[:10]))

Unnamed: 0,2,apparently,ask,buy,card,come,da,dear,forgot,going,...,tuesday,u,wan,want,wat,welp,write,yep,yes,ü
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,1,0,1,0,1,0,1,0,...,0,0,0,1,0,0,1,0,0,2
5,1,0,0,0,0,1,0,0,0,0,...,0,1,1,0,1,0,0,0,0,0
6,4,0,0,1,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
7,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
spam = ['win money now', 'easy moneey now', 'win the money by replying']
ham = ['can you borrow money', 'good boy', 'it was easy game', 'hello buddy', 'hi']
# spam = ['money', "easy", 'money']
# ham = ["money", "easy", ]
all_txt = spam + ham

bow = nlp.count_vectorizer(all_txt)
classes = [0, 0, 0, 1, 1, 1, 1, 1]
bow['out'] = classes

bow_class = bow.groupby(by='out', axis=0)

# count of each class examples
counts = bow_class.count()
print(counts)

# count of each word on each class
count_words_class = bow_class.sum()
print(count_words_class)

# find p(word/class)
prob_w_c = bow_class.sum() / counts
print(prob_w_c)

# find p(class/word)
prob_c_w = round(prob_w_c * counts / counts.sum(axis=0), 5)
prob_c_w

     borrow  boy  buddy  easy  game  good  hello  hi  moneey  money  replying  \
out                                                                             
0         3    3      3     3     3     3      3   3       3      3         3   
1         5    5      5     5     5     5      5   5       5      5         5   

     win  
out       
0      3  
1      5  
     borrow  boy  buddy  easy  game  good  hello  hi  moneey  money  replying  \
out                                                                             
0         0    0      0     1     0     0      0   0       1      2         1   
1         1    1      1     1     1     1      1   1       0      1         0   

     win  
out       
0      2  
1      0  
     borrow  boy  buddy      easy  game  good  hello   hi    moneey     money  \
out                                                                             
0       0.0  0.0    0.0  0.333333   0.0   0.0    0.0  0.0  0.333333  0.666667   
1       0.2  0.2   

Unnamed: 0_level_0,borrow,boy,buddy,easy,game,good,hello,hi,moneey,money,replying,win
out,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.125,0.25,0.125,0.25
1,0.125,0.125,0.125,0.125,0.125,0.125,0.125,0.125,0.0,0.125,0.0,0.0


In [16]:
txt = ['you are good boy. boy need money?',"boy need money?" ]
txt = nlp.count_vectorizer(txt, train=False)
p = prob_c_w
txt[0]

['good', 'boy', 'boy', 'need', 'money']

In [8]:
txt = ['easy boy easy. boy you need easy money?', "it was easy game" ]

txt = nlp.count_vectorizer(txt, train= False)
# words = txt.columns.to_list()
words = dict(Counter(txt[0]))
vocab = p.columns.to_list()

classes = [0, 1]
class_prob = counts / counts.sum(axis=0)
class_prob = dict(class_prob.mean(axis=1))

# probs will store denominator value for each class. We have to add all values of it to get denominator
probs = {}

# numinator

num = {k:v for k,v in class_prob.items()}

for w in words.keys():
    if w in vocab:
        for c in classes:
            if probs.get(c) != None:
                if p[w][c] != 0:
                    probs[c] += p[w][c] * class_prob[c]
                    num[c] *= p[w][c]
            else:  
                probs[c] = p[w][c] * class_prob[c] 
                num[c] *= p[w][c]


probs, num

({0: 0.140625, 1: 0.234375}, {0: 0.01171875, 1: 0.001220703125})

In [9]:
txt = ['easy boy easy. boy you need easy money?', "hello it was easy game" ]
# txt = ['Win the easy money now', 'I am missing you buddy']

txt = nlp.count_vectorizer(txt, train= False)
# words = txt.columns.to_list()
print(txt)
words = dict(Counter(txt[1]))
vocab = p.columns.to_list()

classes = [0, 1]
class_prob = counts / counts.sum(axis=0)
class_prob = dict(class_prob.mean(axis=1))

# probs will store denominator value for each class. We have to add all values of it to get denominator
# probs will store values of P(w/c) where c is classes and w is words.
probs = {}

# numinator
# same as probs

num = {k:v for k,v in class_prob.items()}

for w in words.keys():
    if w in vocab:
        for c in classes:
            if probs.get(c) != None:
                if p[w][c] != 0:
                    probs[c] *= p[w][c] 
                    num[c] *= p[w][c]
            else:  
                probs[c] = p[w][c] * class_prob[c] 
                num[c] *= p[w][c]

# to find probability of class given word or P(c/w), we have formula
# = p(w/c) * p(c) / p(w)
# p(w) = sum over all p(w/c) * p(c) is TP + TN 
                
denom = sum(probs.values())
probs = {k: v/denom for k,v in num.items()}
sum(probs.values()), probs

[['easy', 'boy', 'easy', 'boy', 'need', 'easy', 'money'], ['hello', 'easy', 'game']]


(1.0, {0: 0.0, 1: 1.0})

In [22]:
class NaiveBayes():
    """
        A class to perform Naive Bayes on text.
        Methods:
            * fit: to train a model
            * predict: to do prediction

        Use Cases:
        spam = ['win money now', 'easy moneey now', 'win the money by replying']
        ham = ['can you borrow money', 'good boy', 'it was easy game', 'hello buddy', 'hi']

        all_txt = spam + ham
        classes = [0, 0, 0, 1, 1, 1, 1, 1]
        nb = NaiveBayes(all_txt, classes)
        nb.fit()

        test = ['easy boy easy. boy you need easy money?', "it was easy game" ]
        nb.predict([spam[1]])
        # {0: 0.06976744186046512, 1: 0.9302325581395349}
    """
    def __init__(self, text, label):
        self.text = text
        self.label = label
        self.cond_probs = {}
        self.features = []
        self.classes = []
        self.class_prob = {}
    
    def fit(self, view=False):
        """
            Input: List of texts.
            
            A method to find all the probability of P(word/class).
            It finds out the probabilty for each word to be on each class.
            Example:
            --------
                spam = ['win money now', 'easy moneey now', 'win the money by replying']
                ham = ['can you borrow money', 'good boy', 'it was easy game', 'hello buddy', 'hi']

                all_txt = spam + ham
                classes = [0, 0, 0, 1, 1, 1, 1, 1]
                nb = NaiveBayes(all_txt, classes)
                nb.fit()
                
            Steps:
            ---------
            * Find the BoW
            * Find the examples on each class
            * Find the probability of word on class p(w/c)
            * Find the probability of class given word. P(c/w)
        
        """
        
        text = self.text
        label = self.label
        
        
        bow = nlp.count_vectorizer(text)
        
        self.features = bow.columns.to_list() 
        
        if view:
            print('Your BoW is:\n', bow)
            
        classes = label
        
        self.classes = list(Counter(classes).keys())
        
        bow['out'] = classes
        bow_class = bow.groupby(by='out', axis=0)

        # count of each class examples
        counts = bow_class.count()
        
        # used for prediction
        class_prob = counts / counts.sum(axis=0)
        class_prob = dict(class_prob.mean(axis=1))
        self.class_prob = class_prob
        
        # count of each word on each class
        self.count_words_class = bow_class.sum()

        # find prob of word in each class.... no. of that word in class / total word in class
        prob_w_c = bow_class.sum() / counts
        
        # find p(word/class)
        
        prob_w_c = round(prob_w_c * counts / counts.sum(axis=0), 5)
        self.cond_probs = prob_w_c
        
    def classes_(self):
        """
        A method to see all classes counts for each word.
        """
        return self.count_words_class 
    
    def predict(self, example):
        """
            A method for prediction.
            Input: List of text. 
            Output: Prediction for each classes.
            
            Example:
            ----------
            
            >>>test = ['easy boy easy. boy you need easy money?', "it was easy game" ]
            >>>nb.predict([spam[1]])
            {0: 0.06976744186046512, 1: 0.9302325581395349}
        """
        txt = nlp.count_vectorizer(example, train= False)
        words = dict(Counter(txt[0]))
        
        vocab = self.features
        classes = self.classes
        class_prob = self.class_prob
        p = self.cond_probs
        
        # probs will store denominator value for each class. We have to add all values of it to get denominator
        # probs will store values of P(w/c) where c is classes and w is words.
        probs = {}

        # numinator
        # same as probs

        num = {k:v for k,v in class_prob.items()}
        
        
        """
        c, ~c
        p(~c/w1, w2, w3) = p(w1, w2, w3 / ~c) * p(~c) / (p(w1, w2, w3/c) * p(c) + p(w1, w2, w3/~c) * p(~c))
        
        
        p(c/ w1, w1, w3) = p(w1, w2, w3 / c) * p(c) / (p(w1, w2, w3/c) * p(c) + p(w1 , w2, w3 / ~c) * p(~c))
        p(w1, w2, w3/c) = p(w1/c) * p(w2/c) * p(w3/c) = p(w1 and w2 and w3 / c)
        """
        
        for w in words.keys():
            if w in vocab:
                for c in classes:
                    if probs.get(c) != None:
                        if p[w][c] != 0:
                            probs[c] *= p[w][c] 
                            num[c] *= p[w][c]
                    else:  
                        probs[c] = p[w][c] * class_prob[c] 
                        num[c] *= p[w][c]

        # to find probability of class given word or P(c/w), we have formula
        # = p(w/c) * p(c) / p(w)
        # p(w) = sum over all p(w/c) * p(c) is TP + TN 

        denom = sum(probs.values())
        probs = {k: v/denom for k,v in num.items()}
        return probs

        """     
            ### scarp code
                #         sum(probs.values()), probs
                #         example = nlp.count_vectorizer(example)
                #         words = example.columns.to_list()

                #         vocab = self.features
                #         classes = self.classes
                #         p = self.cond_probs

                #         probs = {}

                #         class_prob = self.class_prob

                #         for w in words:
                #             if w in vocab:
                #                 for c in classes:
                #                     if probs.get(c) != None:
                #                         # this is actually p(class/word) * p(class)
                #                         probs[c] *= p[w][c]
                #                     else:
                #                         probs[c] = p[w][c] * class_prob[c]

                #         return probs

        """
        
        
spam = ['do your homework by tomorrow', 'deadline on thursday', 'R for data science assignment complete by sunday']
ham = ['can you borrow money', 'good boy', 'it was easy game', 'hello buddy', 'hi']

all_txt = spam + ham
classes = [0, 0, 0, 1, 1, 1, 1, 1]
nb = NaiveBayes(all_txt, classes)
nb.fit()

# nb.cond_probs
test = ['complete your assignment by paying money?', "buddy, easy money", 'win easy money by replying']

for txt in test:
    pred = np.array(list(nb.predict([txt]).values())).argmax()
    pred = ["Spam", "Ham"][pred]
    print(f"Sentence: {txt}\nPrediction: {pred}")


Sentence: complete your assignment by paying money?
Prediction: Spam
Sentence: buddy, easy money
Prediction: Ham
Sentence: win easy money by replying
Prediction: Ham


In [12]:
all_txt

['win money now',
 'easy moneey now',
 'win the money by replying',
 'can you borrow money',
 'good boy',
 'it was easy game',
 'hello buddy',
 'hi']

In [13]:
#### prediction
pred = []
test = ['Win the easy money now', 'I am missing you buddy']
test = ['easy boy easy. boy you need easy money?', "it was easy game" ]

nb.predict([spam[0]])

{0: 1.0, 1: 0.0}

In [18]:
from naive_bayes import NaiveBayes

nb = NaiveBayes(all_txt, classes)
nb.fit()

# nb.cond_probs
test = ['easy boy easy. boy you need easy money?', "buddy, easy money", 'win easy money by replying']
nb.predict([test[0]])

{0: 0.9056603773584906, 1: 0.09433962264150944}

\begin{equation}\label{eq:}
P(s/w1, w2, w3....wn) = \frac{P(w1, w2.....wn/s) * P(s)}{P(w1, w2, w3...wn)}\\
P(w1, w2.....wn/s) = P(w1/s) * P(w2/s) ...P(wn/s)\\
\therefore P(s/w1, w2, w3....wn) = \frac{P(s) * P(w1/s) * P(w2/s) ...P(wn/s)} {P(s) * P(w1/s) * P(w2/s) ...P(wn/s) + P(h) * P(w1/h) * P(w2/~h) ...P(wn/h)} \\
or \frac{P(s)*\prod_{i=1}^{n}{P(w_i/s)}} {\sum_{c=s, h}{P(c)*\prod_{i=1}^{n}{P(w_i/c)}}}
\end{equation}


\begin{equation}
\frac{P(s)*\prod_{i=1}^{n}{P(w_i/s)}} {\sum_{c=s, h}{P(c)*\prod_{i=1}^{n}{P(w_i/c)}}}
\end{equation}
