In [328]:
import pandas as pd
import numpy as np
import re
import time

traingData = []
testData =[]

d1 = "In recent years, researchers, for computer vision, have proposed many deep learning (DL) methods for various tasks, and facial recognition (FR) made an enormous leap using these techniques." 
d2 = "Deep FR systems benefit from the hierarchical architecture of the deep learning methods to learn discriminative face representation."
d3 = "Computer vision methods have been widely used for facial recognition"
traingData.append([d1,"DL"])
traingData.append([d2,"DL"])
traingData.append([d3,"CV"])

#test data
d4 ="Deep learning based computer vision methods have been used for facial recognition."
testData.append([d4])

training_data = pd.DataFrame(traingData, columns=['Doc', 'Class'])
test_data = pd.DataFrame(testData, columns=['Doc'])

print(training_data)
training_data.head()

                                                 Doc Class
0  In recent years, researchers, for computer vis...    DL
1  Deep FR systems benefit from the hierarchical ...    DL
2  Computer vision methods have been widely used ...    CV


Unnamed: 0,Doc,Class
0,"In recent years, researchers, for computer vis...",DL
1,Deep FR systems benefit from the hierarchical ...,DL
2,Computer vision methods have been widely used ...,CV


In [329]:


# Training/Test split
training_set = training_data
test_set = test_data


In [330]:
#Remove all punctuation marks
training_set['Doc'] = training_set['Doc'].str.replace('\W', ' ')

#Change all letters to small case
training_set['Doc'] = training_set['Doc'].str.lower()

  training_set['Doc'] = training_set['Doc'].str.replace('\W', ' ')


In [331]:
#split the doc column on white space and convert each row to a list
training_set['Doc'] = training_set['Doc'].str.split()

vocabulary = []
for sms in training_set['Doc']:
    for word in sms:
        vocabulary.append(word)

#Set returns tuple of unique words
vocabulary = list(set(vocabulary))
len(vocabulary)
print(len(vocabulary))
print(vocabulary)

42
['facial', 'hierarchical', 'widely', 'deep', 'researchers', 'using', 'these', 'fr', 'of', 'dl', 'from', 'architecture', 'an', 'have', 'computer', 'learning', 'made', 'tasks', 'and', 'discriminative', 'for', 'benefit', 'been', 'techniques', 'to', 'enormous', 'face', 'used', 'systems', 'the', 'methods', 'years', 'recent', 'vision', 'various', 'proposed', 'in', 'learn', 'representation', 'leap', 'recognition', 'many']


In [332]:

#Create a default dictionary with each unique word a count of zero
word_counts_per_doc = {unique_word: [0] * len(training_set['Doc']) for unique_word in vocabulary}


In [333]:
#Loop over the training set and count the number of times each unique word occurs
for index, doc in enumerate(training_set['Doc']):
    for word in doc:
        word_counts_per_doc[word][index] += 1


In [334]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,facial,hierarchical,widely,deep,researchers,using,these,fr,of,dl,...,recent,vision,various,proposed,in,learn,representation,leap,recognition,many
0,1,0,0,1,1,1,1,1,0,1,...,1,1,1,1,1,0,0,1,1,1
1,0,1,0,2,0,0,0,1,1,0,...,0,0,0,0,0,1,1,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


In [335]:
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,Doc,Class,facial,hierarchical,widely,deep,researchers,using,these,fr,...,recent,vision,various,proposed,in,learn,representation,leap,recognition,many
0,"[in, recent, years, researchers, for, computer...",DL,1,0,0,1,1,1,1,1,...,1,1,1,1,1,0,0,1,1,1
1,"[deep, fr, systems, benefit, from, the, hierar...",DL,0,1,0,2,0,0,0,1,...,0,0,0,0,0,1,1,0,0,0
2,"[computer, vision, methods, have, been, widely...",CV,1,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


In [336]:
# Isolating DL and CV documents first
dl_docs = training_set_clean[training_set_clean['Class'] == 'DL']
cv_docs = training_set_clean[training_set_clean['Class'] == 'CV']

# P(DL) and P(CV)
p_dl = len(dl_docs) / len(training_set_clean)
p_cv = len(cv_docs) / len(training_set_clean)

print("Probability of class DL:",p_dl)
print("Probability of class CV:",p_cv)

# N_DL, no of words in the DL doc, we are getting the words in mega document in DL class
n_words_per_dl_doc = dl_docs['Doc'].apply(len)
#print(n_words_per_dl_doc)
#print(dl_docs)
n_dl = n_words_per_dl_doc.sum()

# N_CV
n_words_per_cv_doc = cv_docs['Doc'].apply(len)
n_cv = n_words_per_cv_doc.sum()

# N_Vocabulary
n_vocabulary = len(vocabulary)

# Laplace smoothing
alpha = 1


Probability of class DL: 0.6666666666666666
Probability of class CV: 0.3333333333333333


In [337]:

# Initiate parameters
parameters_dl = {unique_word:0 for unique_word in vocabulary}
parameters_cv = {unique_word:0 for unique_word in vocabulary}

# Calculate parameters
for word in vocabulary:
    n_word_given_dl = dl_docs[word].sum()   # dl_docs already defined in a cell above
    p_word_given_dl = (n_word_given_dl + alpha) / (n_dl + (alpha*n_vocabulary))

    parameters_dl[word] = round(p_word_given_dl,2)
    #print(f"Probability of word :{word} in class doc:DL is { parameters_dl[word]}")
    
    n_word_given_cv = cv_docs[word].sum()   # cv_docs already defined in a cell above
    p_word_given_cv = (n_word_given_cv + alpha) / (n_cv + (alpha*n_vocabulary))
    parameters_cv[word] = round(p_word_given_cv,2)
    #print(f"Probability of word :{word} in class doc:CV is {parameters_cv[word]}")   
    #print(" ")

In [338]:
def classify_test_set_multinomial(doc):    
    
    #remove all punctuations
    doc = re.sub('\W', ' ', doc)
    
    #lower and split all words on white space
    doc = doc.lower().split()
    
    p_dl_given_doc = p_dl
    p_cv_given_doc = p_cv

    for word in doc:
        if word in parameters_dl:
            p_dl_given_doc *= parameters_dl[word]
            
        if word in parameters_cv:
            p_cv_given_doc *= parameters_cv[word]
    
    print("Multinomial Model:")
    print("Probability of document belonging to CV class:",(p_cv_given_doc*100))
    print("Probability of document belonging to DL class:",(p_dl_given_doc*100))
    
    if p_cv_given_doc > p_dl_given_doc:
        return 'CV'
    elif p_dl_given_doc > p_cv_given_doc:
        return 'DL'
    else:
        return 'Not able to estimate'

In [339]:
#Create a new column showing the result of our algorithm
test_set['predicted_multinomial'] = test_set['Doc'].apply(classify_test_set_multinomial)
test_set.head()



Multinomial Model:
Probability of document belonging to CV class: 3.495253333333334e-15
Probability of document belonging to DL class: 2.88e-17


Unnamed: 0,Doc,predicted_multinomial
0,Deep learning based computer vision methods ha...,CV


In [340]:
# Initial parameter for bernoulli_nb i.e multivariant
parameters_dl_NB = {unique_word:0 for unique_word in vocabulary}
parameters_cv_NB = {unique_word:0 for unique_word in vocabulary}
#pdb.set_trace()

# Calculate parameters
dl_docs_df = pd.DataFrame(dl_docs)
cv_docs_df = pd.DataFrame(cv_docs)

n_dl_docs_doc_count=len(dl_docs)
n_cv_docs_doc_count= len(cv_docs)


for word in vocabulary:
    n_doc_dl_word= (dl_docs_df[word] != 0).sum()
    
    #fraction of dl doc which has the word 
    p_word_given_dl_NB = (n_doc_dl_word+1)/(n_dl_docs_doc_count+2)
    parameters_dl_NB[word] = round(p_word_given_dl_NB,2)
    #print(f"Probability of word :{word} in class doc:DL is { parameters_dl_NB[word]}")
    
    #no of cv doc having the word
    n_doc_cv_word = (np.count_nonzero(cv_docs_df[word]))
    
    #fraction of cv doc which has the word 
    p_word_given_cv_NB = (n_doc_cv_word+1)/(n_cv_docs_doc_count+2)     
    parameters_cv_NB[word] = round(p_word_given_cv_NB,2)
    #print(f"Probability of word :{word} in class doc:DL is { parameters_cv_NB[word]}")
    #print(" ")


In [341]:
import math
def classify_test_set_multivariate(doc):    
 
    
    #remove all punctuations
    doc = re.sub('\W', ' ', doc)
    
    #lower and split all words on white space
    doc = doc.lower().split()
    
    #p(c=dl)
    p_dl_given_doc_NB = p_dl
    #p(c=cv)
    p_cv_given_doc_NB = p_cv
    
    #p(C=k)p(D|C=k) 
    # p(Di∣C)= bit*p(wt∣C)+(1−bit)*(1−p(wt∣C))
    
    #bit  is either 0 or 1 representing the absence or presence of word wt in the ith document.
    
    for word in vocabulary:
        # check if word is in message
        if word in doc:
            p_dl_given_doc_NB *= parameters_dl_NB[word]
            #pdb.set_trace()
            p_cv_given_doc_NB *= parameters_cv_NB[word]
        else:
            #word not in vocabulary ie bit=0
            p_dl_given_doc_NB *= (1-parameters_dl_NB[word])
            p_cv_given_doc_NB *= (1-parameters_cv_NB[word])       
         
    print("Multivariate Model:")
    print("Probability of document belonging to CV class:",p_cv_given_doc_NB*100)
    print("Probability of document belonging to DL class:",p_dl_given_doc_NB*100)
    
    if p_cv_given_doc_NB > p_dl_given_doc_NB:
        return 'CV'
    elif p_dl_given_doc_NB > p_cv_given_message_NB:
        return 'DL'
    else:
        return 'We are not able to estimate'

In [342]:
#Create a new column showing the result of our algorithm
time 
test_set['predicted_multivariate'] = test_set['Doc'].apply(classify_test_set_multivariate)
test_set.head()

Multivariate Model:
Probability of document belonging to CV class: 1.9739578287322718e-07
Probability of document belonging to DL class: 9.592326932761353e-12


Unnamed: 0,Doc,predicted_multinomial,predicted_multivariate
0,Deep learning based computer vision methods ha...,CV,CV
