# Naïve Bayes Classifier

In [1]:
import pandas as pd

In [2]:
csv = pd.read_csv("naive_bayes.csv")
csv

Unnamed: 0,sentence,class
0,The quick brown fox jumps over the lazy dog.,A
1,"Lorem ipsum dolor sit amet, consectetur adipis...",B


In [3]:
import re

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation (you can modify this as needed)
    text = re.sub(r'[^\w\s]', '', text)
    
    return text

In [4]:
processed_sentences = []
for sentence in csv['sentence']:
    new_sentence = preprocess_text(sentence)
    processed_sentences.append(new_sentence)
csv['processed_sentence'] = processed_sentences

csv

Unnamed: 0,sentence,class,processed_sentence
0,The quick brown fox jumps over the lazy dog.,A,the quick brown fox jumps over the lazy dog
1,"Lorem ipsum dolor sit amet, consectetur adipis...",B,lorem ipsum dolor sit amet consectetur adipisc...


In [5]:
vocab = []
count_a = 0
count_b = 0
for i, sentence in enumerate(csv["processed_sentence"]):
    for word in sentence.split():
        
        # Count occurrences of words in class A and class
        if csv["class"][i] == "A":
            count_a += 1
        else:
            count_b += 1
                
        # Add word to vocabulary if not already present
        if word not in vocab:
            vocab.append(word)

vocab

['the',
 'quick',
 'brown',
 'fox',
 'jumps',
 'over',
 'lazy',
 'dog',
 'lorem',
 'ipsum',
 'dolor',
 'sit',
 'amet',
 'consectetur',
 'adipiscing',
 'elit']

In [6]:
count_a, count_b

(9, 8)

In [7]:
pa = 0
pb = 0

for value in csv["class"]:
    if value == "A":
        pa += 1
    else:
        pb += 1
        
pa = pa / len(csv)
pb = pb / len(csv)
print("P(A):", pa)
print("P(B):", pb)

P(A): 0.5
P(B): 0.5


In [8]:
from collections import defaultdict

class_a = defaultdict(int)
class_b = defaultdict(int)

for word in vocab:
    for i, sentence in enumerate(csv["processed_sentence"]):
        if word in sentence.split():
            if csv["class"][i] == "A":
                class_a[word] += 1
            else:
                class_b[word] += 1
                
class_a, class_b

(defaultdict(int,
             {'the': 1,
              'quick': 1,
              'brown': 1,
              'fox': 1,
              'jumps': 1,
              'over': 1,
              'lazy': 1,
              'dog': 1}),
 defaultdict(int,
             {'lorem': 1,
              'ipsum': 1,
              'dolor': 1,
              'sit': 1,
              'amet': 1,
              'consectetur': 1,
              'adipiscing': 1,
              'elit': 1}))

In [9]:
for word in vocab:
    # Apply Laplace smoothing
    class_a[word] = (class_a[word] + 1) / (count_a + len(vocab) + 1)
    class_b[word] = (class_b[word] + 1) / (count_b + len(vocab) + 1)
    
class_a, class_b

(defaultdict(int,
             {'the': 0.07692307692307693,
              'quick': 0.07692307692307693,
              'brown': 0.07692307692307693,
              'fox': 0.07692307692307693,
              'jumps': 0.07692307692307693,
              'over': 0.07692307692307693,
              'lazy': 0.07692307692307693,
              'dog': 0.07692307692307693,
              'lorem': 0.038461538461538464,
              'ipsum': 0.038461538461538464,
              'dolor': 0.038461538461538464,
              'sit': 0.038461538461538464,
              'amet': 0.038461538461538464,
              'consectetur': 0.038461538461538464,
              'adipiscing': 0.038461538461538464,
              'elit': 0.038461538461538464}),
 defaultdict(int,
             {'lorem': 0.08,
              'ipsum': 0.08,
              'dolor': 0.08,
              'sit': 0.08,
              'amet': 0.08,
              'consectetur': 0.08,
              'adipiscing': 0.08,
              'elit': 0.08,
            

In [10]:
def predict(sentence):
    sentence = preprocess_text(sentence)
    words = sentence.split()
    
    # Calculate log probabilities
    prob_a = 0
    prob_b = 0
    
    for word in words:
        if word in class_a:
            prob_a += class_a[word]
        else:
            prob_a += 1 / (count_a + len(vocab) + 1)  # Laplace smoothing
        
        if word in class_b:
            prob_b += class_b[word]
        else:
            prob_b += 1 / (count_b + len(vocab) + 1)  # Laplace smoothing
            
    # Apply prior probabilities
    prob_a *= pa
    prob_b *= pb
    
    return "A" if prob_a > prob_b else "B"  

In [19]:
predict("I am a lazy cow")

'A'

In [21]:
predict("vini vidi vici")

'B'