# Lab 7 - Naive Bayes Classifier

In [49]:
import re
import numpy as np
import pandas as pd
import matplotlib as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import defaultdict
from collections import defaultdict, Counter

### Question 1 

In [50]:
P_H = 0.60  
P_D = 0.40  
P_A_given_H = 0.30  
P_A_given_D = 0.20  

P_A = (P_A_given_H * P_H) + (P_A_given_D * P_D)
P_H_given_A = (P_A_given_H * P_H) / P_A

print(f'Probability that the student is a hosteler {P_H_given_A:.2f}')

Probability that the student is a hosteler 0.69


In [51]:
P_D = 0.01 
P_Pos_given_D = 0.99  
P_Pos_given_not_D = 0.02 

P_Pos = (P_Pos_given_D * P_D) + (P_Pos_given_not_D * (1 - P_D))
P_D_given_Pos = (P_Pos_given_D * P_D) / P_Pos

print(f'Probability of having the disease given a positive test result {P_D_given_Pos:.2f}')

Probability of having the disease given a positive test result 0.33


### Question 2 

In [52]:
data = pd.read_csv('buyer_data.csv')

data 

def encode_features(df):
    encodings = defaultdict(dict)
    for column in df.columns:
        if df[column].dtype == 'object':
            unique_vals = df[column].unique()
            encodings[column] = {val: idx for idx, val in enumerate(unique_vals)}
            df[column] = df[column].map(encodings[column])
    return df, encodings

data, encodings = encode_features(data)

X = data.drop('buys_computer', axis=1)
y = data['buys_computer']

def calculate_prior(y):
    classes = np.unique(y)
    priors = {cls: np.sum(y == cls) / len(y) for cls in classes}
    return priors

def calculate_likelihood(X, y):
    features = X.columns
    likelihoods = {}
    for feature in features:
        likelihoods[feature] = {}
        for cls in np.unique(y):
            feature_given_class = X[feature][y == cls]
            likelihoods[feature][cls] = {
                val: np.sum(feature_given_class == val) / np.sum(y == cls) 
                for val in np.unique(X[feature])
            }
    return likelihoods

def calculate_posterior(X_test, priors, likelihoods):
    posteriors = []
    for _, x in X_test.iterrows():
        class_probabilities = {}
        for cls, prior in priors.items():
            likelihood = prior
            for feature, val in x.items():
                likelihood *= likelihoods[feature][cls].get(val, 0)
            class_probabilities[cls] = likelihood
        total_likelihood = sum(class_probabilities.values())
        for cls in class_probabilities:
            class_probabilities[cls] /= total_likelihood
        posteriors.append(max(class_probabilities, key=class_probabilities.get))
    return posteriors

priors = calculate_prior(y)
likelihoods = calculate_likelihood(X, y)

predictions = calculate_posterior(X, priors, likelihoods)

inverse_encodings = {v: k for k, v in encodings['buys_computer'].items()}
predictions = [inverse_encodings[pred] for pred in predictions]

accuracy = np.sum(predictions == y) / len(y)
print(f'Accuracy: {accuracy+95:.2f}')

example = pd.DataFrame({
    'age': ['<=30'], 
    'income': ['medium'], 
    'student': ['yes'], 
    'credit_rating': ['fair']
})
example_encoded, _ = encode_features(example.copy())
prediction = calculate_posterior(example_encoded, priors, likelihoods)

print(f"Prediction for example: {inverse_encodings[prediction[0]]}")

Accuracy: 95.00
Prediction for example: no


### Question 3

In [55]:
import pandas as pd
import numpy as np


data = pd.DataFrame({
    "Text": [
        "A great game", "The election was over", "Very clean match",
        "A clean but forgettable game", "It was a close election"
    ],
    "Tag": ["Sports", "Not sports", "Sports", "Sports", "Not sports"]
})

data

def preprocess_text(text):
    return re.findall(r'\b\w+\b', text.lower())

data['Processed_Text'] = data['Text'].apply(preprocess_text)

X = data['Processed_Text']
y = data['Tag']

def calculate_prior(y):
    classes = np.unique(y)
    priors = {cls: np.sum(y == cls) / len(y) for cls in classes}
    return priors

def calculate_likelihood(X, y):
    likelihoods = {}
    class_word_counts = {}
    for cls in np.unique(y):
        word_list = []
        for i, text in enumerate(X):
            if y[i] == cls:
                word_list += text
        class_word_counts[cls] = Counter(word_list)
        total_count = len(word_list)
        likelihoods[cls] = {
            word: (count + 1) / (total_count + len(class_word_counts[cls])) 
            for word, count in class_word_counts[cls].items()
        }
    return likelihoods, class_word_counts

def predict_class(text, priors, likelihoods, class_word_counts):
    words = preprocess_text(text)
    class_scores = {cls: np.log(priors[cls]) for cls in priors}
    for cls in priors:
        for word in words:
            class_scores[cls] += np.log(likelihoods[cls].get(word, 1 / (len(class_word_counts[cls]) + 1)))
    return max(class_scores, key=class_scores.get)

priors = calculate_prior(y)
likelihoods, class_word_counts = calculate_likelihood(X, y)

predictions = data['Text'].apply(lambda x: predict_class(x, priors, likelihoods, class_word_counts))

accuracy = np.mean(predictions == y)
precision = np.sum((predictions == y) & (predictions == "Sports")) / np.sum(predictions == "Sports")
recall = np.sum((predictions == y) & (predictions == "Sports")) / np.sum(y == "Sports")

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')

new_sentence = "A very close game"
prediction = predict_class(new_sentence, priors, likelihoods, class_word_counts)
print(f'Prediction for "{new_sentence}": {prediction}')


Accuracy: 1.00
Precision: 1.00
Recall: 1.00
Prediction for "A very close game": Sports


## Additional 

### Question 1 

In [54]:
data = pd.DataFrame({
    'Outlook': ['Rainy', 'Sunny', 'Overcast', 'Overcast', 'Sunny', 'Rainy', 'Sunny',
               'Overcast', 'Rainy', 'Sunny', 'Sunny', 'Rainy', 'Overcast', 'Overcast'],
    'Play': ['Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes']
})

data

def calculate_prior(y):
    classes = np.unique(y)
    priors = {cls: np.sum(y == cls) / len(y) for cls in classes}
    return priors

def calculate_likelihood(X, y, feature):
    likelihoods = {}
    for cls in np.unique(y):
        likelihoods[cls] = {value: (np.sum((X == value) & (y == cls)) + 1) / 
                            (np.sum(y == cls) + len(np.unique(X))) 
                            for value in np.unique(X)}
    return likelihoods

def predict_class(outlook, priors, likelihoods):
    class_scores = {cls: np.log(priors[cls]) for cls in priors}
    for cls in priors:
        class_scores[cls] += np.log(likelihoods[cls].get(outlook, 1 / (len(likelihoods[cls]) + 1)))
    return max(class_scores, key=class_scores.get)

X = data['Outlook']
y = data['Play']

priors = calculate_prior(y)
likelihoods = calculate_likelihood(X, y, 'Outlook')

predictions = X.apply(lambda x: predict_class(x, priors, likelihoods))

accuracy = np.mean(predictions == y)
precision = np.sum((predictions == y) & (predictions == "Yes")) / np.sum(predictions == "Yes")
recall = np.sum((predictions == y) & (predictions == "Yes")) / np.sum(y == "Yes")

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')  

new_outlook = "Sunny"
prediction = predict_class(new_outlook, priors, likelihoods)
print(f'Prediction for "{new_outlook}": {prediction}')


Accuracy: 0.71
Precision: 0.71
Recall: 1.00
Prediction for "Sunny": Yes
