In [10]:
import string
from collections import defaultdict
import math

# Sample dataset: (message text, label) pairs, where the label is "spam" or "ham"
data = [
    ("Buy cheap medicines now", "spam"),
    ("Limited offer just for you", "spam"),
    ("Call me when you get time", "ham"),
    ("Let's catch up over lunch", "ham"),
    ("Win cash prize easily", "spam"),
    ("Don't forget to submit the report", "ham"),
]

# 1. Preprocessing function
def preprocess(text):
    """Turn a raw message into a list of word tokens.

    The text is lowercased, every character found in ``string.punctuation``
    is dropped, and what remains is split on runs of whitespace.

    Args:
        text: The message as a string.

    Returns:
        A list of lowercase tokens (empty when nothing but punctuation or
        whitespace is left).
    """
    lowered = text.lower()
    kept_chars = [ch for ch in lowered if ch not in string.punctuation]
    return "".join(kept_chars).split()

# 2. Training function
def train_naive_bayes(data):
    """Estimate log-priors and Laplace-smoothed word log-likelihoods.

    Args:
        data: Iterable of ``(text, label)`` pairs.

    Returns:
        A 5-tuple ``(priors, likelihoods, vocabulary, class_counts, word_totals)``:
        ``priors`` maps label -> log of the fraction of messages with that label;
        ``likelihoods`` maps label -> {word: log((count + 1) / (total + |V|))}
        for every word in the vocabulary; ``vocabulary`` is the set of all
        tokens seen; ``class_counts`` is the number of messages per label;
        ``word_totals`` is the number of tokens per label.

    Side effects:
        Prints the ``priors`` dictionary.
    """
    per_class_word_freq = defaultdict(lambda: defaultdict(int))  # label -> word -> count
    class_counts = defaultdict(int)  # label -> number of messages
    word_totals = defaultdict(int)   # label -> number of tokens
    vocabulary = set()

    for message, label in data:
        class_counts[label] += 1
        for token in preprocess(message):
            vocabulary.add(token)
            per_class_word_freq[label][token] += 1
            word_totals[label] += 1

    n_docs = sum(class_counts.values())
    vocab_size = len(vocabulary)

    # Log prior: share of the training messages carrying each label
    priors = {}
    for label, n_label_docs in class_counts.items():
        priors[label] = math.log(n_label_docs / n_docs)

    # Log likelihood with add-one (Laplace) smoothing over the whole vocabulary
    likelihoods = {}
    for label in class_counts:
        word_log_probs = {}
        for word in vocabulary:
            smoothed = (per_class_word_freq[label][word] + 1) / (word_totals[label] + vocab_size)
            word_log_probs[word] = math.log(smoothed)
        likelihoods[label] = word_log_probs

    print(priors)
    return priors, likelihoods, vocabulary, class_counts, word_totals

# 3. Prediction function
def predict(text, priors, likelihoods, vocabulary, class_counts, word_totals):
    """Return the label with the highest log-posterior score for ``text``.

    Args:
        text: Message to classify.
        priors: Mapping label -> log prior.
        likelihoods: Mapping label -> {word: log likelihood}.
        vocabulary: Collection of words known from training.
        class_counts: Mapping whose keys are the candidate labels.
        word_totals: Mapping label -> total token count for that label.

    Returns:
        The label whose score (log prior plus the sum of per-token log
        likelihoods) is largest; on ties the first such label in
        ``class_counts`` order wins.
    """
    words = preprocess(text)
    vocab_size = len(vocabulary)

    def word_log_prob(label, word):
        # Out-of-vocabulary words get the add-one smoothed probability of a
        # word with zero occurrences in this class.
        if word not in vocabulary:
            return math.log(1 / (word_totals[label] + vocab_size))
        return likelihoods[label][word]

    scores = {}
    for label in class_counts:
        running = priors[label]
        for word in words:
            running += word_log_prob(label, word)
        scores[label] = running

    return max(scores, key=lambda label: scores[label])

# Fit the model on the sample messages
(priors, likelihoods, vocabulary,
 class_counts, word_totals) = train_naive_bayes(data)

# Classify one unseen message and report the outcome
test_message = "Get your cash prize now"
prediction = predict(
    test_message,
    priors=priors,
    likelihoods=likelihoods,
    vocabulary=vocabulary,
    class_counts=class_counts,
    word_totals=word_totals,
)
print(f"Message: '{test_message}'", f"Predicted class: {prediction}", sep="\n")


{'spam': -0.6931471805599453, 'ham': -0.6931471805599453}
Message: 'Get your cash prize now'
Predicted class: spam


In [16]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
#from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import LabelEncoder

# Load the dataset: every column but the last is a feature, the last is the label
df = pd.read_csv('iris.csv')
X = df.iloc[:, :-1].values

# Encode the label column as integer codes
le = LabelEncoder()
y = le.fit_transform(df.iloc[:, -1].values)

# Split data into training and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 1: Define a Gaussian Naive Bayes classifier
class GaussianNB:
    """Gaussian Naive Bayes classifier with per-class, per-feature normals.

    Each feature is modelled as an independent normal distribution within
    each class. Prediction picks the class with the largest log-posterior.

    Args:
        var_smoothing: Fraction of the largest overall feature variance that
            is added to every per-class variance. This keeps variances
            strictly positive (whenever at least one feature varies), so a
            feature that is constant inside one class does not cause a
            division by zero.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate class priors, feature means and feature variances.

        Args:
            X: Array-like of shape (n_samples, n_features) with numeric values.
            y: Array-like of n_samples class labels (any hashable values).

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If ``X`` and ``y`` have a different number of rows.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )

        self.classes = np.unique(y)
        self.mean = {}
        self.var = {}
        self.priors = {}

        # Same additive smoothing for every class and feature, scaled by the
        # largest variance of any feature over the whole training set.
        epsilon = self.var_smoothing * np.var(X, axis=0).max()

        for cls in self.classes:
            X_c = X[y == cls]
            self.mean[cls] = np.mean(X_c, axis=0)
            self.var[cls] = np.var(X_c, axis=0) + epsilon
            self.priors[cls] = X_c.shape[0] / X.shape[0]
        return self

    def gaussian_pdf(self, class_idx, x):
        """Return the per-feature normal density of ``x`` for one class.

        Args:
            class_idx: Class label used as the key into the fitted
                ``mean``/``var`` dictionaries (a label, not a position).
            x: One sample as an array of feature values.
        """
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        numerator = np.exp(- ((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

    def _log_gaussian_pdf(self, class_idx, x):
        """Per-feature log-density, computed without exponentiating first.

        Working in log space avoids the density underflowing to 0 (and its
        log becoming -inf) for samples far from the class mean.
        """
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        return -0.5 * (np.log(2 * np.pi * var) + ((x - mean) ** 2) / var)

    def predict(self, X):
        """Predict the class label of every row in ``X``.

        Args:
            X: Array-like of shape (n_samples, n_features).

        Returns:
            NumPy array with one predicted label per row, drawn from the
            labels seen in ``fit``.
        """
        X = np.asarray(X, dtype=float)
        y_pred = []
        for x in X:
            posteriors = [
                np.log(self.priors[cls]) + np.sum(self._log_gaussian_pdf(cls, x))
                for cls in self.classes
            ]
            # argmax is a position in self.classes; translate it to the label.
            y_pred.append(self.classes[np.argmax(posteriors)])
        return np.array(y_pred)

# Fit on the training split, then label the held-out rows
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Step 2: Evaluate accuracy (share of test rows predicted correctly)
accuracy = (y_pred == y_test).sum() / len(y_test)
print("Predictions:", y_pred)
print("Actual     :", y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")



Predictions: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
Actual     : [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
Accuracy: 100.00%


In [15]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
#import seaborn as sns
#import matplotlib.pyplot as plt

# 1. Load the dataset
data = load_breast_cancer()
X, y = data.data, data.target  # X shape: (569, 30); y: 0 = malignant, 1 = benign

# 2. Split the dataset (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Train Gaussian Naive Bayes model
model = GaussianNB()
model.fit(X_train, y_train)

# 4. Make predictions
y_pred = model.predict(X_test)

# 5. Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%\n")

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=data.target_names))

# Confusion matrix
print(confusion_matrix(y_test, y_pred))



Accuracy: 97.37%

Classification Report:
              precision    recall  f1-score   support

   malignant       1.00      0.93      0.96        43
      benign       0.96      1.00      0.98        71

    accuracy                           0.97       114
   macro avg       0.98      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

[[40  3]
 [ 0 71]]
