* [Naive-Bayes](https://en.wikipedia.org/wiki/Naive_Bayes_classifier)

In [1]:
import math
import random
import pandas as pd
import numpy as np

$p(Y \mid \mathbf{x}) = \frac{p(Y) \ p(\mathbf{x} \mid Y)}{p(\mathbf{x})} \propto p(Y) \ p(\mathbf{x} \mid Y)$   - Eq1


$\hat{y} = \underset{k \in \{1, \ldots, K\}}{\operatorname{argmax}} \ p(C_k) \displaystyle\prod_{i=1}^n p(x_i \mid C_k)$  - Eq2

In [22]:
# Load data using pandas.
# `df` is reused by the scikit-learn cell; `mydata` feeds the from-scratch classifier.
filename = '~/Downloads/diabetes.csv'  # Add the correct file path
df = pd.read_csv(filename)
# Row-major list of lists; the last element of every row is the class label.
# NOTE: `.values` returns one homogeneous array, so the integer columns are upcast to
# float alongside BMI / DiabetesPedigreeFunction. The labels in `mydata` are therefore
# 0.0 / 1.0 (floats), which is why the class keys printed further down are floats.
mydata = df.values.tolist()

# Disabled preprocessing, kept for reference: encode classes and convert attributes to float.
# `encode_class` is not defined anywhere in the visible part of this notebook, and the
# float conversion is already a side effect of `.values` above.
#mydata = encode_class(mydata)
#for i in range(len(mydata)):
#    for j in range(len(mydata[i]) - 1):
#        mydata[i][j] = float(mydata[i][j])

In [23]:
# Preview the first five rows to sanity-check the load (feature columns first, label last).
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Scikit-learn implementation

In [86]:
# Column layout convention: every column except the last is a feature,
# and the final column is the target to predict.
inputs = list(df.columns[:-1])
output = df.columns[-1]
print(inputs)
print(output)

['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
Outcome


In [87]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Build plain Python lists for scikit-learn: X holds the feature rows, y the target column.
X = df[inputs].values.tolist()
y = df[output].values.tolist()

# Step 1: Preprocessing (example for categorical data encoding)
# LabelEncoder maps the distinct labels to consecutive integers starting at 0.
# NOTE(review): the Outcome values shown in the preview are already 0/1, so this looks
# like an identity mapping here - confirm before relying on `label_encoder.classes_`.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Convert labels to numeric if necessary

# Step 2: Split the data into training and testing sets (70% / 30%, fixed seed).
# WARNING: `train_test_split` here is scikit-learn's, imported at the top of this cell.
# A later cell defines its own function with the same name and a different signature;
# after that cell has run, this line only works again once this cell's import is re-executed.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

# Step 3: Choose and train a Naïve Bayes model
# Use Gaussian Naïve Bayes if features are continuous
model = GaussianNB()

# If you have categorical data, you might use Multinomial or Bernoulli Naïve Bayes
# model = MultinomialNB()
# model = BernoulliNB()

model.fit(X_train, y_train)

# Step 4: Make predictions and evaluate the model on the held-out 30%
y_pred = model.predict(X_test)

# Evaluation: per-class precision/recall/F1, then the confusion matrix
# (rows = true class, columns = predicted class).
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.82      0.79      0.80       151
           1       0.62      0.66      0.64        80

    accuracy                           0.74       231
   macro avg       0.72      0.73      0.72       231
weighted avg       0.75      0.74      0.75       231

[[119  32]
 [ 27  53]]


In [89]:

# # -----------------------------
# # 1) Dataset
# # -----------------------------
# mydata = [
#     [6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0],
#     [1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.351, 31.0, 0.0],
#     [8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 1.0],
#     [1.0, 89.0, 66.0, 23.0, 94.0, 28.1, 0.167, 21.0, 0.0],
#     [0.0, 137.0, 40.0, 35.0, 168.0, 43.1, 2.288, 33.0, 1.0]
# ]

# -----------------------------
# 2) Helper functions
# -----------------------------
def train_test_split(dataset, test_size=0.4, seed=42):
    """
    Shuffle a copy of `dataset` reproducibly and cut it into (train, test).

    Args:
        dataset: list of rows; it is sliced to make the copy, so the caller's
            list is never reordered.
        test_size: fraction of rows assigned to the test part.
        seed: value passed to `random.seed`; note this reseeds the module-level
            generator of `random`, not a private one.

    Returns:
        (train_rows, test_rows), where train_rows holds the first
        int(len(dataset) * (1 - test_size)) shuffled rows.

    NOTE: this name is also imported from scikit-learn in an earlier cell of this
    notebook; whichever of the two ran last is the one in effect.
    """
    shuffled = dataset[:]
    random.seed(seed)
    random.shuffle(shuffled)

    cut = int(len(shuffled) * (1 - test_size))
    train_rows = shuffled[:cut]
    test_rows = shuffled[cut:]
    return train_rows, test_rows

def separate_by_class(dataset):
    """
    Group rows by their class label.

    The label is read from the last element of each row. Returns a dict mapping
    each label to the list of full rows (label included) carrying it, with labels
    in order of first appearance and rows in their original order.
    """
    groups = {}
    for sample in dataset:
        groups.setdefault(sample[-1], []).append(sample)
    return groups

def mean(values):
    """
    Arithmetic mean of a non-empty sequence of numbers, always as a float.
    Raises ZeroDivisionError when `values` is empty.
    """
    total = sum(values)
    count = float(len(values))
    return total / count

def stdev(values):
    """
    Compute the sample standard deviation of a list of numbers.
    Uses (N-1) in the denominator (unbiased sample variance).

    Returns 0.0 when fewer than two values are supplied, because the sample
    variance is undefined there. The size check must come first: dividing by
    (N-1) before checking would raise ZeroDivisionError for a single value.
    """
    n = len(values)
    if n < 2:
        return 0.0

    avg = mean(values)
    variance = sum((x - avg) ** 2 for x in values) / (n - 1)
    return math.sqrt(variance)

def summarize_dataset(dataset):
    """
    Per-feature statistics for a block of rows.

    The last element of every row is taken to be the label and is skipped; the
    number of features is read from the first row. Returns one (mean, stdev)
    tuple per feature column, in column order.
    Raises IndexError when `dataset` is empty.
    """
    feature_count = len(dataset[0]) - 1
    # Columns are extracted lazily, one at a time, in feature order.
    columns = ([sample[idx] for sample in dataset] for idx in range(feature_count))
    return [(mean(column), stdev(column)) for column in columns]

def summarize_by_class(dataset):
    """
    Per-class feature statistics.

    Groups the rows by label, then summarizes each group on its own.
    Returns a dict of class_value -> list of (mean, stdev), one tuple per feature.
    """
    return {
        label: summarize_dataset(members)
        for label, members in separate_by_class(dataset).items()
    }

def calculate_gaussian_probability(x, mean, stdev):
    """
    Density of a normal distribution N(mean, stdev^2) evaluated at `x`.

    A zero `stdev` has no density, so that case is special-cased: 1.0 when `x`
    sits exactly on `mean`, otherwise a tiny constant (1e-9) so a product of
    likelihoods is not zeroed outright.
    """
    if stdev != 0:
        decay = math.exp(-((x - mean)**2 / (2 * stdev**2)))
        scale = 1 / (math.sqrt(2 * math.pi) * stdev)
        return scale * decay

    # Degenerate distribution: all mass on a single point.
    if x == mean:
        return 1.0
    return 1e-9

def calculate_class_probabilities(summaries, row):
    """
    Unfinished first draft of the posterior computation; it always returns None.

    NOTE: this definition is superseded. A later function with the same name (taking
    an extra `total_rows` argument) is defined further down in this cell and replaces
    this one as soon as the cell has run, so nothing ever calls this version.
    """
    # Known-wrong bookkeeping kept from the draft: each value in `summaries` holds one
    # entry per feature, so this adds up feature counts, not data rows. Class priors
    # need the number of rows per class from the original dataset, which is why the
    # later version works from per-class counts stored next to the summaries.
    total_rows = sum(len(class_summaries) for class_summaries in summaries.values())
    return None

# To handle prior probabilities properly, let's rewrite `summarize_by_class` to also return class_counts:
def summarize_by_class_with_counts(dataset):
    """
    Per-class feature statistics plus the class size needed for the prior.

    Returns:
    {
      class_value: {
         'summaries': [(mean1, stdev1), (mean2, stdev2), ...],
         'count': number_of_rows_in_this_class
      }
    }
    """
    return {
        label: {
            'summaries': summarize_dataset(members),
            'count': len(members)
        }
        for label, members in separate_by_class(dataset).items()
    }

def calculate_class_probabilities(summaries, row, total_rows):
    """
    Unnormalized posterior score of every class for one row (Eq2).

    Args:
        summaries: dict of class_value -> {'summaries': [(mean, stdev), ...], 'count': n}.
        row: feature values, indexed in the same order as the per-class summaries;
            any extra trailing elements (such as the label) are ignored.
        total_rows: number of training rows, the denominator of the class prior.

    Returns:
        dict of class_value -> prior * product of per-feature Gaussian likelihoods.
        The scores are not normalized; only their relative size is meaningful.
    """
    scores = {}

    for label, info in summaries.items():
        # Start from the prior P(C_k) = rows in this class / all training rows.
        score = info['count'] / float(total_rows)

        # Multiply in P(x_i | C_k) for each feature, in feature order.
        for idx, (mu, sigma) in enumerate(info['summaries']):
            score *= calculate_gaussian_probability(row[idx], mu, sigma)

        scores[label] = score

    return scores

def predict(summaries, row, total_rows):
    """
    Classify one row: score every class and return the label with the largest score.

    The score dict is printed on every call (debug trace). On a tie the class seen
    first wins; with no classes at all the result is None.
    """
    scores = calculate_class_probabilities(summaries, row, total_rows)
    print(scores)

    winner = None
    winner_score = -1
    for label, score in scores.items():
        # Keep the current winner unless this class scores strictly higher.
        if winner is not None and not (score > winner_score):
            continue
        winner, winner_score = label, score
    return winner

def get_predictions(summaries, test, total_rows):
    """
    Run `predict` on every row of `test` and return the labels in the same order.
    """
    return [predict(summaries, sample, total_rows) for sample in test]

def accuracy_metric(actual, predicted):
    """
    Compute the accuracy as (number correct / total) * 100.

    Args:
        actual: sequence of true labels; its length defines the total.
        predicted: sequence of predicted labels, compared position by position.
            It must be at least as long as `actual` (a shorter one raises IndexError).

    Returns:
        Percentage of matching positions as a float in [0.0, 100.0].
        An empty `actual` (e.g. a split that leaves no test rows) yields 0.0
        instead of raising ZeroDivisionError.
    """
    total = len(actual)
    if total == 0:
        # Nothing to score: report 0% rather than dividing by zero.
        return 0.0

    correct = sum(1 for i in range(total) if actual[i] == predicted[i])
    return (correct / float(total)) * 100.0

# -----------------------------
# 3) Putting it all together
# -----------------------------
def naive_bayes_train_test(dataset, test_ratio=0.4):
    """
    End-to-end run: split, fit Gaussian Naive Bayes statistics, predict, score.

    Args:
        dataset: list of rows whose last element is the class label.
        test_ratio: fraction of rows held out for testing.

    Returns:
        dict with 'train_size', 'test_size', 'predictions', 'actual' (both in
        test-row order) and 'accuracy' (percentage).

    The fitted per-class summaries are printed once, and `predict` prints the
    class scores for every test row.
    """
    # Hold out part of the data; this relies on the split helper defined in this
    # cell, which takes (dataset, test_size) and returns (train, test).
    train, test = train_test_split(dataset, test_size=test_ratio)

    # Fit: per-class (mean, stdev) for every feature, plus class sizes for the priors.
    summaries = summarize_by_class_with_counts(train)
    print(summaries)

    # Priors are estimated from the training rows only.
    n_train = len(train)

    actual, predictions = [], []
    for sample in test:
        actual.append(sample[-1])  # true label lives in the last column
        predictions.append(predict(summaries, sample, n_train))

    return {
        'train_size': n_train,
        'test_size': len(test),
        'predictions': predictions,
        'actual': actual,
        'accuracy': accuracy_metric(actual, predictions)
    }

# -----------------------------
# 4) Run the classifier on the sample data
# -----------------------------
# Driver: fit and evaluate the from-scratch classifier on `mydata` (loaded in an earlier
# cell) with a 70/30 split. In a notebook kernel `__name__` is "__main__", so this block
# runs whenever the cell is executed.
if __name__ == "__main__":
    results = naive_bayes_train_test(mydata, test_ratio=0.3)
    print("Train Size:", results['train_size'])
    print("Test Size:", results['test_size'])
    # The next two lines print one label per test row, so the output grows with the test set.
    print("Actual:     ", results['actual'])
    print("Predictions:", results['predictions'])
    print(f"Accuracy: {results['accuracy']:.2f}%")


{0.0: {'summaries': [(3.2341040462427744, 3.0436617783563014), (110.56647398843931, 26.61638217874151), (68.04624277456648, 18.589326851687254), (19.378612716763005, 15.033431511049075), (68.40751445086705, 95.2896657940275), (30.219942196531786, 7.820362199040739), (0.4399450867052022, 0.2962485049513914), (31.320809248554912, 11.455191870610197)], 'count': 346}, 1.0: {'summaries': [(4.816753926701571, 3.7067409344218696), (141.3193717277487, 32.4943329734997), (71.66492146596859, 20.192045720742033), (22.31413612565445, 17.770636903620538), (103.92670157068063, 142.27465975917633), (35.33612565445027, 7.141775075303977), (0.5361884816753928, 0.3479684136804108), (36.968586387434556, 11.009042101435623)], 'count': 191}}
{0.0: 1.4001382334414657e-15, 1.0: 1.974160575947222e-15}
{0.0: 2.5169927639478417e-12, 1.0: 6.499863855587927e-14}
{0.0: 4.005555727214499e-13, 1.0: 8.032436433079071e-14}
{0.0: 1.5058580913796468e-12, 1.0: 4.595324795791697e-14}
{0.0: 2.338911794221136e-13, 1.0: 1.62