<a href="https://colab.research.google.com/github/shailshree23/ML_Lab/blob/main/BayesianDecisionTheory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

# --- Model parameters for the two-hypothesis spam example ---

# Priors P(H): probability of each hypothesis before any evidence is observed.
P_Spam, P_NotSpam = 0.2, 0.8
priors = dict(Spam=P_Spam, NotSpam=P_NotSpam)

# Likelihoods P(D|H): probability of each observation under each hypothesis.
# Under Spam the suspicious word is present with 0.7, absent with 0.3.
P_word_present_given_Spam, P_word_not_present_given_Spam = 0.7, 0.3
# Under NotSpam the suspicious word is present with 0.1, absent with 0.9.
P_word_present_given_NotSpam, P_word_not_present_given_NotSpam = 0.1, 0.9

# Outer key: hypothesis. Inner key: observation.
likelihoods = dict(
    Spam=dict(
        word_present=P_word_present_given_Spam,
        word_not_present=P_word_not_present_given_Spam,
    ),
    NotSpam=dict(
        word_present=P_word_present_given_NotSpam,
        word_not_present=P_word_not_present_given_NotSpam,
    ),
)

# Loss L(true state, chosen action).
# Outer key: true state. Inner key: chosen action. Correct decisions cost 0.
loss_matrix = dict(
    Spam=dict(Classify_Spam=0, Classify_NotSpam=5),
    NotSpam=dict(Classify_Spam=1, Classify_NotSpam=0),
)

# Function to calculate Posterior Probability P(H|D) using Bayes' Theorem
def calculate_posterior(data_observed, prior_H, likelihood_D_given_H, likelihood_D_given_notH, prior_notH):
    """Compute the posterior P(H|D) with Bayes' theorem for two hypotheses.

    Args:
        data_observed: Label of the observation. It is not used in the
            arithmetic; it only documents the call site.
        prior_H: Prior probability P(H) of the hypothesis of interest.
        likelihood_D_given_H: Likelihood P(D|H) of the observation under H.
        likelihood_D_given_notH: Likelihood P(D|notH) under the other hypothesis.
        prior_notH: Prior probability P(notH) of the other hypothesis.

    Returns:
        A ``(posterior, P_D)`` tuple, where ``P_D`` is the evidence
        P(D) = P(D|H)P(H) + P(D|notH)P(notH). When the evidence is zero the
        posterior is undefined and ``(0.0, 0.0)`` is returned.
    """
    # Joint probability P(D, H); it is both the numerator and one term of P(D).
    joint_H = likelihood_D_given_H * prior_H

    # Total probability theorem.
    P_D = joint_H + (likelihood_D_given_notH * prior_notH)

    if P_D == 0:
        # Always return a 2-tuple: callers unpack `posterior, P_D`, so a bare
        # float here would raise TypeError at the call site.
        return 0.0, 0.0

    posterior = joint_H / P_D
    return posterior, P_D

# Function to make a Bayesian decision for observed data
def make_bayesian_decision(observed_data_type):
    """Print the minimum-expected-loss decision for one observation.

    Reads the module-level ``priors``, ``likelihoods`` and ``loss_matrix``,
    computes both posteriors with ``calculate_posterior``, prints the
    posteriors, the expected loss of each action and the chosen action.

    Args:
        observed_data_type: Key into the inner ``likelihoods`` dictionaries,
            e.g. ``'word_present'`` or ``'word_not_present'``.

    Returns:
        None. All results are reported through ``print``.
    """
    print(f"\n--- Making decision for: '{observed_data_type}' ---")

    # Posterior of each hypothesis, treating the other one as "not H".
    posteriors = {}
    for hypothesis, alternative in (('Spam', 'NotSpam'), ('NotSpam', 'Spam')):
        posteriors[hypothesis], _evidence = calculate_posterior(
            observed_data_type,
            priors[hypothesis],
            likelihoods[hypothesis][observed_data_type],
            likelihoods[alternative][observed_data_type],
            priors[alternative],
        )
    posterior_Spam = posteriors['Spam']
    posterior_NotSpam = posteriors['NotSpam']

    print(f"P(Spam | {observed_data_type}) = {posterior_Spam:.4f}")
    print(f"P(NotSpam | {observed_data_type}) = {posterior_NotSpam:.4f}")

    def expected_loss(action):
        # Posterior-weighted loss of taking `action` across both true states.
        return (
            loss_matrix['Spam'][action] * posterior_Spam +
            loss_matrix['NotSpam'][action] * posterior_NotSpam
        )

    expected_loss_classify_Spam = expected_loss('Classify_Spam')
    expected_loss_classify_NotSpam = expected_loss('Classify_NotSpam')

    print(f"Expected Loss if we Classify as Spam: {expected_loss_classify_Spam:.4f}")
    print(f"Expected Loss if we Classify as Not Spam: {expected_loss_classify_NotSpam:.4f}")

    # Pick the action with the smaller expected loss; a tie is reported as such.
    if expected_loss_classify_Spam < expected_loss_classify_NotSpam:
        message = "\nDecision: Classify as SPAM (minimizes expected loss)"
    elif expected_loss_classify_Spam > expected_loss_classify_NotSpam:
        message = "\nDecision: Classify as NOT SPAM (minimizes expected loss)"
    else:
        message = "\nDecision: Indifferent (expected losses are equal)"
    print(message)

# --- Run the simulation for different observations ---

# Baseline costs, observation 1: the suspicious word appears in the email.
make_bayesian_decision('word_present')

# Baseline costs, observation 2: the suspicious word is absent.
make_bayesian_decision('word_not_present')

# --- Sensitivity check: same observations, different error costs ---
print("\n--- Let's try changing the Loss Matrix ---")
print("Scenario: It's much, much worse to miss a legitimate email (False Positive cost increases).")
loss_matrix_alt = dict(
    Spam=dict(Classify_Spam=0, Classify_NotSpam=5),      # missing spam still costs 5
    NotSpam=dict(Classify_Spam=10, Classify_NotSpam=0),  # flagging a good email now costs 10
)
# make_bayesian_decision reads the global loss_matrix, so rebind it before re-running.
loss_matrix = loss_matrix_alt

make_bayesian_decision('word_present')
make_bayesian_decision('word_not_present')


--- Making decision for: 'word_present' ---
P(Spam | word_present) = 0.6364
P(NotSpam | word_present) = 0.3636
Expected Loss if we Classify as Spam: 0.3636
Expected Loss if we Classify as Not Spam: 3.1818

Decision: Classify as SPAM (minimizes expected loss)

--- Making decision for: 'word_not_present' ---
P(Spam | word_not_present) = 0.0769
P(NotSpam | word_not_present) = 0.9231
Expected Loss if we Classify as Spam: 0.9231
Expected Loss if we Classify as Not Spam: 0.3846

Decision: Classify as NOT SPAM (minimizes expected loss)

--- Let's try changing the Loss Matrix ---
Scenario: It's much, much worse to miss a legitimate email (False Positive cost increases).

--- Making decision for: 'word_present' ---
P(Spam | word_present) = 0.6364
P(NotSpam | word_present) = 0.3636
Expected Loss if we Classify as Spam: 3.6364
Expected Loss if we Classify as Not Spam: 3.1818

Decision: Classify as NOT SPAM (minimizes expected loss)

--- Making decision for: 'word_not_present' ---
P(Spam | word_n

In [2]:
import numpy as np

# --- Model parameters for the two-hypothesis spam example ---

# Priors P(H): probability of each hypothesis before any evidence is observed.
P_Spam, P_NotSpam = 0.2, 0.8
priors = dict(Spam=P_Spam, NotSpam=P_NotSpam)

# Likelihoods P(D|H): probability of each observation under each hypothesis.
# Under Spam the suspicious word is present with 0.7, absent with 0.3.
P_word_present_given_Spam, P_word_not_present_given_Spam = 0.7, 0.3
# Under NotSpam the suspicious word is present with 0.1, absent with 0.9.
P_word_present_given_NotSpam, P_word_not_present_given_NotSpam = 0.1, 0.9

# Outer key: hypothesis. Inner key: observation.
likelihoods = dict(
    Spam=dict(
        word_present=P_word_present_given_Spam,
        word_not_present=P_word_not_present_given_Spam,
    ),
    NotSpam=dict(
        word_present=P_word_present_given_NotSpam,
        word_not_present=P_word_not_present_given_NotSpam,
    ),
)

# Default loss L(true state, chosen action).
# Outer key: true state. Inner key: chosen action. Correct decisions cost 0.
loss_matrix = dict(
    # False negative (spam let through) costs 5.
    Spam=dict(Classify_Spam=0, Classify_NotSpam=5),
    # False positive (good email marked as spam) costs 1.
    NotSpam=dict(Classify_Spam=1, Classify_NotSpam=0),
)

# Function to calculate Posterior Probability P(H|D) using Bayes' Theorem
def calculate_posterior(data_observed_key, prior_H, likelihood_D_given_H, likelihood_D_given_notH, prior_notH):
    """Apply Bayes' theorem to get P(H|D) in a two-hypothesis setting.

    Args:
        data_observed_key: Label of the observation. It does not take part in
            the computation.
        prior_H: Prior P(H) of the hypothesis being evaluated.
        likelihood_D_given_H: P(D|H), likelihood of the observation under H.
        likelihood_D_given_notH: P(D|other hypothesis).
        prior_notH: Prior of the other hypothesis.

    Returns:
        ``(posterior, P_D)``, where ``P_D`` is the total probability of the
        observation. ``(0.0, 0.0)`` is returned when ``P_D`` equals zero.
    """
    # Contribution of each hypothesis to P(D) (total probability theorem).
    weighted_H = likelihood_D_given_H * prior_H
    weighted_notH = likelihood_D_given_notH * prior_notH
    evidence = weighted_H + weighted_notH

    if evidence != 0:
        # P(H|D) = P(D|H) * P(H) / P(D)
        return weighted_H / evidence, evidence

    # Zero evidence: the posterior is undefined, so report zeros instead of dividing.
    return 0.0, 0.0

# Function to make a Bayesian decision for observed data
def make_bayesian_decision_interactive(observed_data_key, current_loss_matrix):
    """Print and return the minimum-expected-loss decision for one observation.

    Uses the module-level ``priors`` and ``likelihoods`` together with the
    loss matrix passed in by the caller.

    Args:
        observed_data_key: Key into the inner ``likelihoods`` dictionaries,
            e.g. ``'word_present'``. Underscores are shown as spaces in output.
        current_loss_matrix: Nested dict ``{true_state: {action: cost}}`` with
            true states ``'Spam'``/``'NotSpam'`` and actions
            ``'Classify_Spam'``/``'Classify_NotSpam'``.

    Returns:
        ``"Spam"``, ``"Not Spam"`` or ``"Indifferent"`` (when both expected
        losses are equal).
    """
    # Human-readable form of the observation key, reused in every message.
    label = observed_data_key.replace('_', ' ')
    print(f"\n--- Making decision for: '{label}' ---")

    # P(Spam | D) from Bayes' theorem; the evidence term is not needed afterwards.
    posterior_Spam, _evidence = calculate_posterior(
        observed_data_key,
        priors['Spam'],
        likelihoods['Spam'][observed_data_key],
        likelihoods['NotSpam'][observed_data_key],
        priors['NotSpam'],
    )
    # With only two hypotheses the posteriors are complementary.
    posterior_NotSpam = 1.0 - posterior_Spam

    print(f"P(Spam | {label}) = {posterior_Spam:.4f}")
    print(f"P(NotSpam | {label}) = {posterior_NotSpam:.4f}")

    def expected_loss(action):
        # Posterior-weighted loss of taking `action` across both true states.
        return (
            current_loss_matrix['Spam'][action] * posterior_Spam +
            current_loss_matrix['NotSpam'][action] * posterior_NotSpam
        )

    expected_loss_classify_Spam = expected_loss('Classify_Spam')
    expected_loss_classify_NotSpam = expected_loss('Classify_NotSpam')

    print(f"Expected Loss if we Classify as Spam: {expected_loss_classify_Spam:.4f}")
    print(f"Expected Loss if we Classify as Not Spam: {expected_loss_classify_NotSpam:.4f}")

    # Select the cheaper action; equal losses yield an explicit "Indifferent".
    if expected_loss_classify_Spam < expected_loss_classify_NotSpam:
        message, verdict = "\nDecision: Classify as SPAM (minimizes expected loss)", "Spam"
    elif expected_loss_classify_Spam > expected_loss_classify_NotSpam:
        message, verdict = "\nDecision: Classify as NOT SPAM (minimizes expected loss)", "Not Spam"
    else:
        message, verdict = "\nDecision: Indifferent (expected losses are equal)", "Indifferent"

    print(message)
    return verdict

# --- Main interactive loop ---
def main():
    """Run the interactive console loop of the Bayesian email classifier.

    Each iteration prints the current loss settings and a four-item menu:
    decide for "word present", decide for "word not present", edit the two
    error costs, or exit. Decisions are delegated to
    ``make_bayesian_decision_interactive``.

    The loop works on a private deep copy of the module-level ``loss_matrix``,
    so edits made through the menu never alter the default matrix. A cost
    edit is applied only if every value entered is a valid finite number;
    otherwise the settings are left untouched.
    """
    # Local imports keep this block self-contained.
    import copy
    import math

    def read_cost(prompt):
        """Return the cost typed by the user, or None if the answer is blank."""
        raw = input(prompt).strip()
        if not raw:
            return None  # blank (or whitespace-only) answer keeps the old value
        cost = float(raw)  # raises ValueError on non-numeric text
        if not math.isfinite(cost):
            # nan/inf would make the expected-loss comparison meaningless.
            raise ValueError(f"cost must be a finite number, got {raw!r}")
        return cost

    print("Welcome to the Interactive Bayesian Email Classifier!")
    print("We're checking for a 'suspicious word' in an email.")

    # Deep copy: the matrix is a dict of dicts, and a plain assignment (or a
    # shallow copy) would let the edits below mutate the module-level default.
    current_loss_setting = copy.deepcopy(loss_matrix)

    while True:
        print("\n--- Current Loss Settings ---")
        for true_state, actions in current_loss_setting.items():
            for action, cost in actions.items():
                print(f"  Cost for (True: {true_state}, Action: {action.replace('_',' ')}) = {cost}")

        print("\nWhat did you observe in the email?")
        print("1. 'Suspicious word' IS present")
        print("2. 'Suspicious word' is NOT present")
        print("3. Change loss matrix (costs of errors)")
        print("4. Exit")

        choice = input("Enter your choice (1/2/3/4): ").strip()

        if choice == '1':
            make_bayesian_decision_interactive('word_present', current_loss_setting)
        elif choice == '2':
            make_bayesian_decision_interactive('word_not_present', current_loss_setting)
        elif choice == '3':
            print("\n--- Change Loss Matrix ---")
            print("Enter new costs. If you enter nothing, the old value will be kept.")

            try:
                # Read and validate both answers before touching the settings,
                # so a bad second answer cannot leave a half-applied update.
                # False negative: true Spam classified as NotSpam.
                new_fn_cost = read_cost(f"Current cost for (True: Spam, Action: Classify Not Spam - Missed Spam) is {current_loss_setting['Spam']['Classify_NotSpam']}. Enter new cost: ")
                # False positive: true NotSpam classified as Spam.
                new_fp_cost = read_cost(f"Current cost for (True: Not Spam, Action: Classify Spam - Good email to spam) is {current_loss_setting['NotSpam']['Classify_Spam']}. Enter new cost: ")
            except ValueError:
                print("Invalid input for cost. Please enter a number.")
            else:
                if new_fn_cost is not None:
                    current_loss_setting['Spam']['Classify_NotSpam'] = new_fn_cost
                if new_fp_cost is not None:
                    current_loss_setting['NotSpam']['Classify_Spam'] = new_fp_cost
                print("Loss matrix updated!")
        elif choice == '4':
            print("Exiting. Goodbye!")
            break
        else:
            print("Invalid choice. Please enter 1, 2, 3, or 4.")

# Start the interactive session when executed as the main module
# (which includes running this cell in a notebook kernel).
if __name__ == "__main__":
    main()

Welcome to the Interactive Bayesian Email Classifier!
We're checking for a 'suspicious word' in an email.

--- Current Loss Settings ---
  Cost for (True: Spam, Action: Classify Spam) = 0
  Cost for (True: Spam, Action: Classify NotSpam) = 5
  Cost for (True: NotSpam, Action: Classify Spam) = 1
  Cost for (True: NotSpam, Action: Classify NotSpam) = 0

What did you observe in the email?
1. 'Suspicious word' IS present
2. 'Suspicious word' is NOT present
3. Change loss matrix (costs of errors)
4. Exit
Enter your choice (1/2/3/4): 2

--- Making decision for: 'word not present' ---
P(Spam | word not present) = 0.0769
P(NotSpam | word not present) = 0.9231
Expected Loss if we Classify as Spam: 0.9231
Expected Loss if we Classify as Not Spam: 0.3846

Decision: Classify as NOT SPAM (minimizes expected loss)

--- Current Loss Settings ---
  Cost for (True: Spam, Action: Classify Spam) = 0
  Cost for (True: Spam, Action: Classify NotSpam) = 5
  Cost for (True: NotSpam, Action: Classify Spam) = 