<a href="https://colab.research.google.com/github/ronakvaghela454/CN6005-Portfolio/blob/main/Week7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [None]:
# Dataset 3 from the tutorial: five loan applications, one list per column.
# The last column (LoanApproved) is the target variable; the others are features.
data = dict(
    EmploymentStatus=['Employed', 'Unemployed', 'Employed', 'Employed', 'Unemployed'],
    CreditHistory=['Good', 'Bad', 'Good', 'Bad', 'Good'],
    IncomeLevel=['High', 'Low', 'Medium', 'Medium', 'Low'],
    LoanApproved=['Yes', 'No', 'Yes', 'No', 'Yes'],
)

# Build the training frame and echo it back
df = pd.DataFrame(data)
print("Dataset loaded successfully:", df, sep="\n")

# Summary: shape, feature columns (all but the last), target column, class labels
print(
    f"\nDataset shape: {df.shape}",
    f"Features: {df.columns[:-1].tolist()}",
    f"Target: {df.columns[-1]}",
    f"Classes: {df['LoanApproved'].unique()}\n",
    sep="\n",
)

Dataset loaded successfully:
  EmploymentStatus CreditHistory IncomeLevel LoanApproved
0         Employed          Good        High          Yes
1       Unemployed           Bad         Low           No
2         Employed          Good      Medium          Yes
3         Employed           Bad      Medium           No
4       Unemployed          Good         Low          Yes

Dataset shape: (5, 4)
Features: ['EmploymentStatus', 'CreditHistory', 'IncomeLevel']
Target: LoanApproved
Classes: ['Yes' 'No']



In [None]:
# Split the frame into the label column (y) and the three predictor columns (X)
y = df['LoanApproved']  # Target
X = df[['EmploymentStatus', 'CreditHistory', 'IncomeLevel']]  # Features

# Echo both parts, then the sample count and the number of feature columns
print("Features (X):", X, sep="\n")
print("\nTarget (y):", y, sep="\n")
print(
    f"\nTotal samples: {len(X)}",
    f"Features per sample: {X.shape[1]}\n",
    sep="\n",
)

Features (X):
  EmploymentStatus CreditHistory IncomeLevel
0         Employed          Good        High
1       Unemployed           Bad         Low
2         Employed          Good      Medium
3         Employed           Bad      Medium
4       Unemployed          Good         Low

Target (y):
0    Yes
1     No
2    Yes
3     No
4    Yes
Name: LoanApproved, dtype: object

Total samples: 5
Features per sample: 3



In [None]:
def calculate_priors(target_series):
    """
    Estimate the prior probability P(class) of every class label.

    Formula: P(class) = count(class) / total_samples

    Prints one line per class showing the count, the total and the resulting
    probability, and returns a dict mapping each class label to its
    probability. Classes appear in the order produced by ``value_counts()``.
    """
    n_total = len(target_series)
    tally = target_series.value_counts()

    # Build the whole mapping first, then report it
    priors = {label: hits / n_total for label, hits in tally.items()}

    print("Class counts:")
    for label, hits in tally.items():
        print(f"  {label}: {hits}/{n_total} = {priors[label]:.4f}")

    return priors

# Compute P(class) from the target column `y`. calculate_priors prints the
# per-class counts itself; the returned dict is kept in `priors` (it is passed
# to predict_naive_bayes in a later cell) and echoed once more below.
priors = calculate_priors(y)
print(f"\nPrior probabilities: {priors}\n")


Class counts:
  Yes: 3/5 = 0.6000
  No: 2/5 = 0.4000

Prior probabilities: {'Yes': 0.6, 'No': 0.4}



In [None]:
def calculate_likelihoods(features_df, target_series):
    """
    Estimate the unsmoothed likelihoods P(feature = value | class).

    For every class in ``target_series`` (in order of first appearance) the
    rows of ``features_df`` belonging to that class are selected, and for each
    feature column the relative frequency of every observed value is computed
    as count / class size. Each probability is printed as it is computed.

    Returns a nested mapping indexed as ``result[feature][class][value]``.
    Only values that actually occur within a class get an entry.
    """
    table = defaultdict(lambda: defaultdict(dict))

    for label in target_series.unique():
        print(f"\nFor class '{label}':")

        # Rows whose target equals the current class
        subset = features_df[target_series == label]
        n_in_class = len(subset)

        print(f"  Total samples in class: {n_in_class}")

        for column in features_df.columns:
            # How often each value of this column occurs inside the class
            tally = subset[column].value_counts()

            per_value = {}
            table[column][label] = per_value

            print(f"  {column}:")
            for observed, hits in tally.items():
                share = hits / n_in_class
                per_value[observed] = share
                print(f"    P({column}={observed} | {label}) = {hits}/{n_in_class} = {share:.4f}")

    return table

# Estimate P(feature=value | class) for every feature column of X, grouped by
# the classes in y. The function prints each probability as it goes; the nested
# mapping is kept in `likelihoods` for the smoothing and prediction cells below.
likelihoods = calculate_likelihoods(X, y)


For class 'Yes':
  Total samples in class: 3
  EmploymentStatus:
    P(EmploymentStatus=Employed | Yes) = 2/3 = 0.6667
    P(EmploymentStatus=Unemployed | Yes) = 1/3 = 0.3333
  CreditHistory:
    P(CreditHistory=Good | Yes) = 3/3 = 1.0000
  IncomeLevel:
    P(IncomeLevel=High | Yes) = 1/3 = 0.3333
    P(IncomeLevel=Medium | Yes) = 1/3 = 0.3333
    P(IncomeLevel=Low | Yes) = 1/3 = 0.3333

For class 'No':
  Total samples in class: 2
  EmploymentStatus:
    P(EmploymentStatus=Unemployed | No) = 1/2 = 0.5000
    P(EmploymentStatus=Employed | No) = 1/2 = 0.5000
  CreditHistory:
    P(CreditHistory=Bad | No) = 2/2 = 1.0000
  IncomeLevel:
    P(IncomeLevel=Low | No) = 1/2 = 0.5000
    P(IncomeLevel=Medium | No) = 1/2 = 0.5000


In [None]:
def get_feature_values_with_smoothing(likelihoods, features_df, alpha=1):
    """
    Collect every distinct value of each feature, for use as the number of
    categories in the Laplace-smoothing denominator.

    Parameters
    ----------
    likelihoods : mapping
        Nested mapping indexed as ``likelihoods[feature][class][value]``.
        Features missing from it are tolerated, and the mapping is only read
        (no entries are inserted, even when it is a defaultdict).
    features_df : pandas.DataFrame
        Training features; its columns decide which features are reported.
    alpha : numeric, default 1
        Smoothing parameter. Accepted for interface compatibility only; this
        function does not use it.

    Returns
    -------
    dict
        ``{feature: [value, ...]}``. The order is deterministic: values found
        in ``likelihoods`` come first (class by class, in stored order),
        followed by any further values in ``features_df`` in order of first
        appearance. Each value is listed once.

    Prints one ``feature: [values]`` line per feature.
    """
    all_values = {}

    for feature in features_df.columns:
        # A dict is used as an ordered set: unlike set(), its iteration order
        # does not depend on string hash randomisation, so re-running the
        # notebook always prints the same lists.
        ordered_values = {}

        # .get() avoids creating empty entries in a defaultdict and avoids a
        # KeyError when the mapping is a plain dict without this feature.
        per_class = likelihoods.get(feature, {})
        for class_name in per_class:
            ordered_values.update(dict.fromkeys(per_class[class_name]))

        # Also add any values in the original data that are not in likelihoods
        ordered_values.update(dict.fromkeys(features_df[feature].unique()))

        all_values[feature] = list(ordered_values)

        print(f"{feature}: {all_values[feature]}")

    return all_values

# Collect the distinct values of every feature column. The lengths of these
# lists are what predict_naive_bayes uses as the number of categories in the
# Laplace-smoothing denominator. alpha is left at its default here.
all_feature_values = get_feature_values_with_smoothing(likelihoods, X)
print(f"\nUnique feature values with smoothing: {all_feature_values}\n")

EmploymentStatus: ['Unemployed', 'Employed']
CreditHistory: ['Bad', 'Good']
IncomeLevel: ['High', 'Medium', 'Low']

Unique feature values with smoothing: {'EmploymentStatus': ['Unemployed', 'Employed'], 'CreditHistory': ['Bad', 'Good'], 'IncomeLevel': ['High', 'Medium', 'Low']}



In [None]:
def predict_naive_bayes(sample_features, priors, likelihoods, all_feature_values, alpha=1, class_counts=None):
    """
    Predict class using Naive Bayes with Laplace smoothing

    For every class the prior is multiplied by the smoothed likelihood of each
    feature value in the sample:

        P(value | class) = (count(value, class) + alpha)
                           / (count(class) + alpha * number_of_feature_values)

    The per-class products are then normalised to sum to 1 and the class with
    the largest probability is returned. Every step is printed.

    Parameters
    ----------
    sample_features : dict
        ``{feature: value}`` for the sample to classify.
    priors : dict
        ``{class: P(class)}``; its keys define the candidate classes.
    likelihoods : mapping
        Unsmoothed likelihoods indexed as ``likelihoods[feature][class][value]``.
        Only read here; missing entries are treated as "value not seen".
    all_feature_values : dict
        ``{feature: [distinct values]}``; only the list lengths are used.
    alpha : numeric, default 1
        Laplace smoothing parameter.
    class_counts : dict, optional
        ``{class: number of training samples in that class}``. When omitted,
        the counts are taken from the notebook-level target Series ``y``, so
        existing four-argument calls keep working.

    Returns
    -------
    tuple
        ``(prediction, probabilities)`` where ``probabilities`` maps each
        class to its normalised probability.
    """
    classes = list(priors.keys())

    if class_counts is None:
        # Backward-compatible fallback: count the training labels held in the
        # notebook-level Series `y`. The previous code used
        # `sum(1 for _ in X[y == class_name])`, which iterates over the
        # DataFrame's column labels and therefore always returned the number
        # of feature columns rather than the number of samples in the class.
        class_counts = {class_name: int((y == class_name).sum()) for class_name in classes}

    probabilities = {}

    print(f"Sample to predict: {sample_features}")
    print("-" * 40)

    for class_name in classes:
        samples_in_class = class_counts[class_name]

        # Start with prior probability
        class_probability = priors[class_name]
        print(f"\nCalculating for class '{class_name}':")
        print(f"  Prior P({class_name}) = {class_probability:.4f}")

        # Multiply by likelihood of each feature
        for feature, value in sample_features.items():
            # Get total unique values for this feature
            num_unique_values = len(all_feature_values[feature])

            # .get() keeps the lookup read-only even for a defaultdict
            class_likelihoods = likelihoods.get(feature, {}).get(class_name, {})
            seen_in_class = value in class_likelihoods

            # Recover the raw count from the stored relative frequency;
            # round() removes floating-point noise such as 1.9999999999999998.
            if seen_in_class:
                feature_count = int(round(class_likelihoods[value] * samples_in_class))
            else:
                feature_count = 0

            # Apply Laplace smoothing
            smoothed_prob = (feature_count + alpha) / (samples_in_class + alpha * num_unique_values)

            class_probability *= smoothed_prob

            if seen_in_class:
                print(f"  P({feature}={value} | {class_name}) = {smoothed_prob:.4f} (after smoothing)")
            else:
                print(f"  P({feature}={value} | {class_name}) = {smoothed_prob:.4f} (UNSEEN VALUE with smoothing)")

        probabilities[class_name] = class_probability
        print(f"  → Unnormalized probability: {class_probability:.6f}")

    # Normalize probabilities (skipped when every product is zero)
    total = sum(probabilities.values())
    if total > 0:
        for class_name in probabilities:
            probabilities[class_name] /= total

    print("\n" + "-" * 40)
    print("FINAL PROBABILITIES (normalized):")
    for class_name, prob in probabilities.items():
        print(f"  P({class_name} | X) = {prob:.6f}")

    # Get prediction
    prediction = max(probabilities, key=probabilities.get)
    print(f"\nPREDICTION: {prediction}")

    return prediction, probabilities

In [None]:
# Task 3 worked examples: two applicant profiles scored with the functional
# Naive Bayes implementation (Test Case 1 is Question 1 from Task 3).
sample1 = dict(EmploymentStatus='Employed', CreditHistory='Good', IncomeLevel='Medium')
sample2 = dict(EmploymentStatus='Unemployed', CreditHistory='Bad', IncomeLevel='Low')

# Each case: banner first, then the step-by-step prediction trace
print("\n" + "=" * 40, "TEST CASE 1: Employed, Good, Medium", "=" * 40, sep="\n")
prediction1, probs1 = predict_naive_bayes(sample1, priors, likelihoods, all_feature_values)

print("\n" + "=" * 40, "TEST CASE 2: Unemployed, Bad, Low", "=" * 40, sep="\n")
prediction2, probs2 = predict_naive_bayes(sample2, priors, likelihoods, all_feature_values)


TEST CASE 1: Employed, Good, Medium
Sample to predict: {'EmploymentStatus': 'Employed', 'CreditHistory': 'Good', 'IncomeLevel': 'Medium'}
----------------------------------------

Calculating for class 'Yes':
  Prior P(Yes) = 0.6000
  P(EmploymentStatus=Employed | Yes) = 0.6000 (after smoothing)
  P(CreditHistory=Good | Yes) = 0.8000 (after smoothing)
  P(IncomeLevel=Medium | Yes) = 0.3333 (after smoothing)
  → Unnormalized probability: 0.096000

Calculating for class 'No':
  Prior P(No) = 0.4000
  P(EmploymentStatus=Employed | No) = 0.5000 (after smoothing)
  P(CreditHistory=Good | No) = 0.2000 (UNSEEN VALUE with smoothing)
  P(IncomeLevel=Medium | No) = 0.4167 (after smoothing)
  → Unnormalized probability: 0.016667

----------------------------------------
FINAL PROBABILITIES (normalized):
  P(Yes | X) = 0.852071
  P(No | X) = 0.147929

PREDICTION: Yes

TEST CASE 2: Unemployed, Bad, Low
Sample to predict: {'EmploymentStatus': 'Unemployed', 'CreditHistory': 'Bad', 'IncomeLevel': 'Lo

In [None]:
class NaiveBayesClassifier:
    """
    A complete Naive Bayes Classifier for categorical features.

    ``fit`` estimates class priors and per-class value frequencies from a
    training DataFrame/Series pair. ``predict`` and ``predict_proba`` score new
    samples with Laplace smoothing:

        P(value | class) = (count(value, class) + alpha)
                           / (count(class) + alpha * number_of_feature_values)

    All statistics needed for prediction are stored on the instance, so
    prediction does not depend on any notebook-level variable.
    """

    def __init__(self, alpha=1):
        """Create an unfitted classifier; ``alpha`` is the smoothing parameter."""
        self.alpha = alpha  # Laplace smoothing parameter
        self.priors = {}  # {class: P(class)}
        # Unsmoothed P(value | class), indexed [feature][class][value]
        self.likelihoods = defaultdict(lambda: defaultdict(dict))
        self.feature_values = {}  # {feature: [distinct training values]}
        self.classes = []
        self.class_counts = {}  # {class: number of training rows}
        # Raw training tallies, indexed [feature][class][value]
        self.feature_value_counts = defaultdict(lambda: defaultdict(dict))

    def fit(self, X, y):
        """
        Train the classifier.

        ``X`` is a DataFrame of categorical features and ``y`` a Series of
        class labels aligned with it. Any statistics from a previous ``fit``
        are discarded first. Returns ``self``.
        """
        print("Training Naive Bayes Classifier...")

        # Start from a clean slate so refitting never mixes in stale entries
        self.priors = {}
        self.likelihoods = defaultdict(lambda: defaultdict(dict))
        self.feature_values = {}
        self.class_counts = {}
        self.feature_value_counts = defaultdict(lambda: defaultdict(dict))

        # Store classes
        self.classes = y.unique()

        total_samples = len(y)
        for class_name in self.classes:
            in_class = y == class_name

            # Prior: share of training labels equal to this class
            self.priors[class_name] = int(in_class.sum()) / total_samples

            class_rows = X[in_class]
            class_count = len(class_rows)
            self.class_counts[class_name] = class_count

            for feature in X.columns:
                for value, count in class_rows[feature].value_counts().items():
                    # Keep the raw tally (used for smoothing) and the
                    # unsmoothed relative frequency
                    self.feature_value_counts[feature][class_name][value] = count
                    self.likelihoods[feature][class_name][value] = count / class_count

        # Store all possible feature values for Laplace smoothing
        for feature in X.columns:
            self.feature_values[feature] = list(X[feature].unique())

        print("Training complete!")
        return self

    def _row_probabilities(self, row, features):
        """Return the normalised ``{class: probability}`` dict for one sample."""
        probs = {}

        for class_name in self.classes:
            class_count = self.class_counts[class_name]
            prob = self.priors[class_name]

            for feature in features:
                num_values = len(self.feature_values[feature])

                # Read-only lookups: values unseen for this class count as 0
                seen_counts = self.feature_value_counts.get(feature, {}).get(class_name, {})
                feature_count = seen_counts.get(row[feature], 0)

                # Apply Laplace smoothing
                prob *= (feature_count + self.alpha) / (class_count + self.alpha * num_values)

            probs[class_name] = prob

        # Normalize (skipped when every product is zero)
        total = sum(probs.values())
        if total > 0:
            for class_name in probs:
                probs[class_name] /= total

        return probs

    def _score(self, X):
        """Return one probability dict per row of ``X`` (a dict is one sample)."""
        if isinstance(X, dict):  # Single sample
            X = pd.DataFrame([X])

        features = list(X.columns)
        return [self._row_probabilities(row, features) for _, row in X.iterrows()]

    def predict(self, X):
        """
        Make predictions.

        ``X`` is a DataFrame or a single ``{feature: value}`` dict. Returns the
        predicted class when exactly one sample is scored, otherwise a list of
        predicted classes.
        """
        predictions = [max(probs, key=probs.get) for probs in self._score(X)]
        return predictions[0] if len(predictions) == 1 else predictions

    def predict_proba(self, X):
        """
        Return probability estimates.

        ``X`` is a DataFrame or a single ``{feature: value}`` dict. Returns one
        ``{class: probability}`` dict when exactly one sample is scored,
        otherwise a list of such dicts.
        """
        probabilities = self._score(X)
        return probabilities[0] if len(probabilities) == 1 else probabilities


In [None]:
# Build the class-based model and fit it on the training data (fit returns self)
classifier = NaiveBayesClassifier().fit(X, y)

print(f"\nPriors: {classifier.priors}", f"Classes: {classifier.classes}", sep="\n")

# Score three applicant profiles with the fitted classifier
print("\nMaking predictions with classifier class:", "-" * 40, sep="\n")

samples_to_predict = [
    dict(EmploymentStatus='Employed', CreditHistory='Good', IncomeLevel='Medium'),
    dict(EmploymentStatus='Unemployed', CreditHistory='Bad', IncomeLevel='Low'),
    dict(EmploymentStatus='Employed', CreditHistory='Bad', IncomeLevel='High'),
]

for i, sample in enumerate(samples_to_predict, start=1):
    print(f"\nSample {i}: {sample}")
    # Predicted label first, then the normalised class probabilities
    prediction, probabilities = classifier.predict(sample), classifier.predict_proba(sample)
    print(f"  Prediction: {prediction}", f"  Probabilities: {probabilities}", sep="\n")


Training Naive Bayes Classifier...
Training complete!

Priors: {'Yes': 0.6, 'No': 0.4}
Classes: ['Yes' 'No']

Making predictions with classifier class:
----------------------------------------

Sample 1: {'EmploymentStatus': 'Employed', 'CreditHistory': 'Good', 'IncomeLevel': 'Medium'}
  Prediction: Yes
  Probabilities: {'Yes': 0.8520710059171597, 'No': 0.14792899408284027}

Sample 2: {'EmploymentStatus': 'Unemployed', 'CreditHistory': 'Bad', 'IncomeLevel': 'Low'}
  Prediction: No
  Probabilities: {'Yes': 0.19354838709677416, 'No': 0.8064516129032259}

Sample 3: {'EmploymentStatus': 'Employed', 'CreditHistory': 'Bad', 'IncomeLevel': 'High'}
  Prediction: No
  Probabilities: {'Yes': 0.47368421052631576, 'No': 0.5263157894736843}


  feature_count = self.likelihoods[feature][class_name][value] * sum(1 for _ in X[y == class_name])
  sum(1 for _ in X[y == class_name]) + self.alpha * num_values
  feature_count = self.likelihoods[feature][class_name][value] * sum(1 for _ in X[y == class_name])
  sum(1 for _ in X[y == class_name]) + self.alpha * num_values
  feature_count = self.likelihoods[feature][class_name][value] * sum(1 for _ in X[y == class_name])
  sum(1 for _ in X[y == class_name]) + self.alpha * num_values
  feature_count = self.likelihoods[feature][class_name][value] * sum(1 for _ in X[y == class_name])
  sum(1 for _ in X[y == class_name]) + self.alpha * num_values
  sum(1 for _ in X[y == class_name]) + self.alpha * num_values
  feature_count = self.likelihoods[feature][class_name][value] * sum(1 for _ in X[y == class_name])
  sum(1 for _ in X[y == class_name]) + self.alpha * num_values
  feature_count = self.likelihoods[feature][class_name][value] * sum(1 for _ in X[y == class_name])
  sum(1 for _ in X[y =

In [None]:
# Hand-worked Naive Bayes for the first sample, using raw relative frequencies
# (no smoothing), so a value never seen within a class contributes a factor of 0.
sample = dict(EmploymentStatus='Employed', CreditHistory='Good', IncomeLevel='Medium')

# Priors
P_Yes, P_No = 3/5, 2/5

# Likelihoods for Yes
P_Emp_Yes, P_Good_Yes, P_Medium_Yes = 2/3, 3/3, 1/3

# Likelihoods for No
P_Emp_No, P_Good_No, P_Medium_No = 1/2, 0/2, 1/2

print(f"\nSample: {sample}")
print("\nManual calculation:")

print(f"\nP(Yes) = {P_Yes}", f"P(No) = {P_No}", sep="\n")

print(
    "\nFor Yes class:",
    f"P(Employed|Yes) = {P_Emp_Yes:.4f}",
    f"P(Good|Yes) = {P_Good_Yes:.4f}",
    f"P(Medium|Yes) = {P_Medium_Yes:.4f}",
    sep="\n",
)

print(
    "\nFor No class:",
    f"P(Employed|No) = {P_Emp_No:.4f}",
    f"P(Good|No) = {P_Good_No:.4f}",
    f"P(Medium|No) = {P_Medium_No:.4f}",
    sep="\n",
)

# Calculate: prior times the product of the three likelihoods
P_Yes_X = P_Yes * P_Emp_Yes * P_Good_Yes * P_Medium_Yes
P_No_X = P_No * P_Emp_No * P_Good_No * P_Medium_No

# Normalize (both fall back to 0 when the two products sum to zero)
total = P_Yes_X + P_No_X
if total > 0:
    P_Yes_X_norm, P_No_X_norm = P_Yes_X / total, P_No_X / total
else:
    P_Yes_X_norm = P_No_X_norm = 0

print(f"\nP(Yes|X) ∝ {P_Yes_X:.6f} → Normalized: {P_Yes_X_norm:.6f}")
print(f"P(No|X) ∝ {P_No_X:.6f} → Normalized: {P_No_X_norm:.6f}")

print("Prediction: Yes" if P_Yes_X_norm > P_No_X_norm else "Prediction: No")


Sample: {'EmploymentStatus': 'Employed', 'CreditHistory': 'Good', 'IncomeLevel': 'Medium'}

Manual calculation:

P(Yes) = 0.6
P(No) = 0.4

For Yes class:
P(Employed|Yes) = 0.6667
P(Good|Yes) = 1.0000
P(Medium|Yes) = 0.3333

For No class:
P(Employed|No) = 0.5000
P(Good|No) = 0.0000
P(Medium|No) = 0.5000

P(Yes|X) ∝ 0.133333 → Normalized: 1.000000
P(No|X) ∝ 0.000000 → Normalized: 0.000000
Prediction: Yes
