<h1>CS 41344 - Natural Language Processing</h1>
<h2 style = 'color:yellow'>Assignment - 5: Naive Bayes</h2>

<h3 style = 'color:lightgreen'>Assignment - 5.0 Import Libraries</h3>


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
def calculate_conditional_probabilities(df, a_values, b_value):
    """
    Calculate the conditional probabilities P(B=b_value|A=a_value)

    Parameters:
    - df: DataFrame containing the data
    - a_values: List of unique values in column A for which to calculate the conditional probabilities
    - b_value: The specific value in column B for which to calculate the conditional probability

    Returns:
    A dictionary with keys as A values and values as the corresponding conditional probabilities.
    """
    probabilities = {}
    for a_value in a_values:
        # Filtering df for the specific A value
        df_filtered_a = df[df['A'] == a_value]
        # Counting occurrences where B equals b_value given A equals a_value
        count_b_given_a = len(df_filtered_a[df_filtered_a['B'] == b_value])
        # Calculating the probability
        probability = count_b_given_a / len(df_filtered_a) if len(df_filtered_a) > 0 else 0
        probabilities[a_value] = probability
    return probabilities

In [11]:
def calculate_conditional_probabilities_laplace(df, a_values, b_value, k=1):
    """
    Calculate the conditional probabilities with Laplace smoothing P(B=b_value|A=a_value) for each a_value in a_values.

    Parameters:
    - df: DataFrame containing the data
    - a_values: List of unique values in column A for which to calculate the conditional probabilities
    - b_value: The specific value in column B for which to calculate the conditional probability
    - k: The smoothing parameter (default 1 for Laplace smoothing)

    Returns:
    A dictionary with keys as A values and values as the corresponding conditional probabilities with Laplace smoothing.
    """
    probabilities_laplace = {}
    for a_value in a_values:
        # Filtering df for the specific A value
        df_filtered_a = df[df['A'] == a_value]
        # Counting occurrences where B equals b_value given A equals a_value with Laplace smoothing
        count_b_given_a = len(df_filtered_a[df_filtered_a['B'] == b_value]) + k
        # Adjusting denominator for Laplace smoothing
        total_possible_b_values = len(df['B'].unique())  # Number of unique B values
        smoothed_denominator = len(df_filtered_a) + k * total_possible_b_values
        # Calculating the probability with Laplace smoothing
        probability_laplace = count_b_given_a / smoothed_denominator
        probabilities_laplace[a_value] = probability_laplace
    return probabilities_laplace


<h4 style = 'color:green'>Assignment - 5.1 Compute Conditional Probabilities</h4>

In [3]:
df = pd.DataFrame({
    '#': np.arange(1, 10),
    'A': ['A1', 'A2', 'A2', 'A1', 'A3', 'A3', 'A1', 'A3', 'A1'],
    'B': ['B1', 'B2', 'B2', 'B1', 'B4', 'B3', 'B4', 'B4', 'B3']
})
df

Unnamed: 0,#,A,B
0,1,A1,B1
1,2,A2,B2
2,3,A2,B2
3,4,A1,B1
4,5,A3,B4
5,6,A3,B3
6,7,A1,B4
7,8,A3,B4
8,9,A1,B3


In [5]:
# P(B=B1|A=A1); P(B=B1|A=A2); P(B=B1|A=A3)

a_values = ['A1', 'A2', 'A3']
b_value = 'B1'
probabilities = calculate_conditional_probabilities(df, a_values, b_value)
print(probabilities)

{'A1': 0.5, 'A2': 0.0, 'A3': 0.0}


In [6]:
# P(B=B2|A=A1); P(B=B2|A=A2); P(B=B2|A=A3)

a_values = ['A1', 'A2', 'A3']
b_value = 'B2'
probabilities = calculate_conditional_probabilities(df, a_values, b_value)
print(probabilities)

{'A1': 0.0, 'A2': 1.0, 'A3': 0.0}


In [7]:
# P(B=B3|A=A1); P(B=B3|A=A2); P(B=B3|A=A3)

a_values = ['A1', 'A2', 'A3']
b_value = 'B3'
probabilities = calculate_conditional_probabilities(df, a_values, b_value)
print(probabilities)

{'A1': 0.25, 'A2': 0.0, 'A3': 0.3333333333333333}


In [8]:
# P(B=B4|A=A1); P(B=B4|A=A2); P(B=B4|A=A3)

a_values = ['A1', 'A2', 'A3']
b_value = 'B4'
probabilities = calculate_conditional_probabilities(df, a_values, b_value)
print(probabilities)

{'A1': 0.25, 'A2': 0.0, 'A3': 0.6666666666666666}


In [9]:
# P(B=B5|A=A1); P(B=B5|A=A2); P(B=B5|A=A3)
a_values = ['A1', 'A2', 'A3']
b_value = 'B5'
probabilities = calculate_conditional_probabilities(df, a_values, b_value)
print(probabilities)

{'A1': 0.0, 'A2': 0.0, 'A3': 0.0}


<h4 style = 'color:green'>Assignment - 5.2 Compute Laplace Smoothing Probabilities</h4>

In [12]:
# P(B=B1|A=A1); P(B=B1|A=A2); P(B=B1|A=A3)

a_values = ['A1', 'A2', 'A3']
b_value = 'B1'
probabilities_laplace = calculate_conditional_probabilities_laplace(df, a_values, b_value)
print(probabilities_laplace)

{'A1': 0.375, 'A2': 0.16666666666666666, 'A3': 0.14285714285714285}


In [13]:
# P(B=B2|A=A1); P(B=B2|A=A2); P(B=B2|A=A3)

a_values = ['A1', 'A2', 'A3']
b_value = 'B2'
probabilities_laplace = calculate_conditional_probabilities_laplace(df, a_values, b_value)
print(probabilities_laplace)

{'A1': 0.125, 'A2': 0.5, 'A3': 0.14285714285714285}


In [14]:
# P(B=B3|A=A1); P(B=B3|A=A2); P(B=B3|A=A3)

a_values = ['A1', 'A2', 'A3']
b_value = 'B3'
probabilities_laplace = calculate_conditional_probabilities_laplace(df, a_values, b_value)
print(probabilities_laplace)

{'A1': 0.25, 'A2': 0.16666666666666666, 'A3': 0.2857142857142857}


In [15]:
# P(B=B4|A=A1); P(B=B4|A=A2); P(B=B4|A=A3)

a_values = ['A1', 'A2', 'A3']
b_value = 'B4'
probabilities_laplace = calculate_conditional_probabilities_laplace(df, a_values, b_value)
print(probabilities_laplace)

{'A1': 0.25, 'A2': 0.16666666666666666, 'A3': 0.42857142857142855}


In [16]:
# P(B=B5|A=A1); P(B=B5|A=A2); P(B=B5|A=A3)

a_values = ['A1', 'A2', 'A3']
b_value = 'B5'
probabilities_laplace = calculate_conditional_probabilities_laplace(df, a_values, b_value)
print(probabilities_laplace)

{'A1': 0.125, 'A2': 0.16666666666666666, 'A3': 0.14285714285714285}
