<a href="https://colab.research.google.com/github/sp2743/datamining-samplequestion/blob/main/NavieBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Task 01: Naive Bayes on Categorical Data


    1.1 Generate a random categorical dataset with the following config:
        Number of attributes: 4
        Number of unique values for each attribute: [3, 3, 2, 2]
        Number of classes: 2
        
        [Refer to the dataset example discussed in the lecture for more information]
        
    1.2 Implement the Categorical Naive Bayes algorithm
        Calculate all the required probabilities and store them for prediction.
    
    1.3 Test your model on a new test set generated randomly with the same config as 1.1
    
    1.4 Report model performance in terms of Precision, Recall, F-score.

In [2]:
import numpy as np
import pandas as pd

In [None]:
# Task 01 dataset configuration.
num_samples = 20
num_attributes = 4
unique_values = [3, 3, 2, 2]  # number of distinct values each attribute can take
num_classes = 2

np.random.seed(42)  # For reproducibility

# Draw one integer-coded column per attribute (in attribute order), then the
# class labels, so the global random stream is consumed in a fixed sequence.
feature_columns = []
for attr_idx in range(num_attributes):
    feature_columns.append(np.random.randint(0, unique_values[attr_idx], num_samples))
data = np.column_stack(feature_columns)
target = np.random.randint(0, num_classes, num_samples)

# Assemble the labelled frame: four attribute columns followed by 'Class'.
feature_names = [f'Attribute_{i+1}' for i in range(num_attributes)]
df = pd.DataFrame(data, columns=feature_names).assign(Class=target)

In [None]:
# Preview the first rows of `df` (attribute columns plus the 'Class' label).
df.head()

Unnamed: 0,Attribute_1,Attribute_2,Attribute_3,Attribute_4,Class
0,2,0,0,1,0
1,0,0,0,0,0
2,2,1,0,1,1
3,2,1,0,0,1
4,0,0,0,1,1


In [None]:
# Class priors P(class) for Task 01.
# The labels and sample count are read from `df` rather than from the
# module-level `target` / `num_samples`: both names are rebound by the Task 02
# data-generation cell, so re-running this cell after Task 02 would otherwise
# silently produce Task 02's priors.
class_counts = np.bincount(df['Class'], minlength=num_classes)
class_probs = class_counts / len(df)
print(class_counts)
print(class_probs)

[ 6 14]
[0.3 0.7]


In [None]:
# Laplace-smoothed likelihood tables:
# conditional_probs[attribute_index][class_label][value] estimates
# P(attribute = value | class) as (count + 1) / (class size + number of values).
conditional_probs = {attribute_index: {} for attribute_index in range(num_attributes)}
for attribute_index, per_class_table in conditional_probs.items():
    n_values = unique_values[attribute_index]
    for class_label in range(num_classes):
        # Values this attribute takes among the rows belonging to class_label.
        in_class = df.loc[df['Class'] == class_label, f'Attribute_{attribute_index+1}']
        occurrences = np.bincount(in_class, minlength=n_values)
        per_class_table[class_label] = (occurrences + 1) / (len(in_class) + n_values)

conditional_probs

{0: {0: array([0.44444444, 0.22222222, 0.33333333]),
  1: array([0.17647059, 0.35294118, 0.47058824])},
 1: {0: array([0.33333333, 0.44444444, 0.22222222]),
  1: array([0.29411765, 0.23529412, 0.47058824])},
 2: {0: array([0.5, 0.5]), 1: array([0.5, 0.5])},
 3: {0: array([0.25, 0.75]), 1: array([0.3125, 0.6875])}}

In [None]:
def predict(sample):
    """Return the most probable class for one categorical sample.

    Scores are accumulated in log space: the log prior from ``class_probs``
    plus, for each attribute in order, the log of the matching entry in
    ``conditional_probs``.

    Args:
        sample: Sequence of integer-coded attribute values, one per attribute,
            in attribute order.

    Returns:
        Index of the class with the highest log score (``np.argmax`` result).
    """
    log_scores = np.log(class_probs)
    for class_label in range(num_classes):
        for attribute_index, attribute_val in enumerate(sample):
            likelihood = conditional_probs[attribute_index][class_label][attribute_val]
            log_scores[class_label] += np.log(likelihood)
    return np.argmax(log_scores)

In [None]:
# Classify one hand-written sample and report the predicted class.
# NOTE: `predict` is redefined further down for the Gaussian model, so this
# cell must run before that later definition to use the categorical version.
new_sample = [1, 2, 0, 1]  # Example new data point
predicted_class = predict(new_sample)
print(f'Predicted class for {new_sample}: {predicted_class}')

Predicted class for [1, 2, 0, 1]: 1


### Task 02: Naive Bayes on Continuous Data
    
    2.1 Generate a random continuous (numeric) dataset with the following config:
        Number of attributes: 4
        Range of values for each attribute: [1-10, 5-15, 0-5, 1-20]
        Number of classes: 2
        
        [Refer to https://web.iitd.ac.in/~bspanda/BY.pdf  (slide no. 27 for more information)]
        
    2.2 Implement Gaussian Naive Bayes algorithm
        Calculate all the required statistics and store for prediction.
    
    2.3 Test your model on a new test set generated randomly with the same config as 2.1
    
    2.4 Report model performance in terms of Precision, Recall, F-score.
    
https://web.iitd.ac.in/~bspanda/BY.pdf

In [7]:
# Task 02 dataset configuration.
num_samples = 20
num_attributes1 = 4
num_classes1 = 2

np.random.seed(42)

# Inclusive (low, high) bounds per attribute: 1-10, 5-15, 0-5, 1-20.
# randint's upper bound is exclusive, hence the +1 when drawing.
attribute_bounds = [(1, 10), (5, 15), (0, 5), (1, 20)]
drawn_columns = []
for low, high in attribute_bounds:
    drawn_columns.append(np.random.randint(low, high + 1, num_samples))
data = np.column_stack(drawn_columns)
target = np.random.randint(0, num_classes1, num_samples)

# Labelled training frame: four attribute columns followed by 'Class'.
column_names = [f'Attribute_{i+1}' for i in range(num_attributes1)]
df1 = pd.DataFrame(data, columns=column_names).assign(Class=target)

In [8]:
# Preview the first rows of `df1` (attribute columns plus the 'Class' label).
df1.head()

Unnamed: 0,Attribute_1,Attribute_2,Attribute_3,Attribute_4,Class
0,7,9,1,18,1
1,4,5,3,8,0
2,8,14,0,4,0
3,5,10,3,2,1
4,7,13,5,6,1


In [13]:
# Fit the Gaussian Naive Bayes statistics:
# mean_std[c][i] = (mean, std) of attribute i over the training rows of class c.
mean_std = {}
for c in range(num_classes1):
    rows_of_class = df1.loc[df1['Class'] == c]
    per_attribute = {}
    for i in range(num_attributes1):
        values = rows_of_class[f'Attribute_{i+1}']
        mu = values.mean()
        sigma = values.std()
        if not sigma > 0:
            # Zero (or undefined) spread: fall back to a tiny positive std
            # to prevent division by zero in the Gaussian density.
            sigma = 1e-6
        per_attribute[i] = (mu, sigma)
    mean_std[c] = per_attribute

mean_std

{0: {0: (np.float64(5.615384615384615), 2.59930957302324),
  1: (np.float64(9.0), 2.857738033247041),
  2: (np.float64(2.6923076923076925), 1.3774744634423892),
  3: (np.float64(9.692307692307692), 4.553105449115936)},
 1: {0: (np.float64(6.142857142857143), 1.6761634196950517),
  1: (np.float64(12.714285714285714), 2.3603873774083297),
  2: (np.float64(1.5714285714285714), 1.8126539343499317),
  3: (np.float64(10.857142857142858), 6.817344825970774)}}

In [14]:
def gaussian_bayes(x, mean, std):
    """Evaluate the normal probability density N(x | mean, std).

    Args:
        x: Point (scalar or NumPy array) at which to evaluate the density.
        mean: Mean of the distribution.
        std: Standard deviation of the distribution; must be non-zero.

    Returns:
        The density value(s), with the same shape as ``x``.
    """
    squared_deviation = (x - mean) ** 2
    twice_variance = 2 * std ** 2
    normalizer = 1 / (np.sqrt(2 * np.pi) * std)
    return normalizer * np.exp(-squared_deviation / twice_variance)

# Training-set likelihoods: gaussian[c][i] is the density of attribute i's
# training values under class c's fitted Gaussian for that same attribute.
# Each entry pairs column i of the training data with mean_std[c][i]; the
# column is read from df1, the frame mean_std was fitted on.
gaussian = {}
for c in range(num_classes1):
    gaussian[c] = {}
    for i in range(num_attributes1):
        mu, sigma = mean_std[c][i]
        gaussian[c][i] = gaussian_bayes(df1[f'Attribute_{i+1}'].to_numpy(), mu, sigma)

In [15]:
# Print the fitted statistics table (`mean_std`) under a heading.
print("Class-wise Mean and Standard Deviation:")
print(mean_std)

Class-wise Mean and Standard Deviation:
{0: {0: (np.float64(5.615384615384615), 2.59930957302324), 1: (np.float64(9.0), 2.857738033247041), 2: (np.float64(2.6923076923076925), 1.3774744634423892), 3: (np.float64(9.692307692307692), 4.553105449115936)}, 1: {0: (np.float64(6.142857142857143), 1.6761634196950517), 1: (np.float64(12.714285714285714), 2.3603873774083297), 2: (np.float64(1.5714285714285714), 1.8126539343499317), 3: (np.float64(10.857142857142858), 6.817344825970774)}}


In [16]:
def predict(sample):
    """Classify one sample with the Gaussian Naive Bayes model fitted on ``df1``.

    For each class the score is ``log P(class) + sum_i log N(x_i | mean, std)``
    using the per-class statistics in ``mean_std``. The Gaussian log-density is
    evaluated directly in log space instead of taking ``log`` of the density:
    for samples far from a class mean the density underflows to 0, its log
    becomes ``-inf`` for every class, and the arg-max degenerates to the
    first class.

    Note: this definition replaces the categorical ``predict`` defined earlier
    in the notebook.

    Args:
        sample: Sequence of numeric attribute values, one per attribute, in
            attribute order.

    Returns:
        The class label (int) with the highest log-posterior score. Classes
        that never occur in ``df1`` get a score of ``-inf``.
    """
    labels = df1['Class']
    log_posteriors = {}
    for c in range(num_classes1):
        prior = (labels == c).mean()  # fraction of training rows in class c
        if not prior > 0:
            # Unseen class (or empty training frame): it can never win, and
            # its statistics in mean_std are undefined, so skip the likelihood.
            log_posteriors[c] = -np.inf
            continue
        score = np.log(prior)
        for i in range(num_attributes1):
            mean, std = mean_std[c][i]
            # log N(x | mean, std) = -0.5*log(2*pi) - log(std) - (x - mean)^2 / (2*std^2)
            score += (
                -0.5 * np.log(2 * np.pi)
                - np.log(std)
                - ((sample[i] - mean) ** 2) / (2 * std ** 2)
            )
        log_posteriors[c] = score
    return max(log_posteriors, key=log_posteriors.get)

In [18]:
test_samples = 10

# Test features use the same inclusive per-attribute ranges as the training
# data (1-10, 5-15, 0-5, 1-20); randint's upper bound is exclusive, hence +1.
test_bounds = [(1, 10), (5, 15), (0, 5), (1, 20)]
test_columns = [np.random.randint(low, high + 1, test_samples) for low, high in test_bounds]
test_data = np.column_stack(test_columns)

# Test labels are drawn at random, independently of the features.
test_target = np.random.randint(0, num_classes1, test_samples)

# Predict one row at a time.
y_pred = []
for sample in test_data:
    y_pred.append(predict(sample))

In [19]:
def compute_metrics(y_true, y_pred, positive_label=1):
    """Compute precision, recall and F-score for one class treated as positive.

    Every label other than ``positive_label`` is treated as negative, so for
    0/1 labels with the default ``positive_label=1`` this is the usual binary
    precision / recall / F1.

    Args:
        y_true: Sequence of ground-truth labels.
        y_pred: Sequence of predicted labels, aligned element-wise with
            ``y_true``.
        positive_label: Label counted as the positive class (default ``1``).

    Returns:
        Tuple ``(precision, recall, f_score)``. A metric whose denominator is
        zero is reported as ``0.0``.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` have different lengths.
    """
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, got {len(y_true)} and {len(y_pred)}"
        )

    tp = fp = fn = 0
    for actual, predicted in zip(y_true, y_pred):
        actual_positive = actual == positive_label
        predicted_positive = predicted == positive_label
        if actual_positive and predicted_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            fn += 1
        # True negatives are not needed for precision, recall or F-score.

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return precision, recall, f_score

# Score the predictions against the test labels and report each metric
# rounded to two decimals.
precision, recall, f_score = compute_metrics(test_target, y_pred)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F-score: {f_score:.2f}")

Precision: 1.00
Recall: 0.20
F-score: 0.33
