In [1]:
import pandas as pd
import numpy as np

#Load the table data
table_data = {
  'age': ['<=30', '<=30', '31-40', '>40', '>40', '>40', '31-40', '<=30', '<=30', '>40', '<=30', '31-40', '31-40', '>40'],
  'income': ['high', 'high', 'high', 'medium', 'low', 'low', 'low', 'medium', 'low', 'medium', 'medium', 'medium', 'high', 'medium'],
  'student': ['no', 'no', 'no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no'],
  'credit_rating': ['fair', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'excellent'],
  'buys_computer': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
}

df = pd.DataFrame(table_data)

# Count the number of instances in each class
class_counts = df['buys_computer'].value_counts()

# Calculate the prior probability for each class
prior_probabilities = class_counts / df.shape[0]

print(prior_probabilities)

yes    0.642857
no     0.357143
Name: buys_computer, dtype: float64


In [2]:
import pandas as pd
import numpy as np
from scipy.stats import norm

def map_age_to_numeric(age_str):
    if '-' in age_str:
        return int(age_str.split('-')[0])
    elif '>' in age_str:
        return int(age_str.replace('>', '')) + 1  # Add 1 to represent an age greater than the upper limit
    elif '<=' in age_str:
        return int(age_str.replace('<=', ''))
    else:
        return int(age_str)

#Load the table data
table_data = {
    'age': ['<=30', '<=30', '31-40', '>40', '>40', '>40', '31-40', '<=30', '<=30', '>40', '<=30', '31-40', '31-40', '>40'],
    'income': ['high', 'high', 'high', 'medium', 'low', 'low', 'low', 'medium', 'low', 'medium', 'medium', 'medium', 'high', 'medium'],
    'student': ['no', 'no', 'no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no'],
    'credit_rating': ['fair', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'excellent'],
    'buys_computer': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
}

df = pd.DataFrame(table_data)

# Calculate class conditional densities for age (assuming a normal distribution)
for cls in df['buys_computer'].unique():
    for age_range in df['age'].unique():
        subset = df[df['buys_computer'] == cls]
        age_values = subset['age'].apply(map_age_to_numeric)
        mean_age = age_values.mean()
        std_dev_age = age_values.std()
        numeric_age = map_age_to_numeric(age_range)
        density = norm.pdf(numeric_age, mean_age, std_dev_age)
        print(f'P(age={age_range} | buys_computer={cls}) = {density}')

# Calculate class conditional densities for categorical features
categorical_features = ['income', 'student', 'credit_rating']
for cls in df['buys_computer'].unique():
    for feature in categorical_features:
        crosstab = pd.crosstab(df[feature], df['buys_computer'], margins=True, normalize=True)
        conditional_density = crosstab[cls] / crosstab['All']
        print(f'P({feature} | buys_computer={cls}) = {conditional_density}')

P(age=<=30 | buys_computer=no) = 0.050715988122805825
P(age=31-40 | buys_computer=no) = 0.0564682890594362
P(age=>40 | buys_computer=no) = 0.036339593436678766
P(age=<=30 | buys_computer=yes) = 0.056197530114517336
P(age=31-40 | buys_computer=yes) = 0.0642838998679617
P(age=>40 | buys_computer=yes) = 0.031820267130607836
P(income | buys_computer=no) = income
high      0.500000
low       0.250000
medium    0.333333
All       0.357143
dtype: float64
P(student | buys_computer=no) = student
no     0.571429
yes    0.142857
All    0.357143
dtype: float64
P(credit_rating | buys_computer=no) = credit_rating
excellent    0.500000
fair         0.250000
All          0.357143
dtype: float64
P(income | buys_computer=yes) = income
high      0.500000
low       0.750000
medium    0.666667
All       0.642857
dtype: float64
P(student | buys_computer=yes) = student
no     0.428571
yes    0.857143
All    0.642857
dtype: float64
P(credit_rating | buys_computer=yes) = credit_rating
excellent    0.500000
fai

In [3]:
import pandas as pd
from scipy.stats import chi2_contingency

#Load the table data
table_data = {
    'age': ['<=30', '<=30', '31-40', '>40', '>40', '>40', '31-40', '<=30', '<=30', '>40', '<=30', '31-40', '31-40', '>40'],
    'income': ['high', 'high', 'high', 'medium', 'low', 'low', 'low', 'medium', 'low', 'medium', 'medium', 'medium', 'high', 'medium'],
    'student': ['no', 'no', 'no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no'],
    'credit_rating': ['fair', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'excellent'],
    'buys_computer': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
}

df = pd.DataFrame(table_data)

# Create a contingency table
contingency_table = pd.crosstab(index=df['age'], columns=[df['income'], df['student'], df['credit_rating']], margins=True)

# Perform the chi-squared test for independence
chi2, p, _, _ = chi2_contingency(contingency_table)

# Print the results
print(f"Chi-squared value: {chi2}")
print(f"P-value: {p}")

# Check if the null hypothesis (independence) is rejected
alpha = 0.05
if p < alpha:
    print("The features are not independent (reject the null hypothesis)")
else:
    print("The features are independent (fail to reject the null hypothesis)")

Chi-squared value: 12.95
P-value: 0.9895621498194425
The features are independent (fail to reject the null hypothesis)


In [4]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#Load the table data
table_data = {
    'age': ['<=30', '<=30', '31-40', '>40', '>40', '>40', '31-40', '<=30', '<=30', '>40', '<=30', '31-40', '31-40', '>40'],
    'income': ['high', 'high', 'high', 'medium', 'low', 'low', 'low', 'medium', 'low', 'medium', 'medium', 'medium', 'high', 'medium'],
    'student': ['no', 'no', 'no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no'],
    'credit_rating': ['fair', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'excellent'],
    'buys_computer': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
}

df = pd.DataFrame(table_data)

# Assuming 'buys_computer' is the target variable
X = df.drop('buys_computer', axis=1)
y = df['buys_computer']

# Convert categorical variables to numerical using one-hot encoding
X_encoded = pd.get_dummies(X)

# Split the data into training and testing sets
Tr_X, Te_X, Tr_y, Te_y = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Build and train the Naïve-Bayes classifier
model = GaussianNB()
model.fit(Tr_X, Tr_y)

# Make predictions on the test set
predictions = model.predict(Te_X)

# Evaluate the accuracy of the model
accuracy = accuracy_score(Te_y, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6666666666666666


In [7]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_excel(r"C:\Users\DELL\Downloads\Untitled Folder\embeddingsdatasheet-1.xlsx")

# 'embed_0' and 'embed_1' are the features and 'Label' is the target variable
X = data[['embed_0', 'embed_1']]
y = data['Label']

# Split the data into training and testing sets
Tr_X, Te_X, Tr_y, Te_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and train the Naïve-Bayes classifier
model = GaussianNB()
model.fit(Tr_X, Tr_y)

# Make predictions on the test set
predictions = model.predict(Te_X)

# Evaluate the accuracy of the model
accuracy = accuracy_score(Te_y, predictions)
print(f"Accuracy: {accuracy}")    

Accuracy: 0.42777777777777776
