# Lab 8 

## A1: Calculate Prior Probability for Each Class

In [5]:
import pandas as pd

# Creating the dataset based on the provided table
data = {
    'age': ['<=30', '<=30', '31…40', '>40', '>40', '>40', '31…40', '<=30', '<=30', '>40', '<=30', '31…40', '31…40', '>40'],
    'income': ['high', 'high', 'high', 'medium', 'low', 'low', 'low', 'medium', 'low', 'medium', 'medium', 'medium', 'high', 'medium'],
    'student': ['no', 'no', 'no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no'],
    'credit_rating': ['fair', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'excellent'],
    'buys_computer': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
}

# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(data)

# Calculating prior probabilities for each class
total_data_points = len(df)
prior_probabilities = df['buys_computer'].value_counts() / total_data_points

# Displaying the prior probabilities
print("Prior Probabilities for Each Class:")
print(prior_probabilities)

Prior Probabilities for Each Class:
buys_computer
yes    0.642857
no     0.357143
Name: count, dtype: float64


# A2: Class Conditional Densities

In [6]:
# Calculating class conditional probabilities for each feature
class_conditional = df.groupby('buys_computer').apply(lambda x: x.apply(lambda y: y.value_counts(normalize=True)))

# Displaying the class conditional probabilities
print("Class Conditional Densities:")
print(class_conditional)

Class Conditional Densities:
                              age    income   student  credit_rating  \
buys_computer                                                          
no            <=30       0.600000       NaN       NaN            NaN   
              >40        0.400000       NaN       NaN            NaN   
              excellent       NaN       NaN       NaN       0.600000   
              fair            NaN       NaN       NaN       0.400000   
              high            NaN  0.400000       NaN            NaN   
              low             NaN  0.200000       NaN            NaN   
              medium          NaN  0.400000       NaN            NaN   
              no              NaN       NaN  0.800000            NaN   
              yes             NaN       NaN  0.200000            NaN   
yes           31…40      0.444444       NaN       NaN            NaN   
              <=30       0.222222       NaN       NaN            NaN   
              >40        0.333333  

# A3: Test for Independence Between Features

In [7]:
# Converting categorical data to numerical values for correlation calculation
df_encoded = df.apply(lambda x: pd.factorize(x)[0])

# Calculating the correlation matrix to check independence
correlation_matrix = df_encoded.corr()

# Displaying the correlation matrix
print("Correlation Matrix (Independence Test):")
print(correlation_matrix)

Correlation Matrix (Independence Test):
                        age    income       student  credit_rating  \
age            1.000000e+00  0.335410  1.690309e-01  -5.688200e-17   
income         3.354102e-01  1.000000  5.669467e-01   1.909407e-01   
student        1.690309e-01  0.566947  1.000000e+00   1.602469e-17   
credit_rating -5.688200e-17  0.190941  1.602469e-17   1.000000e+00   
buys_computer  1.763834e-01  0.197203  4.472136e-01  -2.581989e-01   

               buys_computer  
age                 0.176383  
income              0.197203  
student             0.447214  
credit_rating      -0.258199  
buys_computer       1.000000  


## A4: Build Naive-Bayes Classifier for Provided Data

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Preparing the features and target
X = df.drop('buys_computer', axis=1)
y = df['buys_computer']

# Encoding categorical variables
X_encoded = X.apply(lambda x: pd.factorize(x)[0])
y_encoded = pd.factorize(y)[0]

# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.3, random_state=42)

# Building and training the Naive Bayes classifier
model = GaussianNB()
model.fit(X_train, y_train)

# Predicting on the test set
y_pred = model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Naive Bayes Classifier Accuracy: {accuracy * 100:.2f}%")

Naive Bayes Classifier Accuracy: 60.00%


## A5: Build Naive-Bayes Classifier for Obesity Dataset

In [9]:
# Load your obesity dataset (replace the file path with the correct one)
obesity_data = pd.read_csv('Obesity.csv')

# Inspect the dataset
print(obesity_data.head())

# Assuming 'NObeyesdad' is the target variable (classification target)
X_obesity = obesity_data.drop('NObeyesdad', axis=1)
y_obesity = obesity_data['NObeyesdad']

# Encode categorical columns using LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for col in X_obesity.select_dtypes(include=['object']).columns:
    X_obesity[col] = le.fit_transform(X_obesity[col])

# Encode the target column (y_obesity)
y_obesity_encoded = le.fit_transform(y_obesity)

# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
X_train_obesity, X_test_obesity, y_train_obesity, y_test_obesity = train_test_split(X_obesity, y_obesity_encoded, test_size=0.3, random_state=42)

# Train a Naive Bayes classifier on the obesity dataset
model_obesity = GaussianNB()
model_obesity.fit(X_train_obesity, y_train_obesity)

# Predict on the test set and evaluate the model
y_pred_obesity = model_obesity.predict(X_test_obesity)
accuracy_obesity = accuracy_score(y_test_obesity, y_pred_obesity)

print(f"Naive Bayes Classifier Accuracy for Obesity Dataset: {accuracy_obesity * 100:.2f}%")

    Age  Gender  Height  Weight        CALC FAVC  FCVC  NCP  SCC SMOKE  CH2O  \
0  21.0  Female    1.62    64.0          no   no   2.0  3.0   no    no   2.0   
1  21.0  Female    1.52    56.0   Sometimes   no   3.0  3.0  yes   yes   3.0   
2  23.0    Male    1.80    77.0  Frequently   no   2.0  3.0   no    no   2.0   
3  27.0    Male    1.80    87.0  Frequently   no   3.0  3.0   no    no   2.0   
4  22.0    Male    1.78    89.8   Sometimes   no   2.0  1.0   no    no   2.0   

  family_history_with_overweight  FAF  TUE       CAEC                 MTRANS  \
0                            yes  0.0  1.0  Sometimes  Public_Transportation   
1                            yes  3.0  0.0  Sometimes  Public_Transportation   
2                            yes  2.0  1.0  Sometimes  Public_Transportation   
3                             no  2.0  0.0  Sometimes                Walking   
4                             no  0.0  0.0  Sometimes  Public_Transportation   

            NObeyesdad  
0        Norm