In [3]:
#q1
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# --- Load and Prepare Data ---
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# --- (i) Step-by-step Implementation ---
class CustomGaussianNB:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.params = {}
        for c in self.classes:
            X_c = X[y == c]
            # Calculate Prior Probability P(c)
            prior = len(X_c) / len(X)
            # Calculate Mean and Variance for each feature in class c
            mean = np.mean(X_c, axis=0)
            variance = np.var(X_c, axis=0)
            self.params[c] = {'prior': prior, 'mean': mean, 'var': variance}

    def _gaussian_density(self, X, mean, var):
        # Calculate P(x_i | c) using Gaussian (Normal) Distribution PDF
        numerator = np.exp(-((X - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

    def predict(self, X):
        posteriors = []
        for c in self.classes:
            p = self.params[c]
            # Initialize log likelihood with log(P(c)) for each sample
            log_likelihood = np.log(p['prior'])
            
            # Sum the log of the conditional probabilities P(x_i | c) for each sample
            for i in range(X.shape[1]):
                log_likelihood += np.log(self._gaussian_density(X[:, i], p['mean'][i], p['var'][i]))

            posteriors.append(log_likelihood)
        
        # Convert posteriors to array and predict the class with the highest log-posterior probability
        posteriors = np.array(posteriors)
        return np.argmax(posteriors, axis=0)

# Train and Evaluate Custom Model
custom_gnb = CustomGaussianNB()
custom_gnb.fit(X_train, y_train)
y_pred_custom = custom_gnb.predict(X_test)
accuracy_custom = accuracy_score(y_test, y_pred_custom)

print("--- 1. Gaussian Naïve Bayes Results ---")
print(f"(i) Step-by-Step Implementation Accuracy: {accuracy_custom:.4f}")

# --- (ii) In-built function ---
gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)
y_pred_inbuilt = gnb_model.predict(X_test)
accuracy_inbuilt = accuracy_score(y_test, y_pred_inbuilt)

print(f"(ii) In-built Function Accuracy: {accuracy_inbuilt:.4f}")

--- 1. Gaussian Naïve Bayes Results ---
(i) Step-by-Step Implementation Accuracy: 0.9778
(ii) In-built Function Accuracy: 0.9778


In [2]:
#q2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_wine # Using Wine dataset for diversity

# --- Load and Prepare Data ---
wine = load_wine()
X_knn, y_knn = wine.data, wine.target
X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(
    X_knn, y_knn, test_size=0.3, random_state=42
)

# --- Define the Model and Parameter Grid ---
knn = KNeighborsClassifier()

# Define the range of K values to search (e.g., K from 1 to 20)
param_grid = {'n_neighbors': np.arange(1, 21)}

# --- Implement GridSearchCV ---
# GridSearchCV performs an exhaustive search over the specified parameter values 
# for an estimator, using cross-validation (cv=5) to score the best set.
grid_search = GridSearchCV(
    estimator=knn, 
    param_grid=param_grid, 
    cv=5, 
    scoring='accuracy'
)

# Fit the grid search to the training data
grid_search.fit(X_train_knn, y_train_knn)

# --- Results ---
best_k = grid_search.best_params_['n_neighbors']
best_score = grid_search.best_score_
test_accuracy = grid_search.score(X_test_knn, y_test_knn)

print("\n--- 2. GridSearchCV for K-NN Hyperparameter Tuning ---")
print(f"Dataset Used: Wine Dataset")
print(f"The Best value of K found by GridSearchCV: **K = {best_k}**")
print(f"Cross-Validation Accuracy with Best K: {best_score:.4f}")
print(f"Test Set Accuracy with Best K: {test_accuracy:.4f}")


--- 2. GridSearchCV for K-NN Hyperparameter Tuning ---
Dataset Used: Wine Dataset
The Best value of K found by GridSearchCV: **K = 1**
Cross-Validation Accuracy with Best K: 0.7023
Test Set Accuracy with Best K: 0.7963
