# KNN And Naive Bayes

## Importing Necessary Modules

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
import matplotlib.pyplot as plt

dataset_path = "Z:\\Machine-Learning-Lab\\exp 2 KNN and Naive Bayes\\data.csv"
df = pd.read_csv(dataset_path)

df.drop(columns=['emp_id'], inplace=True)
print(df.head())
X = df.drop(columns=['risk_score'])
y = df['risk_score']

   income  expected_income  risk_score   quality
0    3135              550       36200  0.580918
1    3180              600       30150  0.730720
2    1540              450       34550  0.531712
3    5230              700       42150  0.792552
4    3590             1100       53850  0.744634


## Spliting the Dataset

In [2]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Finding train and test accuracy with Euclidean,Manhattan,Minkowski distance

In [3]:
def evaluate_knn_classifier(X_train, X_test, y_train, y_test, n_neighbors, distance_metric):
    knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors, metric=distance_metric)
    knn_classifier.fit(X_train, y_train)
    train_pred = knn_classifier.predict(X_train)
    test_pred = knn_classifier.predict(X_test)
    train_accuracy = accuracy_score(y_train, train_pred)
    test_accuracy = accuracy_score(y_test, test_pred)
    return train_accuracy, test_accuracy

n_neighbors = 3
distance_metrics = ['euclidean', 'manhattan', 'minkowski']

for metric in distance_metrics:
    train_acc, test_acc = evaluate_knn_classifier(X_train, X_test, y_train, y_test, n_neighbors, metric)
    print(f"Distance Metric: {metric}")
    print(f"Train Accuracy: {train_acc}")
    print(f"Test Accuracy: {test_acc}")
    print()


Distance Metric: euclidean
Train Accuracy: 0.33421750663129973
Test Accuracy: 0.0008375209380234506

Distance Metric: manhattan
Train Accuracy: 0.3339382940108893
Test Accuracy: 0.0008375209380234506

Distance Metric: minkowski
Train Accuracy: 0.33421750663129973
Test Accuracy: 0.0008375209380234506



## Optimal K value:

In [4]:
def find_optimal_k(X_train, X_test, y_train, y_test, max_k, distance_metric):
    best_k = 0
    best_test_accuracy = 0
    for k in range(1, max_k + 1):
        _, test_acc = evaluate_knn_classifier(X_train, X_test, y_train, y_test, k, distance_metric)
        if test_acc > best_test_accuracy:
            best_test_accuracy = test_acc
            best_k = k
    return best_k, best_test_accuracy

# Find optimal K value
max_k = 12
optimal_k, optimal_test_accuracy = find_optimal_k(X_train, X_test, y_train, y_test, max_k, 'euclidean')
print(f"Optimal K value: {optimal_k}")
print(f"Test Accuracy with Optimal K: {optimal_test_accuracy}")

Optimal K value: 10
Test Accuracy with Optimal K: 0.0013958682300390843


## Rebuild KNN classifier with The Optimal K: (i.e. find the best clusetering)

In [5]:
optimal_knn_classifier = KNeighborsClassifier(n_neighbors=optimal_k, metric='euclidean')
optimal_knn_classifier.fit(X_train, y_train)



knn_regressor = KNeighborsRegressor(n_neighbors=5)  # You can adjust n_neighbors
knn_regressor.fit(X_train, y_train)
y_pred_regression = knn_regressor.predict(X_test)
mse_regression = mean_squared_error(y_test, y_pred_regression)
print(f"Mean Squared Error (Regression): {mse_regression}")

Mean Squared Error (Regression): 235494991.5410385


## Regression

In [6]:
from sklearn.metrics import mean_squared_error, r2_score
y_reg = df['income']
X_reg = df.drop(columns=['income'])
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg,test_size=0.2,random_state=42)
best_k =1
best_r2 = float('-inf')
best_mse = float('inf')
for k in range(1, 21):
    knn_reg = KNeighborsRegressor(n_neighbors=k)
    knn_reg.fit(X_train_reg, y_train_reg)
    y_pred_reg = knn_reg.predict(X_test_reg)
    r2 = r2_score(y_test_reg, y_pred_reg)
    mse = mean_squared_error(y_test_reg, y_pred_reg)
    if r2 > best_r2 and mse < best_mse:
        best_r2 = r2
        best_mse = mse
        best_k = k
knn_reg_best = KNeighborsRegressor(n_neighbors=best_k)
knn_reg_best.fit(X_train_reg, y_train_reg)
y_pred_reg_best = knn_reg_best.predict(X_test_reg)
r2_best = r2_score(y_test_reg, y_pred_reg_best)
mse_best = mean_squared_error(y_test_reg, y_pred_reg_best)
print(f"OptimaI k for regression: {best_k}")
print(f"Best RA2 Score for regression: {r2_best:.4f}")
print(f"Mean Squared Error for regression: {mse_best:.4f}")

OptimaI k for regression: 20
Best RA2 Score for regression: 0.0760
Mean Squared Error for regression: 2016074.5869


# Naive Bayes

##  Imports

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import confusion_matrix

## Generate Synthetic Data

In [8]:
def generate_data(n_samples, n_features, n_classes):
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(0, n_classes, size=n_samples)
    return X, y

# Compute Frequency Table

In [9]:
def compute_frequency_table(y):
    frequency_table = pd.value_counts(y) / len(y)
    return frequency_table

## Train Naive Bayes Classifiers and Evaluate

In [10]:
def train_and_evaluate(X_train, X_test, y_train, y_test):
    # Gaussian Naive Bayes
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    gnb_predictions = gnb.predict(X_test)
    # Multinomial Naive Bayes
    mnb = MultinomialNB()
    mnb.fit(X_train, y_train)
    mnb_predictions = mnb.predict(X_test)
    # Bernoulli Naive Bayes
    bnb = BernoulliNB()
    bnb.fit(X_train, y_train)
    bnb_predictions = bnb.predict(X_test)
    gnb_confusion_matrix = confusion_matrix(y_test, gnb_predictions)
    mnb_confusion_matrix = confusion_matrix(y_test, mnb_predictions)
    bnb_confusion_matrix = confusion_matrix(y_test, bnb_predictions)
    return gnb_confusion_matrix, mnb_confusion_matrix, bnb_confusion_matrix

## Main

In [11]:
# Generate synthetic data
n_samples = 1000
n_features = 10
n_classes = 2
X, y = generate_data(n_samples, n_features, n_classes)
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=42)
# Train and evaluate classifiers
gnb_confusion_matrix, mnb_confusion_matrix, bnb_confusion_matrix = train_and_evaluate(X_train, X_test, y_train, y_test)
# Compute frequency table
frequency_table = compute_frequency_table(y)
# Print results
print("Frequency Table:")
print(frequency_table)
print("\nGaussian Naive Bayes Confusion Matrix:")
print("\nMultinomiaI Naive Bayes Confusion Matrix:")
print(mnb_confusion_matrix)
print("\nBernoulli Naive Bayes Confusion Matrix:")
print(bnb_confusion_matrix)

Frequency Table:
1    0.535
0    0.465
Name: count, dtype: float64

Gaussian Naive Bayes Confusion Matrix:

MultinomiaI Naive Bayes Confusion Matrix:
[[  0  86]
 [  0 114]]

Bernoulli Naive Bayes Confusion Matrix:
[[  0  86]
 [  0 114]]


  frequency_table = pd.value_counts(y) / len(y)
