In [7]:
! pip install prettytable




In [35]:
# Importing necessary libraries
import pandas as pd  # For data manipulation and DataFrame creation
import numpy as np  # For numerical operations and array handling
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.neighbors import KNeighborsClassifier  # For using the K-Nearest Neighbors algorithm
from sklearn.model_selection import cross_validate  # For cross-validation
from prettytable import PrettyTable  # For displaying results in a table format

# Read the data from a CSV file and inspect it
data = pd.read_csv('../DataSets/iris.csv')  # Load the Iris dataset from the specified CSV file
data.tail()  # Display the first 5 rows of the data to check its structure and content



Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica
149,5.9,3.0,5.1,1.8,Virginica


In [19]:
# Split the dataset into training and testing sets (80% training, 20% testing)
characteristics = data.iloc[:, :4]  # Extract the first 4 columns as features (characteristics)
target = data.iloc[:, -1]  # Extract the last column as the target variable (labels)

# Perform the train/test split
x_train, x_test, y_train, y_test = train_test_split(characteristics, target, test_size=0.2, random_state=2727)

In [None]:
# Cross-validation for hyperparameter tuning
hyperparameter_score_list = []  # Initialize an empty list to store the scores for different hyperparameter combinations

# Loop through different values of p (distance metric) and n_neighbors (number of neighbors)
for p in range(1, 4):  # Iterate over p values from 1 to 3
    for neighbor in range(2, 7):  # Iterate over n_neighbors values from 2 to 6
        knn = KNeighborsClassifier(p=p, n_neighbors=neighbor)  # Create a KNN model with the current hyperparameters
        # Perform cross-validation with 10 folds and calculate accuracy scores
        scores = cross_validate(knn, x_train, y_train, cv=10, scoring='accuracy')
        mean_score = np.mean(scores['test_score'])  # Compute the mean accuracy score from the cross-validation results
        # Append the current hyperparameters and their mean score to the score list
        hyperparameter_score_list.append([p, neighbor, mean_score])

In [24]:
for row in hyperparameter_score_list:
    print(row)

[1, 2, np.float64(0.9583333333333333)]
[1, 3, np.float64(0.9666666666666666)]
[1, 4, np.float64(0.9666666666666666)]
[1, 5, np.float64(0.975)]
[1, 6, np.float64(0.9666666666666666)]
[2, 2, np.float64(0.9666666666666666)]
[2, 3, np.float64(0.9833333333333332)]
[2, 4, np.float64(0.975)]
[2, 5, np.float64(0.9833333333333332)]
[2, 6, np.float64(0.975)]
[3, 2, np.float64(0.9666666666666666)]
[3, 3, np.float64(0.975)]
[3, 4, np.float64(0.975)]
[3, 5, np.float64(0.9833333333333332)]
[3, 6, np.float64(0.975)]


In [25]:

# choose the hyper-parameters (with highest average accuracy)
myTable = PrettyTable(["p (distance)", "Number of neighbors", "Avg accuracy"])
for row in hyperparameter_score_list:
    myTable.add_row([row[0], row[1], round(row[2],3)])
print(myTable)




+--------------+---------------------+--------------+
| p (distance) | Number of neighbors | Avg accuracy |
+--------------+---------------------+--------------+
|      1       |          2          |    0.958     |
|      1       |          3          |    0.967     |
|      1       |          4          |    0.967     |
|      1       |          5          |    0.975     |
|      1       |          6          |    0.967     |
|      2       |          2          |    0.967     |
|      2       |          3          |    0.983     |
|      2       |          4          |    0.975     |
|      2       |          5          |    0.983     |
|      2       |          6          |    0.975     |
|      3       |          2          |    0.967     |
|      3       |          3          |    0.975     |
|      3       |          4          |    0.975     |
|      3       |          5          |    0.983     |
|      3       |          6          |    0.975     |
+--------------+------------

In [23]:
# Evaluate the performance of the KNN model with the best parameters on the testing set
knn = KNeighborsClassifier(p=2, n_neighbors=3)  # Create an instance of KNeighborsClassifier with specified parameters
# p=2 indicates the use of the Euclidean distance metric, n_neighbors=3 indicates using 3 nearest neighbors

knn_best_model = knn.fit(x_train, y_train)  # Fit the model to the training data (features and target labels)

# Evaluate the model on the testing set and print the accuracy score
print("Best Model Testing Score: ", knn_best_model.score(x_test, y_test))  # Calculate and display the model's accuracy on the test set


Best Model Testing Score:  0.9333333333333333


In [20]:
prediction = knn_best_model.predict(x_test)
prediction[:2]


array(['Versicolor', 'Setosa'], dtype=object)

In [17]:
y_test[:2]

119    Virginica
4         Setosa
Name: variety, dtype: object