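# LAB 5 : Write a program to implement the k-Nearest Neighbor algorithm to classify the iris data set.

**Note:** this notebook applies k-NN to a health-insurance data set (`insurance.csv`) rather than iris; the target is the `State Medicaid Expansion (2016)` column.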

In [1]:
# import Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Location of the CSV on GitHub (the "raw" link, not the HTML page)
url = 'https://github.com/sonarhrushikesh/ML_Dataset/raw/main/insurance.csv'

# Download the file and parse it into a pandas DataFrame
dataset = pd.read_csv(filepath_or_buffer=url)

## EDA

In [3]:
# Dimensions of the dataset as a (rows, columns) tuple
dataset.shape

(51, 7)

In [4]:
# Preview the first rows of the dataset to check that it loaded as expected
dataset.head()

Unnamed: 0,Health Insurance Coverage Change (2010-2015),Employer Health Insurance Coverage (2015),Marketplace Health Insurance Coverage (2016),Marketplace Tax Credits (2016),Medicaid Enrollment (2016),Medicare Enrollment (2016),State Medicaid Expansion (2016)
0,215000,2545000,165534,152206,910775,989855,False
1,36000,390000,17995,16205,166625,88966,True
2,410000,3288000,179445,124346,1716198,1175624,True
3,234000,1365000,63357,56843,920194,606146,True
4,3826000,19552000,1415428,1239893,11843081,5829777,True


In [5]:
# Column names, non-null counts and dtypes, plus the memory footprint
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 7 columns):
 #   Column                                        Non-Null Count  Dtype
---  ------                                        --------------  -----
 0   Health Insurance Coverage Change (2010-2015)  51 non-null     int64
 1   Employer Health Insurance Coverage (2015)     51 non-null     int64
 2   Marketplace Health Insurance Coverage (2016)  51 non-null     int64
 3   Marketplace Tax Credits (2016)                51 non-null     int64
 4   Medicaid Enrollment (2016)                    51 non-null     int64
 5   Medicare Enrollment (2016)                    51 non-null     int64
 6   State Medicaid Expansion (2016)               51 non-null     bool 
dtypes: bool(1), int64(6)
memory usage: 2.6 KB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
dataset.describe()

Unnamed: 0,Health Insurance Coverage Change (2010-2015),Employer Health Insurance Coverage (2015),Marketplace Health Insurance Coverage (2016),Marketplace Tax Credits (2016),Medicaid Enrollment (2016),Medicare Enrollment (2016)
count,51.0,51.0,51.0,51.0,51.0,51.0
mean,384098.0,3378275.0,217281.0,184110.0,1441822.0,1095961.0
std,606046.7,3657440.0,313579.5,281228.8,1935859.0,1147094.0
min,15000.0,335000.0,13313.0,1224.0,63583.0,88966.0
25%,87500.0,875500.0,50436.0,43343.0,304043.5,318978.0
50%,215000.0,2295000.0,130178.0,95507.0,967284.0,817272.0
75%,407000.0,4109000.0,224111.0,188443.5,1700263.0,1259598.0
max,3826000.0,19552000.0,1531714.0,1428712.0,11843080.0,5829777.0


In [7]:
# Number of rows per value of the target column, i.e. the class balance
dataset.groupby('State Medicaid Expansion (2016)').size()

State Medicaid Expansion (2016)
False    19
True     32
dtype: int64

## Data Preprocessing

In [8]:
# Split the frame by integer position: every column except the last one is a
# feature, and the last column (`State Medicaid Expansion (2016)`) is the target.
last_col = dataset.shape[1] - 1
X = dataset.iloc[:, :last_col]
Y = dataset.iloc[:, last_col]

## Splitting the dataset into the Training set and Test set

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=0,
)

## Feature Scaling

In [10]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training data only, so that no
# information from the test set leaks into the preprocessing step.
sc = StandardScaler().fit(X_train)

# Apply the same (training-derived) scaling to both partitions.
# NOTE: this overwrites X_train / X_test, so re-running this cell on its own
# would scale already-scaled data; re-run the split cell first.
X_train, X_test = (sc.transform(part) for part in (X_train, X_test))

## Training the K-NN model on the Training set

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# 5 nearest neighbours; the Minkowski metric with p=1 is the Manhattan (L1) distance.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=1,
)
classifier.fit(X_train, Y_train)

## Getting nearest neighbours for each point in training data

In [12]:
# Indices (rows of X_train) of the 5 nearest training points for every training
# sample. The query set is the same data the classifier was fitted on, so in the
# output each sample appears as its own first neighbour.
classifier.kneighbors(
    X=X_train,
    n_neighbors=5,
    return_distance=False,
)

array([[ 0, 10,  4, 33, 26],
       [ 1, 14, 30,  7,  4],
       [ 2, 18, 32, 10, 17],
       [ 3,  6,  5, 22,  8],
       [ 4,  0, 30, 26, 14],
       [ 5, 22,  8, 23,  3],
       [ 6,  3, 22, 31,  5],
       [ 7,  4, 33, 26,  0],
       [ 8, 22,  5, 20, 23],
       [ 9, 28, 13, 21,  7],
       [10, 24,  0, 15,  2],
       [11, 19, 12, 29, 27],
       [12, 19, 11, 29, 27],
       [13, 21,  7, 33,  1],
       [14, 30,  4, 32, 18],
       [15, 16, 17, 10, 24],
       [16, 15, 17, 24, 10],
       [17, 16, 15, 25, 18],
       [18, 32,  2, 30, 14],
       [19, 12, 11, 29, 27],
       [20, 27, 29,  8,  3],
       [21, 13,  7,  1, 33],
       [22,  5, 31,  8, 23],
       [23, 25,  5, 22,  8],
       [24, 10, 16, 15, 32],
       [25, 23, 22, 31, 32],
       [26,  4,  0, 30, 33],
       [27, 20, 29, 12, 19],
       [28,  9, 13, 21,  7],
       [29, 12, 19, 27, 11],
       [30, 14, 18, 32,  4],
       [31, 22,  6,  5, 25],
       [32, 18, 30,  2, 25],
       [33,  0, 26,  4, 10],
       [34,  8

## Predicting the Test set results

In [16]:
# Predict a label for every sample in the test set
y_pred = classifier.predict(X_test)

# Show the predictions next to the true labels, one test sample per row:
# column 0 = predicted label, column 1 = actual label.
# The true labels live in `Y_test` (capital Y, as returned by train_test_split);
# the lowercase `y_test` used previously is never defined in this notebook.
print(np.column_stack((y_pred, Y_test.values)))

[[False  True]
 [ True  True]
 [ True False]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True  True]
 [ True False]
 [ True False]
 [False False]
 [ True  True]
 [ True  True]
 [ True  True]]


## Evaluating the Algorithm
## Making the Confusion Matrix & Computing the Accuracy Score

In [26]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of true labels (Y_test) against predicted labels (y_pred):
# rows are the actual classes, columns are the predicted classes.
cm = confusion_matrix(Y_test, y_pred)
print(cm)

# Share of correctly classified test samples, expressed as a percentage
accuracy = 100 * accuracy_score(Y_test, y_pred)

# Report the accuracy rounded to two decimal places
print(f'Accuracy of our model is equal {round(accuracy, 2)} %.')

[[ 1  3]
 [ 1 11]]
Accuracy of our model is equal 75.0 %.


## Making Classification Report

In [25]:
from sklearn.metrics import classification_report

# Text report built from the true labels (Y_test) and the predictions (y_pred):
# one row per class with precision, recall, F1-score and support, followed by
# the overall accuracy and the macro / weighted averages.
report = classification_report(Y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.50      0.25      0.33         4
        True       0.79      0.92      0.85        12

    accuracy                           0.75        16
   macro avg       0.64      0.58      0.59        16
weighted avg       0.71      0.75      0.72        16



## Comparing Error Rate with the K Value
### Parameter Tuning Using 10-Fold Cross-Validation

In [19]:
from sklearn.model_selection import cross_val_score

# Candidate values of k (number of neighbours): 1 .. 16
k_list = list(range(1, 17))

# Mean cross-validated accuracy for each candidate, in the same order as k_list
cv_scores = []

# 10-fold cross-validation on the training data for every candidate k.
# The distance settings (Minkowski, p=1) match the classifier trained earlier
# in this notebook, so the k selected here is tuned for the same model that is
# evaluated on the test set rather than for the default Euclidean (p=2) variant.
for k in k_list:
    knn = KNeighborsClassifier(n_neighbors=k, p=1, metric='minkowski')
    scores = cross_val_score(knn, X_train, Y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())


### Finding best k

In [24]:
# Misclassification error for each k, i.e. 1 - mean cross-validated accuracy.
# `MSE` was referenced here without ever being defined in the notebook, which
# raises a NameError on a fresh kernel; derive it from `cv_scores` instead.
# (The name is kept for continuity; it is an error rate, not a mean squared error.)
MSE = [1 - score for score in cv_scores]

# The best k is the candidate with the lowest error (the smallest k wins ties,
# because list.index returns the first match)
best_k = k_list[MSE.index(min(MSE))]

# Print the optimal number of neighbors
print("The optimal number of neighbors is %d." % best_k)

The optimal number of neighbors is 1.
