In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

In [16]:
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, name='target')

df = pd.concat([X, y], axis=1)
df.head(2)
# lets check the shape
print("DataFrame shape:", df.shape)

df.target.value_counts(normalize=True)


DataFrame shape: (150, 5)


target
0    0.333333
1    0.333333
2    0.333333
Name: proportion, dtype: float64

In [None]:
df.head(2)
# here 0 means setosa, 1 means versicolor and 2 means virginica

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0


In [18]:
# Split the data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['target'])
print("Training set shape:", train_df.shape)
print("Testing set shape:", test_df.shape)

# Verify the distribution of target variable in both sets
print("Training set target distribution:\n", train_df['target'].value_counts(normalize=True))
print("Testing set target distribution:\n", test_df['target'].value_counts(normalize=True))

Training set shape: (120, 5)
Testing set shape: (30, 5)
Training set target distribution:
 target
0    0.333333
2    0.333333
1    0.333333
Name: proportion, dtype: float64
Testing set target distribution:
 target
0    0.333333
2    0.333333
1    0.333333
Name: proportion, dtype: float64


In [19]:
train_df.head(2)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
8,4.4,2.9,1.4,0.2,0
106,4.9,2.5,4.5,1.7,2


In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# Separate features and target variable
X_train = train_df.drop('target', axis=1)
y_train = train_df['target']
X_test = test_df.drop('target', axis=1)
y_test = test_df['target']
# Initialize the KNN classifier
knn = KNeighborsClassifier(n_neighbors=3)
# Fit the model
knn.fit(X_train, y_train)
# Make predictions
y_pred = knn.predict(X_test)
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))  

Accuracy: 1.0


In [21]:
# save the model
import pickle
with open('knn_model_v0.pkl', 'wb') as f:
    pickle.dump(knn, f)


In [23]:
# lets try different value of k
for k in range(1, 11):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy for k={k}: {acc}")


# lets train the model with k=7
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Final Accuracy for k=7: {acc}")

Accuracy for k=1: 0.9666666666666667
Accuracy for k=2: 0.9333333333333333
Accuracy for k=3: 1.0
Accuracy for k=4: 1.0
Accuracy for k=5: 1.0
Accuracy for k=6: 0.9666666666666667
Accuracy for k=7: 0.9666666666666667
Accuracy for k=8: 0.9666666666666667
Accuracy for k=9: 1.0
Accuracy for k=10: 1.0
Final Accuracy for k=7: 0.9666666666666667


In [24]:
# save it
with open('knn_model_k_7.pkl', 'wb') as f:
    pickle.dump(knn, f)

In [25]:
# lets split the data into three parts: train, validation and test
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['target'])
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42, stratify=train_val_df['target'])

In [26]:
# now I will use train_df to train the model, val_df to compute the best value of k
X_train = train_df.drop('target', axis=1)
y_train = train_df['target']
X_val = val_df.drop('target', axis=1)
y_val = val_df['target']
# Find the best k using validation set
k = [1,3,5,7,9]
best_acc = 0
for i in k:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    if acc > best_acc:
        best_acc = acc
        best_k = i

print(f"Best k: {best_k} with accuracy: {best_acc}")


Best k: 1 with accuracy: 0.9583333333333334


In [27]:
# lets train the model with best k
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy for best k={best_k}: {acc}")

Validation Accuracy for best k=1: 0.9583333333333334


In [28]:
# use test set to evaluate the final model
X_test = test_df.drop('target', axis=1)
y_test = test_df['target']
y_pred = knn.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Test Accuracy for best k={best_k}: {acc}")

Test Accuracy for best k=1: 1.0


In [29]:
# you can use train data to see the model performance
X_train_full = train_val_df.drop('target', axis=1)
y_train_full = train_val_df['target']
y_pred = knn.predict(X_train_full)
acc = accuracy_score(y_train_full, y_pred)
print(f"Train Accuracy for best k={best_k}: {acc}")

Train Accuracy for best k=1: 0.9916666666666667
