In [1]:
from google.colab import data_table
# Switch on Colab's data_table formatter so pandas DataFrames displayed later
# in this notebook use it instead of the default static rendering.
data_table.enable_dataframe_formatter()

In [2]:
cd "/content/drive/MyDrive/Data Analytics/Datasets"

/content/drive/MyDrive/Data Analytics/Datasets


## Implementation of KNN using sklearn

For the sklearn toy dataset "Breast Cancer", you need to perform KNN on this dataset using Sklearn.

Note: Use "random_state=42"

Output

Testing score rounded to 2 decimal places

In [3]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

bc = load_breast_cancer()

# Feature matrix and class labels of the Breast Cancer toy dataset.
x, y = bc.data, bc.target

# No test_size is passed, so the library's default split proportion applies;
# the seed is pinned to 42 as the exercise requires.
x_train, x_test, y_train, y_test = train_test_split(
  x, y, random_state = 42
)

# fit() returns the estimator itself, so construction and training are chained.
clf = KNeighborsClassifier().fit(x_train, y_train)

# Score on the held-out split, rounded to 2 decimal places for the expected output.
test_score = clf.score(x_test, y_test)
score = round(test_score, 2)

print(score)

0.97


## Implementation of Cross Validation using sklearn

For the sklearn toy dataset "Iris", you need to calculate the Cross-validation score using the sklearn library.

Note: While using KFold keep the parameters as:

- n_splits=3
- shuffle=True
- random_state=42

Output

Print the cross_val_score for 3 splits.

In [5]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

iris = load_iris()

# Feature matrix and class labels of the Iris toy dataset.
x, y = iris.data, iris.target

# NOTE(review): LinearRegression is a regressor, so cross_val_score reports its
# default score (R^2) on the integer class labels, not classification accuracy.
# KNeighborsClassifier is imported in this cell but never used — confirm which
# estimator the exercise intends before changing it, since the printed values
# depend on this choice.
clf = LinearRegression()

# 3 shuffled folds with a fixed seed, exactly as the exercise specifies.
kfold = KFold(n_splits = 3, shuffle = True, random_state = 42)
cvs = cross_val_score(clf, x, y, cv = kfold)

print(cvs)

[0.93882777 0.91104195 0.92583726]


## Find optimal K

For the sklearn toy dataset "Iris", you need to calculate the optimal value of K for the KNN classifier using the sklearn library.

Note: While using KFold keep the parameters as:

- n_splits=3
- shuffle=True
- random_state=42

Output

Print the optimal value of k.

In [8]:
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

iris = load_iris()

x = iris.data
y = iris.target

# One splitter for every candidate: with an integer random_state the folds are
# the same on each use, so the mean scores of different k are comparable.
kfold = KFold(n_splits=3, shuffle=True, random_state=42)

# Track the best candidate instead of printing a hard-coded answer, so the
# reported k is always derived from the scores actually computed.
best_k = None
best_mean_score = float("-inf")

# Odd k from 1 to 25.
for i in range(1, 26, 2):
  clf = KNeighborsClassifier(n_neighbors = i)
  score = cross_val_score(clf, x, y, cv = kfold)
  mean_score = score.mean()
  # Strict '>' keeps the smallest k when several candidates tie on mean score.
  if mean_score > best_mean_score:
    best_k = i
    best_mean_score = mean_score

print(best_k)

11


## KNN Self Implementation

As described in the video "Self Implementation of KNN", write the "predict" and "predict_one" functions to calculate the accuracy score.

Note: All inputs and outputs are already handled; you only need to write the "predict" and "predict_one" functions to get the correct accuracy score.

Output

Accuracy Score for KNN predictions.

In [10]:
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import accuracy_score

cancer = datasets.load_breast_cancer()

# Hold out 20% of the samples for evaluation; the seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
  cancer.data,
  cancer.target,
  test_size = 0.2,
  random_state = 42,
)

def predict_one(x_train, y_train, x_test, k):
  """Predict the label of a single sample by majority vote of its k nearest training rows.

  Args:
    x_train: 2-D array of training samples, indexed as x_train[row, :].
    y_train: labels, where y_train[row] belongs to x_train[row].
    x_test: one sample to classify; must broadcast against a training row.
    k: number of nearest training rows that vote.

  Returns:
    The most common label among the k nearest rows. When counts tie, the
    label that was encountered first (i.e. belonging to the nearer row) wins.

  Raises:
    IndexError: if k exceeds the number of training rows, or if k is 0
      (there is then no vote to pick from).
  """
  # Squared Euclidean distance is enough for ranking: omitting the square root
  # does not change the order. Each entry is [distance, row], so equal
  # distances are ordered by row index.
  ranked = sorted(
    [((x_train[row, :] - x_test) ** 2).sum(), row]
    for row in range(len(x_train))
  )
  # Collect the labels of the k closest rows, nearest first.
  votes = Counter(y_train[ranked[position][1]] for position in range(k))
  winner = votes.most_common(1)[0]
  return winner[0]

def predict(x_train, y_train, x_test_data, k):
  """Predict a label for every sample in x_test_data.

  Args:
    x_train: training samples, passed through to predict_one.
    y_train: training labels, passed through to predict_one.
    x_test_data: iterable of samples to classify.
    k: number of neighbours that vote for each sample.

  Returns:
    A list of predicted labels, in the same order as x_test_data.
  """
  return [predict_one(x_train, y_train, sample, k) for sample in x_test_data]

# Classify every held-out sample with k = 7 neighbours, then report the
# fraction of predictions that match the true labels.
y_pred = predict(X_train, Y_train, X_test, k = 7)
accuracy = accuracy_score(Y_test, y_pred)
print(accuracy)

0.956140350877193
