# KNN Algorithm
* [Import and Visualize Dataset](#1)
* [KNN with sklearn](#2)
* [Conclusion](#3)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

<a id=1> </a>
## Import & Visualize Dataset

In [None]:
# Load the 2C and 3C CSV files of the orthopedic-patients dataset into
# two separate DataFrames.
weka_2C, weka_3C = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/biomechanical-features-of-orthopedic-patients/column_2C_weka.csv",
        "../input/biomechanical-features-of-orthopedic-patients/column_3C_weka.csv",
    )
)

In [None]:
# Preview the first 10 rows of the 2C table.
weka_2C.head(n=10)

In [None]:
# Preview the first 10 rows of the 3C table.
weka_3C.head(n=10)

In [None]:
# Count the missing values in each column of the 2C table.
weka_2C.isna().sum()

In [None]:
# Count the missing values in each column of the 3C table.
weka_3C.isna().sum()

In [None]:
# Number of rows per label in the "class" column of the 3C table.
weka_3C["class"].value_counts()

In [None]:
# Number of rows per label in the "class" column of the 2C table.
weka_2C["class"].value_counts()

In [None]:
# Summary statistics for the 2C table.
weka_2C.describe()

In [None]:
# Pairwise correlation between the numeric features of the 2C table.
# At this point the "class" column still holds text labels, and recent pandas
# versions raise when corr() meets a non-numeric column, so restrict the
# computation to the numeric columns explicitly.
weka_2C.select_dtypes(include="number").corr()

In [None]:
# Partition the 2C table by label: A holds the "Abnormal" rows, N the "Normal" rows.
A = weka_2C.loc[weka_2C["class"].eq("Abnormal")]
N = weka_2C.loc[weka_2C["class"].eq("Normal")]

In [None]:
# Scatter plot of sacral slope against degree of spondylolisthesis,
# one semi-transparent point cloud per class (abnormal first, then normal).
for frame, colour, tag in ((A, "red", "abnormal"), (N, "blue", "normal")):
    plt.scatter(
        frame["degree_spondylolisthesis"],
        frame["sacral_slope"],
        color=colour,
        label=tag,
        alpha=0.2,
    )
plt.xlabel("Degree Spondylolisthesis")
plt.ylabel("Sacral Slope")
plt.legend()
plt.show()

In [None]:
# Scatter plot of sacral slope against lumbar lordosis angle,
# one semi-transparent point cloud per class (abnormal first, then normal).
for frame, colour, tag in ((A, "red", "abnormal"), (N, "blue", "normal")):
    plt.scatter(
        frame["lumbar_lordosis_angle"],
        frame["sacral_slope"],
        color=colour,
        label=tag,
        alpha=0.2,
    )
plt.xlabel("Lumbar Lordosis Angle")
plt.ylabel("Sacral Slope")
plt.legend()
plt.show()

<a id=2></a>
## KNN with sklearn

In [None]:
# Using KNN
# Encode the class labels as integers: 1 for Normal, 0 for Abnormal
# (any value that is not "Normal" becomes 0).
# An already-encoded 1 is accepted as well, so re-running this cell keeps the
# labels instead of silently turning every row into 0.
weka_2C["class"] = [1 if each in ("Normal", 1) else 0 for each in weka_2C["class"]]
# y is the target vector, X_data the feature table without the label column.
y = weka_2C["class"].values
X_data = weka_2C.drop(["class"], axis=1)

In [None]:
# Min-max normalization: rescale every feature column to the [0, 1] range.
# Use the DataFrame's own column-wise min()/max() so each feature is scaled by
# its own range; np.min/np.max applied to a DataFrame pass axis=None, which on
# recent pandas versions collapses the whole table to a single global value.
col_min = X_data.min()
col_max = X_data.max()
X = (X_data - col_min) / (col_max - col_min)

In [None]:
# Hold out 30% of the samples for testing and train on the remaining 70%.
# random_state is fixed so the same split is produced on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=34,
)

In [None]:
# First KNN model.
from sklearn.neighbors import KNeighborsClassifier as KNC

# n_neighbors is the k value. k is a hyperparameter that needs tuning, so
# start with k=3 here and search for better-scoring k values afterwards.
knn = KNC(n_neighbors=3).fit(X_train, y_train)
prediction = knn.predict(X_test)

In [None]:
# Display the array of labels predicted for the test samples.
prediction 

In [None]:
# Report the test-set score of the current model. The k in the label is read
# from the fitted model itself so it cannot disagree with the model being scored.
print("{}-nn score: {}".format(knn.n_neighbors, knn.score(X_test, y_test)))

In [None]:
# Sweep k = 1..17 and record the test-set score of each model so that a better
# k than the initial guess can be picked.
k_values = list(range(1, 18))
score_list = []  # score_list[i] is the score obtained with k_values[i]
for each in k_values:
    knn2 = KNC(n_neighbors=each)
    knn2.fit(X_train, y_train)
    score_list.append(knn2.score(X_test, y_test))

# Score-vs-k plot. The k values are passed explicitly as x: plotting score_list
# on its own would use the list indices 0..16, placing every point one k to the
# left of the model that actually produced it.
plt.plot(k_values, score_list)
plt.xticks(k_values)
plt.xlabel("k values")
plt.ylabel("score")
plt.show()

In [None]:
# Second KNN model: refit with k=6 and report its test-set score.
from sklearn.neighbors import KNeighborsClassifier as KNC

knn = KNC(n_neighbors=6).fit(X_train, y_train)
prediction = knn.predict(X_test)
print("{}-nn score: {}".format(knn.n_neighbors, knn.score(X_test, y_test)))

In [None]:
# Third KNN model: refit with k=14 and report its test-set score.
from sklearn.neighbors import KNeighborsClassifier as KNC

knn = KNC(n_neighbors=14).fit(X_train, y_train)
prediction = knn.predict(X_test)
print("{}-nn score: {}".format(knn.n_neighbors, knn.score(X_test, y_test)))

<a id=3></a>
## Conclusion

According to the score-vs-k plot, some k values give better scores than others. For example, selecting k=14 gives a score of 81.7%.