# Contents:
   * [Read Data](#1)
   * [Outlier Detection](#2)
   * [Normalization](#3)
   * [Train Test Split](#4)
   * [KNN Model](#5)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from collections import Counter

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id="1"></a><br>
# Read Data

In [None]:
# Load the two-class orthopedic patients dataset
data = pd.read_csv("../input/biomechanical-features-of-orthopedic-patients/column_2C_weka.csv")
# Encode the label as integers: 1 for "Abnormal", 0 for anything else
data["class"] = data["class"].eq("Abnormal").astype("int64")

In [None]:
# Summary of the freshly loaded frame (columns, non-null counts, dtypes)
data.info()

In [None]:
# Descriptive statistics of the numeric columns before any rows are removed
data.describe()

<a id="2"></a><br>
# Outlier Detection

In [None]:
def detect_outliers(data, features, threshold=2, iqr_factor=1.5):
    """Return the index labels of rows that are IQR outliers in several features.

    For each feature, a value counts as an outlier when it lies more than
    ``iqr_factor * IQR`` below the first quartile or above the third quartile
    of that feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to scan.
    features : iterable
        Column labels of ``data`` to check.
    threshold : int, default 2
        A row is reported only if it is an outlier in strictly more than
        ``threshold`` of the given features.
    iqr_factor : float, default 1.5
        Multiplier applied to the inter-quartile range to build the fences.

    Returns
    -------
    list
        Index labels of the flagged rows, in order of first detection.
    """
    # Number of features in which each row (by index label) is an outlier
    outlier_counts = Counter()

    for c in features:
        column = data[c]
        # 1st and 3rd quartiles
        Q1, Q3 = np.percentile(column, [25, 75])
        # Outlier step: distance beyond the quartiles that is still tolerated
        outlier_step = (Q3 - Q1) * iqr_factor
        # Rows falling outside the fences for this feature
        is_outlier = (column < Q1 - outlier_step) | (column > Q3 + outlier_step)
        # Store indices
        outlier_counts.update(column[is_outlier].index)

    return [idx for idx, count in outlier_counts.items() if count > threshold]

# Feature column names to scan; the encoded "class" label is a target, not a
# measurement, so it is excluded from outlier detection.
columns = [name for name in data.columns if name != "class"]
# Rows flagged as outliers in several features (see detect_outliers)
outliers = data.loc[detect_outliers(data,columns)]

In [None]:
# Display the rows selected by detect_outliers
outliers

## Drop Outliers

In [None]:
# Remove the rows flagged by detect_outliers and renumber the index from 0
rows_to_drop = detect_outliers(data, columns)
data = data.drop(index=rows_to_drop).reset_index(drop=True)

In [None]:
# Summary of the frame after the outlier rows were dropped
data.info()

In [None]:
# Descriptive statistics after the outlier rows were dropped
data.describe()

In [None]:
# Partition the rows by encoded label (1 = Abnormal, 0 = Normal)
is_abnormal = data["class"] == 1
is_normal = data["class"] == 0
abnormal = data.loc[is_abnormal]
normal = data.loc[is_normal]

In [None]:
# Scatter plot of pelvic incidence against pelvic radius, colored by class
plt.scatter(abnormal.pelvic_incidence,abnormal.pelvic_radius,color="red",label = "Abnormal",alpha = 0.6)
plt.scatter(normal.pelvic_incidence,normal.pelvic_radius,color="green",label = "Normal",alpha = 0.5)
# Axis labels name the columns that are actually plotted
plt.xlabel("pelvic_incidence")
plt.ylabel("pelvic_radius")
# Without a legend the label= arguments above are never shown
plt.legend()
plt.show()

In [None]:
# Separate the target vector (NumPy array) from the feature table
y = data["class"].to_numpy()
x_data = data.drop(columns=["class"])

In [None]:
# Display the label array (1 = "Abnormal", 0 = otherwise, per the encoding applied at load time)
y

In [None]:
# Preview the first rows of the feature table (label column removed)
x_data.head()

<a id="3"></a><br>
# Normalization

In [None]:
# Min-max scale every feature column to the [0, 1] range.
# DataFrame.min()/max() reduce per column by default. np.min/np.max on a
# DataFrame pass axis=None, which pandas >= 2.0 treats as a reduction over
# both axes (a single scalar), silently turning this into a global scaling.
col_min = x_data.min()
col_max = x_data.max()
# NOTE(review): a constant column would give a zero range and NaN here — confirm none exist.
x = (x_data - col_min) / (col_max - col_min)

<a id="4"></a><br>
# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

<a id="5"></a><br>
# KNN Model:
    1. Pick a value for K.
    2. Search for the K observations in the training data that are "nearest" to the measurements of the unknown patient.
    3. Use the most popular response value from the K nearest neighbors as the predicted response value for the unknown patient.
* With K = 1, scoring the model on its own training data would always give 100% accuracy, because we would be testing on the exact same data it was fitted on
* KNN would search for the one nearest observation and find that exact same observation
    * KNN has memorized the training set
    * Because we would be testing on the exact same data, it would always return the label it already stored
* This is why the model below is scored on the held-out test set rather than on the training set


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 1-nearest-neighbour classifier on the training split (fit returns the estimator itself)
knn = KNeighborsClassifier(n_neighbors=1).fit(x_train, y_train)
# Predicted class labels for the held-out rows
pred = knn.predict(x_test)

In [None]:
# Display the labels predicted for x_test
pred

In [None]:
# Report the mean accuracy of the 1-NN model on the test split
test_accuracy = knn.score(x_test, y_test)
print(f"{1} nn score:{test_accuracy}")

Above, K = 1 was only a guess. A more systematic way to choose the `n_neighbors` value is to try a range of values and compare the resulting accuracy.

In [None]:
# Compare test-set accuracy across candidate neighbour counts

k_values = range(1,15)
score_list = []
for k in k_values:
    # fit returns the estimator, so the model can be built and trained in one step
    knn2 = KNeighborsClassifier(n_neighbors = k).fit(x_train,y_train)
    accuracy = knn2.score(x_test,y_test)
    score_list.append(accuracy)

# Accuracy as a function of k
plt.plot(k_values,score_list)
plt.xlabel("k values")
plt.ylabel("accuracy")
plt.show()

As can be seen, when our k value is 1, our accuracy rate is maximum.