In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Content : 
1. [Read Data](#1)
1. [EXPLORATORY DATA ANALYSIS](#2)
1. [Visualization](#3)
1. [Data Preprocessing](#4)
1. [KNN Model](#5)


<a id="1"></a>

# 1. Read Data

In [None]:
# Load the two-class biomechanical dataset into a DataFrame.
csv_path = '../input/biomechanical-features-of-orthopedic-patients/column_2C_weka.csv'
data = pd.read_csv(csv_path)

<a id="2"></a>
# 2. EXPLORATORY DATA ANALYSIS

Let's start with Exploratory Data Analysis to learn about the data.

In [None]:
# Preview the dataset: head() returns the first five rows by default,
# which is enough to see the column names and typical values.

data.head()

In [None]:
# info() prints each column's dtype and non-null count,
# so comparing those counts with the row count reveals any NaN values.

data.info()

**We have no null values.**

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [None]:
# Correlation heatmap of the numeric features.
# `data` still contains the string-valued `class` column at this point, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (older
# versions silently dropped them). Selecting the numeric columns explicitly
# gives the same matrix on every pandas version.
numeric_corr = data.select_dtypes(include="number").corr()

fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(numeric_corr, annot=True, linewidths=.5, fmt='.2f', ax=ax)
plt.title("Corrolation Map")
plt.show()

<a id="3"></a>
# 3. Visualization

In [None]:
# Bar chart of how many samples fall in each class, followed by the exact counts
# (the last expression is rendered as the cell output).
sns.countplot(data=data, x="class")
data['class'].value_counts()

In [None]:
# Scatter pelvic incidence against pelvic radius, one colour per class.
# Each tuple is (value in the `class` column, marker colour, legend label).
class_styles = (
    ('Normal', 'green', 'Normal'),
    ('Abnormal', 'red', 'AbNormal'),
)
for class_value, colour, legend_label in class_styles:
    subset = data[data['class'] == class_value]
    plt.scatter(subset.pelvic_incidence, subset.pelvic_radius,
                color=colour, label=legend_label)

plt.xlabel('Pelvic incidence')
plt.ylabel('Pelvic Radius')
plt.legend()
plt.show()


In [None]:
# Pairwise scatter plots of every feature pair, coloured by class,
# with per-feature histograms on the diagonal.
sns.pairplot(data, hue='class', diag_kind='hist')
plt.show()

<a id="4"></a>
# 4. Data Preprocessing

In [None]:
# Encode the target: 'Normal' -> 1, anything else -> 0.
# The column is overwritten in place, so the encoding is guarded: re-running
# this cell after the labels are already numeric must not compare the 0/1
# values against 'Normal' again (that would silently turn every label into 0).
if not pd.api.types.is_numeric_dtype(data['class']):
    data['class'] = (data['class'] == 'Normal').astype(int)

y = data['class'].values
sns.countplot(x="class", data=data)
y

In [None]:
# Feature matrix before normalisation: every column except the target.
x_nanorm = data.drop(columns=['class'])
x_nanorm.head()

### 4.1 Normalization

In [None]:
# Min-max scale each feature to [0, 1]: (value - column min) / (column max - column min).
# The column-wise DataFrame reductions are used explicitly because
# np.min(df) / np.max(df) forward axis=None to pandas, which, depending on the
# pandas version, may reduce over the whole frame to a single scalar and so
# scale every feature by the same global range.
feature_min = x_nanorm.min()
feature_range = x_nanorm.max() - feature_min

# Series operands align on column labels, so each column uses its own min and range.
x = (x_nanorm - feature_min) / feature_range
x.head()

### 4.2 Test Train Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=8,
)

In [None]:
# Report how many rows ended up in each split and their sum.
n_train = len(x_train)
n_test = len(y_test)
print(
    'Train data size :', n_train,
    '\nTest Data size :', n_test,
    '\nTotal Data size : ', n_train + n_test,
)

<a id="5"></a>

# 5. KNN Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: fit a 3-nearest-neighbours classifier and report its accuracy on the test split.
initial_k = 3
knn = KNeighborsClassifier(n_neighbors=initial_k)
knn.fit(x_train, y_train)
initial_score = knn.score(x_test, y_test)
print(" {} nn score: {} ".format(initial_k, initial_score))

In [None]:
# Fit one KNN model per k in 1..19 and record its accuracy on the test split.
# score_list[j] holds the score for k = j + 1.
k_values = range(1, 20)
score_list = [
    KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train).score(x_test, y_test)
    for k in k_values
]

# Accuracy as a function of k.
plt.plot(k_values, score_list)
plt.xlabel('K values')
plt.ylabel('accuracy')
plt.show()

In [None]:
# Pick the k with the highest recorded score; score_list starts at k=1, hence the +1.
# list.index returns the first match, so ties resolve to the smallest k.
# NOTE(review): score_list was measured on the test split, so k is selected on the
# same data that the score printed below is computed from.
best_score = max(score_list)
optimal_k = score_list.index(best_score) + 1

knn = KNeighborsClassifier(n_neighbors=optimal_k)
knn.fit(x_train, y_train)
optimal_score = knn.score(x_test, y_test)
print(" {} nn score: {} ".format(optimal_k, optimal_score))