In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data exploration

We need to explore the data to check whether any pre-processing step is necessary.

First, we load the CSV file into a pandas DataFrame.


In [None]:
# Read the heart-failure clinical records CSV into a DataFrame.
csv_path = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(csv_path)

Then we use the head() method, which returns the top n (5 by default) rows of a DataFrame.

We can see the dataset has features with different ranges. 

In [None]:
# Preview the first rows of the raw dataset (head() shows 5 rows by default).
df.head()

To check this difference between the feature ranges, we can use the describe() method, which returns summary statistics such as mean, std, min and max.

In [None]:
# Summary statistics for every column of the raw dataset, used to compare feature scales.
df.describe()

We can see that the column "creatine_phosphokinase" has a range between [299, 7861], while the column "serum_creatinine" has a range between [1.39, 9.4]. With this information, we need to normalize the data, but first, what is normalization?

## Normalization

“Normalizing” a vector most often means dividing by a norm of the vector. It also often refers to rescaling by the minimum and range of the vector, to make all the elements lie between 0 and 1 thus bringing all the values of numeric columns in the dataset to a common scale.

Normalization is useful when your data has varying scales and the algorithm you are using does not make assumptions about the distribution of your data, such as k-nearest neighbors and artificial neural networks.

To normalize the data, we will use the MinMaxScaler from the scikit-learn library.

In [None]:
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.preprocessing import MinMaxScaler #Import normalization library
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

# Min-max scale every column, rebuild a DataFrame with the original column
# names, then hold out 30% of the rows as the test split (fixed seed).
# NOTE(review): the scaler is fitted on the full frame before the split, so
# min/max statistics of the test rows leak into the scaling — consider
# fitting on the training split only.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df)
df_normalized = pd.DataFrame(scaled_values, columns=df.columns)

train, teste = train_test_split(
    df_normalized,
    test_size=0.3,
    random_state=1,
)


Here we can see that every feature now lies in the range [0, 1].

In [None]:
# Display the normalized DataFrame.
df_normalized

In [None]:
# Summary statistics of the normalized data, to check the min/max of each column.
df_normalized.describe()

After normalization, we can start building the k-NN model. First, we need a function that calculates the Euclidean distance between two points/rows.

In [None]:
import math
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between two rows, ignoring the last element.

    The final position of each row is skipped (the callers in this notebook
    store the class label there), so only the first ``len(row1) - 1``
    positions contribute to the distance.

    Args:
        row1: Indexable sequence of numbers; its length decides how many
            positions are compared.
        row2: Indexable sequence of numbers, indexed at the same positions.

    Returns:
        The square root of the summed squared differences.
    """
    squared_diffs = [
        (row1[idx] - row2[idx]) ** 2
        for idx in range(len(row1) - 1)
    ]
    # Accumulate left to right from 0.0 so the floating-point result is stable.
    total = 0.0
    for term in squared_diffs:
        total += term
    return math.sqrt(total)

Then we need a function that finds the nearest neighbors of a row.

We sort the candidate neighbors by their distance and keep the closest ones.


In [None]:
# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors):
    """Return the ``num_neighbors`` training rows closest to ``test_row``.

    Args:
        train: Iterable of training rows.
        test_row: The row whose neighbors are wanted.
        num_neighbors: How many of the closest rows to return.

    Returns:
        A list of training rows ordered from nearest to farthest, as
        measured by ``euclidean_distance``.

    Raises:
        IndexError: If ``num_neighbors`` exceeds the number of training rows.
    """
    # Pair every training row with its distance to the query row.
    scored = [
        (candidate, euclidean_distance(test_row, candidate))
        for candidate in train
    ]
    # Stable sort on the distance only; rows at equal distance keep training order.
    ranked = sorted(scored, key=lambda pair: pair[1])
    return [ranked[rank][0] for rank in range(num_neighbors)]

After that, we build the function that returns the prediction for a single row.

In [None]:
def predict_classification(train, test_row, num_neighbors):
    """Predict the label of ``test_row`` by majority vote of its nearest neighbors.

    Args:
        train: Iterable of training rows whose last element is the label.
        test_row: The row to classify.
        num_neighbors: Number of nearest neighbors that vote.

    Returns:
        The label occurring most often among the neighbors' last elements.
    """
    votes = []
    for neighbor in get_neighbors(train, test_row, num_neighbors):
        votes.append(neighbor[-1])
    # Pick the distinct label with the highest vote count.
    distinct_labels = set(votes)
    return max(distinct_labels, key=votes.count)

In [None]:

def k_nearest_neighbors(train, test, num_neighbors):
    """Classify every row of ``test`` with k-nearest-neighbors over ``train``.

    Args:
        train: Iterable of training rows whose last element is the label.
        test: Iterable of rows to classify.
        num_neighbors: Number of nearest neighbors that vote for each row.

    Returns:
        A list with one predicted label per row of ``test``, in order.
    """
    return [
        predict_classification(train, query_row, num_neighbors)
        for query_row in test
    ]

With the k-NN algorithm and k = 1, we achieved an accuracy of 74.444%.

In [None]:
# Classify the held-out rows with k = 1 and report the accuracy against the
# true DEATH_EVENT labels.
num_neighbors = 1
predicion = k_nearest_neighbors(train.values, teste.values, num_neighbors)
accuracy = metrics.accuracy_score(teste['DEATH_EVENT'], predicion)
print("Accuracy:", accuracy)