In [63]:
# Import packages
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import train_test_split, GridSearchCV

In [64]:
df = pd.read_csv('../data/preprocessed_data.csv') 

In [65]:
label = df['HeartDisease']
data = df.drop('HeartDisease', axis=1) 

In [66]:
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2)

In [67]:
knnmodel = KNeighborsClassifier() #using default of 5 for now 
knnmodel.fit(X_train, y_train)  

In [68]:
knnmodel.score(X_test, y_test) 

0.6684782608695652

## Grid Search

In [69]:
param_grid = {'n_neighbors': [2, 3, 5, 10, 15, 20, 50],
              'metric': ['euclidean', 'manhattan', 'minkowski']} # Default is minkowski. 

In [70]:
gridsearch = GridSearchCV(estimator=knnmodel, param_grid=param_grid)
gridsearch.fit(X_train, y_train) 

In [71]:
gridsearch.best_params_

{'metric': 'manhattan', 'n_neighbors': 5}

In [72]:
gridsearch.best_estimator_.score(X_test, y_test)

0.7608695652173914

## Using only highly correlated features

In [73]:
#correlation is calculated here to extract features with high correlation
correlation = df.corr()['HeartDisease'].sort_values(ascending=False)

#features with correlation higher than 0.25 with heart disease is done
high_corr_features = correlation[abs(correlation) > 0.25].index.tolist()
high_corr_features.remove('HeartDisease')

In [74]:
# feature matrix for X and Y is selected here
X = df[high_corr_features]
y = df['HeartDisease'] 

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
knnmodel = KNeighborsClassifier() #using default of 5 for now 
knnmodel.fit(X_train, y_train)  

In [76]:
knnmodel.score(X_test, y_test) 

0.8315217391304348

Using highly correlated features results in a better performance.

## Try out PCA

In [77]:
from sklearn.decomposition import PCA

If n_components == 'mle' and svd_solver == 'full', Minka’s MLE is used to guess the dimension. Use of n_components == 'mle' will interpret svd_solver == 'auto' as svd_solver == 'full'.

In [78]:
pca = PCA(n_components='mle') 

In [79]:
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test) 

In [80]:
X_train_reduced.shape, X_test_reduced.shape

((734, 10), (184, 10))

In [81]:
X_train.shape, X_test.shape

((734, 13), (184, 13))

In [82]:
knnmodel.fit(X_train_reduced, y_train)

In [83]:
knnmodel.score(X_test_reduced, y_test) 

0.8315217391304348