# Heart Disease Classification problem

### In this project I will work on finding the classification model that best predicts whether a person will have heart disease.

### The objective of this project is to improve my skills at developing machine learning functions.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix,classification_report
from pylab import rcParams
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

## Import data

In [None]:
# Load the heart-disease dataset from a relative CSV path.
# NOTE(review): the path looks like a Kaggle input directory — adjust it when running elsewhere.
data_raw = pd.read_csv("../input/heart-disease-uci/heart.csv")
# Preview the first rows of the loaded frame.
data_raw.head()

# Variables of the dataset

#### - age
#### - sex
#### - chest pain type (4 values)
#### - resting blood pressure
#### - serum cholesterol in mg/dl
#### - fasting blood sugar > 120 mg/dl
#### - resting electrocardiographic results (values 0,1,2)
#### - maximum heart rate achieved
#### - exercise induced angina
#### - oldpeak = ST depression induced by exercise relative to rest
#### - the slope of the peak exercise ST segment
#### - number of major vessels (0-3) colored by fluoroscopy
#### - thal: 3 = normal; 6 = fixed defect; 7 = reversible defect

## Quick data exploration

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
data_raw.info()

In [None]:
# Dimensions of the dataset as (number of rows, number of columns).
data_raw.shape

In [None]:
# Count the null entries in each column.
missing_values = data_raw.isnull().sum()
# Display the per-column null counts.
missing_values

### No null values were found.

In [None]:
# Descriptive statistics, transposed so that each row corresponds to one column of the dataset.
data_raw.describe().T

In [None]:
def plot_correlation(data):
    '''
    Plot the correlation matrix of ``data`` as an annotated heatmap to
    explore dependency between features.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose pairwise column correlations (``data.corr()``) are drawn.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    '''
    # Default figure size (optional). This writes to matplotlib's global
    # rcParams, so later figures created without an explicit size inherit it.
    rcParams['figure.figsize'] = 16, 7
    plt.figure()
    # Use the frame passed by the caller instead of the notebook-level
    # ``data_raw``, so the function works for any DataFrame.
    sns.heatmap(data.corr(), annot=True, fmt=".2f")
    plt.show()

# Plot the correlation heatmap of the raw dataset.
plot_correlation(data_raw)

### Looking at the correlation heatmap, we can see that there are no strong correlations among the variables: no off-diagonal value exceeds 0.5 or is less than -0.6.

In [None]:
# Working handle for the dataset. This is an alias, not a copy: df and
# data_raw refer to the same DataFrame object.
df = data_raw

In [None]:
# Feature matrix: the list [0,12] selects only the columns at positions 0 and
# 12 (two features), not the range of columns 0 through 12.
# NOTE(review): if all 13 predictors were intended, a slice such as 0:13 would
# be needed — confirm before changing, since it alters every reported result.
x = df.iloc[:, [0,12]].values
# Label vector taken from the "target" column.
y = df["target"].values

# Accuracy and Best K Calculator

In [None]:
def acc_calculator(x,y):
    '''
    Plot KNN test accuracy for K = 1..49 and print the best K.

    The data is split 80/20 (``random_state = 0``), a KNeighborsClassifier is
    fitted for every K in the range, and its accuracy on the test split is
    recorded and plotted against K.

    Parameters
    ----------
    x : array-like
        Feature matrix.
    y : array-like
        Class labels, one per row of ``x``.

    Returns
    -------
    None
        The result is shown as a plot and a printed summary line.
    '''
    from sklearn.model_selection import train_test_split
    from sklearn import metrics

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

    k_values = range(1, 50)
    acc = []
    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors = k)
        knn.fit(X_train, y_train)
        acc.append(metrics.accuracy_score(y_test, knn.predict(X_test)))

    plt.figure(figsize = (10,6))
    plt.plot(k_values, acc, color = 'blue', linestyle = 'dashed',
            marker = 'o', markerfacecolor = 'red', markersize = 10)
    plt.title('Accuracy vs K Value')
    plt.xlabel('K')
    plt.ylabel('Accuracy')

    best_acc = max(acc)
    # acc is 0-indexed while K starts at 1, so map the list position back to
    # the K value it was computed for (ties resolve to the smallest K).
    best_k = k_values[acc.index(best_acc)]
    print('Maximum accuracy:', best_acc, 'at K=', best_k)

In [None]:
# Plot test accuracy against K for the features and labels selected above.
acc_calculator(x,y)

In [None]:
def k_calculator(x,y):
    '''
    Plot KNN test error rate for K = 1..49 and print the best K.

    The data is split 80/20 (``random_state = 0``), a KNeighborsClassifier is
    fitted for every K in the range, and the fraction of misclassified test
    samples is recorded and plotted against K.

    Parameters
    ----------
    x : array-like
        Feature matrix.
    y : array-like
        Class labels, one per row of ``x``.

    Returns
    -------
    None
        The result is shown as a plot and a printed summary line.
    '''
    # The arguments are used as passed; they are no longer overwritten with
    # values read from the notebook-level ``df``.
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

    k_values = range(1, 50)
    error_rate = []
    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors = k)
        knn.fit(X_train, y_train)
        pred_k = knn.predict(X_test)
        # Share of test samples whose prediction differs from the true label.
        error_rate.append(np.mean(pred_k != y_test))

    plt.figure(figsize=(10,6))
    plt.plot(k_values, error_rate, color = 'blue', linestyle = 'dashed',
            marker = 'o', markerfacecolor = 'red', markersize = 10)
    plt.title('Error Rate vs K Value')
    plt.xlabel('K')
    plt.ylabel('Error Rate')

    min_error = min(error_rate)
    # error_rate is 0-indexed while K starts at 1, so map the list position
    # back to the K value it was computed for (ties resolve to the smallest K).
    best_k = k_values[error_rate.index(min_error)]
    print('Minimum error:', min_error, 'at K =', best_k)

In [None]:
# Plot test error rate against K for the features and labels selected above.
k_calculator(x,y)

# KNN Model

In [None]:
def KNN_model(data, n_neighbors = 2):
    '''
    Train and evaluate a K-nearest-neighbours classifier on ``data``.

    Features and labels are taken from ``data`` itself rather than from
    notebook-level variables, so the function works on any frame with the
    same layout.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset with a "target" column holding the class labels.
    n_neighbors : int, optional
        Number of neighbours used by the classifier (default 2).

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix)`` computed on the 20% test split.
    '''
    from sklearn.model_selection import train_test_split

    # Same selection as the rest of the notebook: the list [0,12] picks only
    # the columns at positions 0 and 12, not the range 0..12.
    # NOTE(review): confirm whether all 13 predictors (0:13) were intended.
    x = data.iloc[:, [0,12]].values
    y = data["target"].values

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

    model = KNeighborsClassifier(n_neighbors = n_neighbors)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)

    return accuracy_score(y_test, y_pred), cm

In [None]:
# Evaluate the KNN model; the cell displays the returned (accuracy, confusion matrix) tuple.
KNN_model(df)

# Other Classifiers


# SVM

In [None]:
def SVM_linearModel(data):
    '''
    Train and evaluate an SVM with a linear kernel on ``data``.

    Features and labels are taken from ``data`` itself rather than from
    notebook-level variables, so the function works on any frame with the
    same layout.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset with a "target" column holding the class labels.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix)`` computed on the 20% test split.
    '''
    from sklearn.model_selection import train_test_split
    from sklearn import svm

    # Same selection as the rest of the notebook: the list [0,12] picks only
    # the columns at positions 0 and 12, not the range 0..12.
    # NOTE(review): confirm whether all 13 predictors (0:13) were intended.
    x = data.iloc[:, [0,12]].values
    y = data["target"].values

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

    # instantiate learning model and fit data
    linealM = svm.SVC(kernel='linear')
    linealM.fit(X_train, y_train)

    # predict the response
    pred = linealM.predict(X_test)
    # confusion matrix
    cm_svm = confusion_matrix(y_test, pred)

    # evaluate and return accuracy
    return accuracy_score(y_test, pred), cm_svm

In [None]:
# Evaluate the linear-kernel SVM; the cell displays the returned (accuracy, confusion matrix) tuple.
SVM_linearModel(df)

# Decision Tree

In [None]:
def DecTree(data):
    '''
    Train and evaluate a decision-tree classifier (entropy criterion) on
    ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset with a "target" column holding the class labels.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix)`` computed on the 20% test split.
    '''
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier

    # Read from the ``data`` argument instead of the notebook-level ``df``.
    # The list [0,12] picks only the columns at positions 0 and 12, not the
    # range 0..12.
    # NOTE(review): confirm whether all 13 predictors (0:13) were intended.
    x = data.iloc[:, [0,12]].values
    y = data["target"].values

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

    # random_state is fixed so the fitted tree is reproducible between runs.
    model = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)

    return accuracy_score(y_test, y_pred), cm
    

In [None]:
# Evaluate the decision tree; the cell displays the returned (accuracy, confusion matrix) tuple.
DecTree(df)

# Conclusion

### Three machine learning models were used to determine which one was best at predicting whether a person would have heart disease.

### With an accuracy of 77%, the SVM model obtained the best result.