In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import pylab as pl
import scipy.optimize as opt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
%matplotlib inline 
import matplotlib.pyplot as plt


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for entry in names_in_folder:
        print(os.path.join(folder, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Import Data

In [None]:
# Load the heart-attack dataset CSV into a DataFrame.
# The relative path follows the Kaggle "../input/" directory layout described in the first cell.
df = pd.read_csv("../input/heart-attack-analysis-prediction-dataset/heart.csv")

# 2. Visualization

### See the Tableau dashboard at: https://public.tableau.com/app/profile/saadeddine.loughzali/viz/HeartAttackProject-Analysis/CasesSummary

In [None]:
# Preview only the first rows instead of emitting the whole frame as the cell output.
df.head()

# 3. Pre-processing

### 3.1 Convert Categorical features to numerical values

In [None]:
# Categorical feature columns; these are the columns passed to pd.get_dummies
# in the one-hot encoding cell.
cat_cols = [
    "sex",
    "exng",
    "caa",
    "cp",
    "fbs",
    "restecg",
    "slp",
    "thall",
]

# Continuous (numeric) feature columns.
con_cols = [
    "age",
    "trtbps",
    "chol",
    "thalachh",
    "oldpeak",
]

# Name of the target column, kept as a one-element list.
output_col = ["output"]

### 3.2 One Hot Encoding

In [None]:
# One-hot encode the categorical columns. drop_first=True drops the first level
# of each encoded column. get_dummies returns a new frame, so `df` is untouched.
dfc = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Target: kept as a single-column DataFrame (note the double brackets).
y = dfc[['output']]

# Features: every remaining column.
X = dfc.drop(columns=['output'])

### 3.3 Feature Selection

### 3.4 Normalize Data

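Data standardization gives the data zero mean and unit variance. Strictly speaking, the scaler should be fitted after the train/test split, on the training set only, so that no information from the test set leaks into it. (No standardization step is applied in the cells shown in this notebook.)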

# 4. Classification

Let's build an accurate model, then use the test set to report the accuracy of the model.
We will try the following algorithms:

*   K Nearest Neighbor(KNN)
*   Decision Tree
*   Support Vector Machine
*   Logistic Regression

### Train / Test Split

In [None]:
# Hold out 20% of the rows as the test set; random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, columns) shape of each partition.
for caption, features_part, target_part in (
    ('Train set:', X_train, y_train),
    ('Test set:', X_test, y_test),
):
    print(caption, features_part.shape, target_part.shape)

## 4.1. Logistic Regression

### 4.1.1 Modeling and Predicting

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Fit a logistic-regression classifier with the liblinear solver and C=0.01.
# y_train is a single-column DataFrame (it was built with dfc[['output']]).
# Passing it as-is makes scikit-learn emit a DataConversionWarning, so flatten
# it to a 1-D array first, as the KNN sweep cell already does.
LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train, y_train.values.ravel())
LR  # last expression: displays the fitted estimator

In [None]:
# Predict a class label for every test-set row with the fitted logistic-regression model.
yhat = LR.predict(X_test)
yhat  # last expression: displays the predicted labels as the cell output

### 4.1.2 Metrics

In [None]:
from sklearn.metrics import jaccard_score

# Jaccard score of the current predictions in `yhat` against the test labels.
# pos_label=0 makes class 0 the one being scored.
# NOTE(review): confirm that class 0 (rather than class 1) is the class of interest here.
jaccard_score(y_true=y_test, y_pred=yhat, pos_label=0)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    The plot is drawn on the current pyplot figure; nothing is returned.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D confusion matrix. Rows are labelled "True label" and columns
        "Predicted label" on the plot.
    classes : sequence
        Tick labels, in the same order as the rows/columns of `cm`.
    normalize : bool, default False
        When True, every row is divided by its row sum before printing and
        plotting. A row whose sum is zero (a class with no samples) is left
        as all zeros instead of producing NaN values.
    title : str, default 'Confusion matrix'
        Title placed above the plot.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap passed to `plt.imshow`.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero. Entries of empty rows
        # keep the 0.0 they were initialised with, which avoids NaNs that
        # would also break the text-colour threshold computed below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum value get white text; the rest get black.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass after the axis labels exist so that it accounts for them.
    plt.tight_layout()
# Print the raw confusion matrix for the current predictions in `yhat`,
# with rows and columns ordered as labels [1, 0].
print(confusion_matrix(y_test, yhat, labels=[1,0]))

In [None]:
# Show printed arrays with two decimal places.
np.set_printoptions(precision=2)

# Confusion matrix of the current predictions, with class 1 listed before class 0.
label_order = [1, 0]
cnf_matrix = confusion_matrix(y_test, yhat, labels=label_order)

# Draw the raw (non-normalized) counts on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=[f'output={label}' for label in label_order],
    normalize=False,
    title='Confusion matrix',
)

## 4.2 K Nearest Neighbor(KNN)


### 4.2.1 Modeling, Predicting and metrics

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
k = 4
# Train Model and Predict
# y_train is a single-column DataFrame. Flatten it to 1-D so scikit-learn does
# not emit a DataConversionWarning, matching what the k-sweep cell does.
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train, y_train.values.ravel())
neigh  # last expression: displays the fitted estimator

In [None]:
# Predict test-set labels with the KNN model currently bound to `neigh`.
# Note that this overwrites the earlier `yhat`.
yhat = neigh.predict(X_test)
yhat  # last expression: displays the predicted labels as the cell output

In [None]:
# Evaluate KNN for every k in 1 .. Ks-1. For each k, record the test accuracy
# and the standard error of the per-sample correctness indicator.
Ks = 50
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))

# The targets are single-column DataFrames. Flatten them once so predictions
# and labels compare element-wise as plain 1-D arrays, with no hard-coded
# test-set size.
y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

for n in range(1,Ks):

    # Train Model and Predict
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train, y_train_flat)
    yhat = neigh.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test_flat, yhat)

    # Boolean vector marking the correctly classified test rows.
    correct = (yhat == y_test_flat)
    std_acc[n-1] = np.std(correct) / np.sqrt(correct.shape[0])

# Report the best k from the data rather than reading it off the array by eye.
print("Best accuracy:", mean_acc.max(), "with k =", mean_acc.argmax() + 1)
mean_acc

#Best results for K=11, then K=12, 13, 14, then K=6

## 4.3 Decision Tree

### 4.3.1 Modeling and Predicting

In [None]:
from sklearn.tree import DecisionTreeClassifier
#X_train, X_test, y_train, y_test

In [None]:
# Entropy-based decision tree limited to depth 4.
# random_state is fixed (42, the same seed used for the train/test split)
# because scikit-learn permutes the candidate features randomly at each split.
# Without a seed, the fitted tree and its accuracy can change between runs.
haTree = DecisionTreeClassifier(criterion="entropy", max_depth = 4, random_state = 42)
haTree # display the (not yet fitted) estimator and its parameters

In [None]:
# Fit the decision tree on the training split.
# fit() is the last expression, so the cell displays whatever it returns.
haTree.fit(X_train,y_train)

In [None]:
# Predict test-set labels with the fitted decision tree.
predTree = haTree.predict(X_test)

In [None]:
# Fraction of test rows the decision tree classified correctly.
tree_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predTree)
print("DecisionTrees's Accuracy: ", tree_accuracy)

## 4.4 Support Vector Machine

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
# Linear-kernel SVM with C=1 and a fixed random_state.
# y_train is a single-column DataFrame. Flatten it to 1-D so scikit-learn does
# not emit a DataConversionWarning, matching what the KNN sweep cell does.
# NOTE(review): the features are not standardized anywhere in the visible
# notebook; confirm whether scaling should be applied before fitting.
clf = SVC(kernel='linear', C=1, random_state=42).fit(X_train, y_train.values.ravel())

# Predict the test split and report plain accuracy.
y_pred = clf.predict(X_test)

print("SVM accuracy score: ", accuracy_score(y_test, y_pred))

# Conclusion

## Highest scores:
#### Logistic Regression: 0.72
#### KNN: 0.75 (K=11)
#### Decision Tree: 0.67
#### SVM: 0.85