In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report 
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Objective
### To predict whether a patient is at risk for a heart attack. This is a binary outcome.

Positive (+) = 1, patient is at risk
Negative (-) = 0, patient is not at risk

# Understanding the Dataset
* age (#)
* sex : 1 = Male, 0 = Female (Binary)
* (cp) chest pain [type (4 values, Ordinal)]: 1: typical angina, 2: atypical angina, 3: non-anginal pain, 4: asymptomatic
* (trestbps) resting blood pressure (#)
* (chol) serum cholesterol in mg/dl (#)
* (fbs) fasting blood sugar > 120 mg/dl (Binary) [1 = true; 0 = false]
* (restecg) resting electrocardiographic results [values 0,1,2]
* (thalach) maximum heart rate achieved (#)
* (exang) exercise induced angina (Binary) [1 = yes; 0 = no]
* (oldpeak) = ST depression induced by exercise relative to rest (#)
* (slope) of the peak exercise ST segment (Ordinal) [1: upsloping, 2: flat, 3: downsloping]
* (ca) number of major vessels (0-3, Ordinal) colored by fluoroscopy
* (thal) thallium stress test result (Ordinal) [3 = normal; 6 = fixed defect; 7 = reversible defect]

# Exploring the Dataset 

In [None]:
# Load the raw heart-attack dataset from the Kaggle input directory;
# the trailing expression displays its (rows, columns) shape.
DATA_PATH = '/kaggle/input/heart-attack-prediction/data.csv'
data = pd.read_csv(DATA_PATH)
data.shape

In [None]:
# Show the dtype pandas inferred for each column of the raw frame.
data.dtypes

## visualization of Correlation in Data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Correlate only the numeric/boolean columns. Columns read as strings (later cells
# look for '?' placeholders in them) cannot be correlated: older pandas silently
# skipped them, while pandas >= 2.0 raises on them.
corrmat = data.select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(12, 9))
# Draw onto the axes created above instead of relying on the implicit current axes.
sns.heatmap(corrmat, vmax=.8, square=True, annot=True, ax=ax);

**OldPeak** (ST depression induced by exercise relative to rest) and **CP** (chest pain type) show the strongest correlation with the **target** (diagnosis of heart disease).

In [None]:

# For every feature column (any column whose name contains 'num' is skipped) report
# the share of unknown values, marked with '?', and the median / mean of the known values.
for col in data.columns:
    if 'num' in col:
        continue
    print(col, '\n------------\n')
    is_unknown = data[col] == '?'
    # Columns that contain '?' hold strings. Convert the known values to numbers so
    # the statistics are real numeric statistics instead of string operations
    # (string concatenation on old pandas, a TypeError on recent pandas).
    known_values = pd.to_numeric(data.loc[~is_unknown, col])
    print("Unknown % = {}".format(int(is_unknown.sum()) / len(data)))
    print("Median: {}".format(known_values.median()))
    print("Mean: {}".format(known_values.mean()))
# Working copy of the data without the 'ca', 'thal' and 'slope' columns.
temp = data.drop(['ca', 'thal', 'slope'], axis=1)
temp.head()

In [None]:
# List, column by column, the rows of `temp` whose value is the '?' placeholder.
for column_name in temp.columns:
    unknown_rows = temp[temp[column_name] == '?']
    print(column_name, '\n----------------\n', unknown_rows, '\n-----------------------\n')
# print(temp[temp != '?'])

# Preprocessing the Data

In [None]:
# Rename the label column (its header carries trailing spaces) to 'target', discard
# the 'ca', 'thal' and 'slope' columns, and remove a fixed list of rows by index label.
# NOTE(review): the original comment calls these rows "outliers"; they may instead be
# the rows holding '?' placeholders -- TODO confirm against the data.
rows_to_drop = [
    2, 31, 34, 44, 65, 72, 75, 86, 91, 97, 101, 102, 108, 124, 134, 154, 168,
    182, 226, 239, 244, 275, 278, 27, 81, 107, 131, 144, 166, 197, 199, 260, 90,
]
data = (
    data
    .rename(columns={'num       ': 'target'})
    .drop(columns=['ca', 'thal', 'slope'])
    .drop(index=rows_to_drop)
)

# for col in data.columns:
#     data.drop(index=data[data[col] == '?'], inplace=True)
# data['chol']=data['chol'].replace('?', data[data['chol'] != '?']['chol'].median())
# data=data.replace('?',None)
# data=data.replace('?',0)

data.head()

In [None]:
# One-hot encode the categorical features with pandas dummies.
# 'slope', 'thal' and 'ca' are not listed because the preprocessing cell
# has already dropped those columns from `data`.
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang']
data = pd.get_dummies(data, columns=categorical_columns)

In [None]:
# Standardize the continuous features with scikit-learn's StandardScaler and record
# the fitted mean and standard deviation of each one in `scaling_values`.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split.

standardScaler = StandardScaler()
columns_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

data[columns_to_scale] = standardScaler.fit_transform(data[columns_to_scale])
computed_scaling_values = [standardScaler.mean_, np.sqrt(standardScaler.var_)]
fitted_means, fitted_stds = computed_scaling_values
scaling_values = {
    column: {'mean': column_mean, 'std': column_std}
    for column, column_mean, column_std in zip(columns_to_scale, fitted_means, fitted_stds)
}
scaling_values

# Final Preprocessed Data

In [None]:
# Display the first rows of the frame after renaming, dropping, encoding and scaling.
data.head()

# Splitting data as Train and Test

In [None]:
# Separate the label column from the features, then hold out 20% of the rows
# for testing (80% train / 20% test) using a fixed random_state.
X = data.drop(columns=['target'])
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Plot how many samples fall into each target (heart disease) class.
# The series is passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so a bare positional series is no longer interpreted as the x variable.
plt.figure(figsize=(6,4))
sns.countplot(x=y)
plt.show()

In [None]:
# Report the (rows, columns) shape of the training and the test feature matrices.
for feature_matrix in (X_train, X_test):
    print(feature_matrix.shape)

 # MODELS

     1) SVM
     2) Random Forest
     3) Logistic Regression
     4) Multi-layer Perceptron classifier 
     5) Extra Trees

# 1) Using SVM

## Running SVM model with Various Kernels

In [None]:
from sklearn.svm import SVC

# Fit one SVC per kernel on the training split and record its test-set accuracy.
# The loop variable is named `kernel` rather than `type`: at notebook top level the
# old name rebound the built-in `type` for every cell executed afterwards.
svc_scores = []
kernel_type = ['linear', 'poly', 'rbf', 'sigmoid']
for kernel in kernel_type:
    svc_classifier = SVC(kernel=kernel)
    svc_classifier.fit(X_train, y_train)
    svc_scores.append(svc_classifier.score(X_test, y_test))

In [None]:
# Bar chart of the test accuracy per kernel; each bar is annotated with its score
# rounded to five decimals.

for position, _kernel in enumerate(kernel_type):
    score = svc_scores[position]
    plt.text(position, score, round(score, 5))
plt.xlabel('Kernels')
plt.ylabel('Scores')
plt.title('Support Vector Classifier scores for different kernels')
plt.bar(kernel_type, svc_scores)

We can see that the **rbf** kernel gives the maximum accuracy. Note that the next cell nevertheless trains the final SVM with the **linear** kernel.

In [None]:
# Train the support vector classifier that is evaluated in detail.
# The kernel used by this cell is 'linear'.

svc = SVC(kernel='linear')
svc_predicted = svc.fit(X_train, y_train).predict(X_test)
svc_acc_score = accuracy_score(y_test, svc_predicted)
svc_conf_matrix = confusion_matrix(y_test, svc_predicted)

# Report the confusion matrix, the per-class metrics and the overall accuracy.
print("confussion matrix")
print(svc_conf_matrix)
print(classification_report(y_test, svc_predicted))
print("\n")
print("Accuracy of Support Vector Classifier: {:.3f}".format(svc_acc_score*100),'%\n')

# 2) Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees. `random_state` is fixed so that the bootstrap samples
# and feature sub-sampling, and therefore the reported scores, are reproducible
# from one run of the notebook to the next.
model=RandomForestClassifier(n_estimators=500, random_state=0)
model.fit(X_train,y_train)
rfpred=model.predict(X_test)
RF_conf_matrix = confusion_matrix(y_test, rfpred)
rf_acc_score = accuracy_score(y_test, rfpred)

# Report the confusion matrix, the per-class metrics and the overall accuracy.
print("confussion matrix")
print(RF_conf_matrix)
print(classification_report(y_test, rfpred))
print("\n")
print("Accuracy of Random Forest Classifier: {:.3f}".format(rf_acc_score*100),'%\n')

# 3) Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings and evaluate it on the test split.
lr = LogisticRegression()
model = lr.fit(X_train, y_train)  # fit() returns the estimator itself
lr_predict = model.predict(X_test)
lr_acc_score = accuracy_score(y_test, lr_predict)
lr_conf_matrix = confusion_matrix(y_test, lr_predict)

# Report the confusion matrix, the per-class metrics and the overall accuracy.
print("confussion matrix")
print(lr_conf_matrix)
print("\n")
print(classification_report(y_test,lr_predict))
print("Accuracy of Logistic Regression: {:.3f}".format(lr_acc_score*100),'%\n')

# 4) Multi-layer Perceptron classifier 

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with one hidden layer of 32 units. `(32,)` is written as a
# real one-element tuple (`(32)` is just the int 32), and `random_state` is fixed so
# that the weight initialisation, and therefore the reported scores, are reproducible.
MLP = MLPClassifier(hidden_layer_sizes=(32,), learning_rate_init=0.001, max_iter=1000, random_state=0)
model = MLP.fit(X_train, y_train)
MLP_predict = MLP.predict(X_test)
MLP_conf_matrix = confusion_matrix(y_test, MLP_predict)
MLP_acc_score = accuracy_score(y_test, MLP_predict)


# Report the confusion matrix, the per-class metrics and the overall accuracy.
# The header print below had been fused into the preceding comment line and never ran.
print("confussion matrix")
print(MLP_conf_matrix)
print("\n")
print(classification_report(y_test,MLP_predict))
print("Accuracy of Multilayer Perceptron classifier: {:.3f}".format(MLP_acc_score*100),'%\n')

# 5) Extra Trees

In [None]:
from sklearn.ensemble import ExtraTreesClassifier


# Extra-trees ensemble of 100 trees with a fixed random_state, evaluated on the test split.
model = ExtraTreesClassifier(random_state=1, n_estimators=100)
etpred = model.fit(X_train, y_train).predict(X_test)
et_acc_score = accuracy_score(y_test, etpred)
ET_conf_matrix = confusion_matrix(y_test, etpred)

# Report the confusion matrix, the per-class metrics and the overall accuracy.
print("confussion matrix")
print(ET_conf_matrix)
print(classification_report(y_test, etpred))
print("\n")
print("Accuracy of Extra Trees Classifier: {:.3f}".format(et_acc_score*100),'%\n')

# Top scorers on a single train/test split (no cross-validation)

    1) SVM - 86%
    2) Logistic - 84.9%
    3) RF - 81%

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
def cv_model(classifier, splits=10):
    """Cross-validate ``classifier`` with stratified k-fold and print the accuracies.

    Reads the notebook-level feature frame ``X`` and label series ``y``. For every
    fold the same ``classifier`` object is fitted on the training part and scored on
    the held-out part; the per-fold accuracies and their mean are then printed.

    Args:
        classifier: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        splits: number of stratified folds.

    Returns:
        None. The results are only printed.
    """
    accuracy = []
    skf = StratifiedKFold(n_splits=splits)
    for train_idx, test_idx in skf.split(X, y):
        classifier.fit(X.iloc[train_idx], y.iloc[train_idx])
        fold_prediction = classifier.predict(X.iloc[test_idx])
        # Only the accuracy is reported; the confusion matrix that used to be
        # computed here on every fold was never used.
        accuracy.append(accuracy_score(y.iloc[test_idx], fold_prediction))
    print('Accuracy:\n', accuracy)
    print('Average Accuracy:', np.mean(accuracy))

In [None]:
# Cross-validate an SVC for each kernel, printing a separator after every run.
kernel_type = ['linear', 'poly', 'rbf', 'sigmoid']
for kernel_name in kernel_type:
    print("Kernel: ", kernel_name)
    cv_model(SVC(kernel=kernel_name), splits=10)
    print('\n------------------\n')

In [None]:
# 10-fold stratified cross-validation of a default logistic regression.
cv_model(LogisticRegression(), splits=10)

In [None]:
# 10-fold stratified cross-validation of an 800-tree random forest.
# `random_state` is fixed so the printed accuracies are reproducible between runs.
cv_model(RandomForestClassifier(n_estimators=800, random_state=0), 10)

In [None]:
# Stratified cross-validation (default number of folds) of the multi-layer perceptron.
# `(32,)` is a real one-element tuple (`(32)` is just the int 32), and `random_state`
# is fixed so the printed accuracies are reproducible between runs.
cv_model(MLPClassifier(hidden_layer_sizes=(32,), learning_rate_init=0.001, max_iter=10000, random_state=0))

In [None]:
# 10-fold stratified cross-validation of a 500-tree extra-trees ensemble (seeded).
cv_model(ExtraTreesClassifier(random_state=1, n_estimators=500), splits=10)

In [None]:
def cv_model_get_best_fit(classifier, splits=10):
    """Cross-validate ``classifier`` and return the split of its best-scoring fold.

    Reads the notebook-level feature frame ``X`` and label series ``y``. For every
    stratified fold the same ``classifier`` object is fitted on the training part and
    scored on the held-out part. The per-fold accuracies, their mean and the best
    fold are printed.

    NOTE(review): the returned fold is chosen by its own test accuracy, so any score
    later reported on that fold's test part is optimistically biased.

    Args:
        classifier: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        splits: number of stratified folds.

    Returns:
        list: ``[X_train, X_test, y_train, y_test]`` of the fold with the highest
        accuracy (the first such fold when several tie).
    """
    accuracy = []
    batch = []
    skf = StratifiedKFold(n_splits=splits)
    for train_idx, test_idx in skf.split(X, y):
        fold = [X.iloc[train_idx], X.iloc[test_idx], y.iloc[train_idx], y.iloc[test_idx]]
        batch.append(fold)
        fold_X_train, fold_X_test, fold_y_train, fold_y_test = fold
        classifier.fit(fold_X_train, fold_y_train)
        fold_prediction = classifier.predict(fold_X_test)
        # Only the accuracy is needed; the per-fold confusion matrix that used to be
        # computed here was never used.
        accuracy.append(accuracy_score(fold_y_test, fold_prediction))
    best_fold = np.argmax(accuracy)  # computed once, used for both report and return
    print('Accuracy:\n', accuracy)
    print('Average Accuracy:', np.mean(accuracy))
    print('Max Accuracy:{} at {} '.format(np.max(accuracy), best_fold))
    return batch[best_fold]

In [None]:
# Cross-validate a linear-kernel SVC, adopt the fold with the highest accuracy as the
# train/test split, and refit the model on that fold's training part.
final_model = SVC(kernel='linear')
best_split = cv_model_get_best_fit(final_model)
X_train, X_test, y_train, y_test = best_split

final_model.fit(X_train, y_train)

In [None]:
# Show the features and the true label of the test row at position 20.
sample_position = 20
[X_test.iloc[sample_position], '===================', y_test.iloc[sample_position]]

In [None]:
# Predict the test row at position 20 -- the row whose features and label the
# previous cell displays (this cell used to index row 19 instead).
# Selecting with a list keeps a one-row DataFrame, so no hard-coded feature count
# is needed and the column names seen during fit are preserved.
final_model.predict(X_test.iloc[[20]])

In [None]:
import joblib
# Persist the fitted final model to 'model.sav' in the current working directory.
# NOTE(review): only the model is saved here; the scaling statistics collected in
# `scaling_values` are not written out by this cell -- confirm how inference code
# is expected to scale its inputs.
joblib.dump(final_model, 'model.sav')