In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import mean_absolute_error, accuracy_score, classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print them one per line in walk order.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# Location of the heart-disease CSV, relative to the notebook's working directory.
file_path_name = "../input/heart-disease-uci/heart.csv"

# Read the file into a DataFrame and preview its first five rows.
heart_data = pd.read_csv(filepath_or_buffer=file_path_name)
heart_data.head(n=5)


In [None]:
# Count the missing entries in each column; the next steps assume every count is zero.
heart_data.isna().sum(axis=0)

In [None]:
# Replace the frame's column labels with easier-to-understand names.
# Labels are assigned by position, so this list must follow the column order
# of the loaded frame.
readable_columns = [
    'Age', 'Sex', 'Chest Pain Type', 'Rest BP', 'Cholestrol', 'FBS', 'RestECG',
    'Max Heart Rate', 'Exer Angina', 'Prev Peak', 'Slope',
    'No of Major Vessels', 'Thal Rate', 'Target',
]
heart_data.columns = readable_columns

# Show dtypes and non-null counts for every column, then preview the renamed frame.
heart_data.info()
heart_data.head()

An initial peek into the data shows that there are no missing or null values. The data looks clean, so we can proceed with the next steps.


In [None]:
# Annotated heatmap of the pairwise correlations between the frame's columns.
correlations = heart_data.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(data=correlations, annot=True, square=True)
plt.show()

As we can see from the above correlation map, the below 3 features have some positive relation with heart attack (the target).
1. Chest Pain Type, 
2. Maximum Heart rate
3. Slope 

And the below features have negative relation with heart attack. (the target)
 1. Exercise Angina
 2. Previous Peak
 3. No of Major vessels
 4. Thal rate

Draw and Visualize the relation between features and the target

In [None]:
# Box plot of Age for each Target value, to see how age relates to the target.
plt.figure(figsize=(12, 8))
sns.boxplot(data=heart_data, x='Target', y='Age')
plt.show()

In [None]:
# Box plot of Chest Pain Type for each Target value.
plt.figure(figsize=(20, 8))
sns.boxplot(data=heart_data, x='Target', y='Chest Pain Type')
plt.show()

In [None]:
# Box plot of Max Heart Rate for each Target value.
plt.figure(figsize=(20, 8))
sns.boxplot(data=heart_data, x='Target', y='Max Heart Rate')
plt.show()

In [None]:
# Box plot of Slope for each Target value.
plt.figure(figsize=(20, 8))
sns.boxplot(data=heart_data, x='Target', y='Slope')
plt.show()

In [None]:
# Draw one histogram per column to inspect each column's distribution.
hist_figsize = (12, 8)
heart_data.hist(figsize=hist_figsize)
plt.show()

From the above we can see some observations.
1. People around the age of 60 have a higher risk of having a heart attack compared to other age groups.
2. People with sex = 1 have double the risk of having a heart attack compared to people with sex = 0.
3. People with chest pain type 0 have the highest risk of heart attack, followed by type 2.
4. People with a resting BP between 130 and 140 have a higher risk of having a heart attack.
5. People with cholesterol in the range 210-260 have the highest risk of having a heart attack.
6. FBS does not seem to affect heart attack risk.
7. People with a maximum heart rate in the range 150-175 have a higher risk of heart attack.
8. Exercise Angina has no effect on heart attack.
9. People with Thal rate 2 and 3 have a higher risk.
10. All other features do not have any direct impact on heart attack.

Preparation work and build Models

In [None]:
# Separate the label column from the predictors, then hold out 20% of the
# rows as a test set (random_state fixed so the split is repeatable).
y = heart_data['Target']
X = heart_data.drop(columns='Target')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

Build and test the performance of different Regression Models.


In [None]:
# Candidate models for the regression comparison. Note that LogisticRegression
# is a classifier; it is kept in this group exactly as in the comparison below.
lr = LinearRegression()
lgr = LogisticRegression(solver='liblinear')
rfr = RandomForestRegressor(n_estimators=100, random_state=0, verbose=False)
dtr = DecisionTreeRegressor(random_state=0)
xgbr = XGBRegressor()

# Fit every candidate on the training split, in the order they were created.
for estimator in (lr, lgr, rfr, dtr, xgbr):
    estimator.fit(X_train, y_train)

In [None]:
# Cross-validated mean absolute error for each candidate model (lower is better).
# cross_val_score clones and refits every estimator on X, y, so the earlier
# fits on the training split are not reused here.
models=[lr,lgr,rfr,dtr,xgbr]

# An integer cv selects StratifiedKFold for classifiers but plain, unshuffled
# KFold for regressors, so the models would be scored on different folds.
# One explicit stratified splitter keeps the comparison like-for-like; it is
# the same splitter an integer cv=10 already gives the classifiers.
cv_folds = StratifiedKFold(n_splits=10)

for model in models:
    scores = -1 * cross_val_score(model, X, y, cv=cv_folds, scoring='neg_mean_absolute_error')
    # Label each result with its estimator class so the numbers can be
    # matched to the models.
    print(type(model).__name__, "MAE Score:", scores.mean())

From the above MAE values, we can conclude that the LogisticRegression model performs better than the other regression models. 

Build and test the performance of different Classification Models

In [None]:
# Candidate classifiers, created with fixed seeds where the original set one.
knn = KNeighborsClassifier()
dtc = DecisionTreeClassifier(random_state=0)
rfc = RandomForestClassifier(random_state=0, verbose=False)
xgbc = XGBClassifier()

# Fit every classifier on the training split, in the order they were created.
for classifier in (knn, dtc, rfc, xgbc):
    classifier.fit(X_train, y_train)


In [None]:
# 10-fold cross-validated mean absolute error for each classifier (lower is
# better). For 0/1 class labels this is the share of misclassified rows.
models=[knn,dtc,rfc,xgbc]
for model in models:
    scores = -1 * cross_val_score(model,X,y,cv=10, scoring='neg_mean_absolute_error')
    # Label each result with its estimator class so the numbers can be
    # matched to the models. The unlabeled per-fold dump that used to follow
    # the loop only ever showed the last model's folds, so it is dropped.
    print(type(model).__name__, "MAE Score is:", scores.mean())

Among the above classifier and regressor models, **RandomForestClassifier** and **LogisticRegression** appear to have the lowest MAE. Let us look at their other performance metrics, such as Precision, Recall and F1-Score.

In [None]:
# Evaluate the random forest two ways: 10-fold cross-validated MAE on the
# full data, and hold-out metrics computed from predictions on the test split.
scores = -1 * cross_val_score(rfc, X, y, cv=10, scoring='neg_mean_absolute_error')
rfc_pred = rfc.predict(X_test)

rfc_accuracy = accuracy_score(y_test, rfc_pred)
rfc_report = classification_report(y_test, rfc_pred)
rfc_confusion = confusion_matrix(y_test, rfc_pred)

print("MAE is:", scores.mean())
print("Accuracy score is: ", rfc_accuracy)
print("Classification Report:\n", rfc_report)
print("Confusion Matrix:\n", rfc_confusion)

The **RandomForestClassifier** model has a low MAE and a fairly good accuracy of 88%. It has good scores for Precision, Recall and F1 Score as well. The confusion matrix values also show good prediction results for the model.

In [None]:
#fit, train and predict using logistic regression model and check accuracy
# A dedicated name is used so the LinearRegression held in `lr` is not
# overwritten; otherwise re-running the regression comparison cell would
# score LogisticRegression twice and drop LinearRegression.
# solver='liblinear' matches the logistic model evaluated in that comparison.
log_reg = LogisticRegression(solver='liblinear').fit(X_train,y_train)

y_pred = log_reg.predict(X_test)

#calculate accuracy of the fitted model
acc_score = accuracy_score(y_test,y_pred)

#check the mean absolute error, using 10 folds like every other model in
#this notebook so the MAE values are comparable
score = -1* cross_val_score(log_reg,X,y,cv=10, scoring='neg_mean_absolute_error')

print("Mean Absolute error is:", score.mean())
print("The accuracy score of Logistric Regression Classifier is:", acc_score)

#lets look at precision, recall, f1_score to check models performance
print("Classification Report:\n",classification_report(y_test,y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test,y_pred))

The **Logistic Regression** model has an accuracy of about 85%. Looking at precision, recall and F1 score gives more insight into how well the model is performing. A precision of 0.85/0.86, a recall of 0.81/0.88 and an F1 score of 0.83/0.87 show that the model's performance is fairly good. The confusion matrix also supports the good performance of this model.

From the above performance measures, we can see that **RandomForestClassifier** performs better than the **LogisticRegression** model. We can conclude that the **RandomForestClassifier** model seems to be the best model for our heart attack data prediction.

My humble request to all the experts/participants  in this group- please give me your feedback and suggestions so that I can improve my work and learn from you all. 
Please upvote my work if you like it. 
Thank you!!!