In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the heart-attack dataset from the Kaggle input directory and preview
# its first five rows (five is the default for head()).
heart_csv_path = '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
df = pd.read_csv(heart_csv_path)
print(df.head())


In [None]:
#explore the dataset
# A notebook cell only auto-displays its LAST expression, so the shape has to be
# printed explicitly; as a bare expression in the middle of the cell it was
# evaluated and then silently discarded.
print(df.shape)
# info() writes column names, non-null counts and dtypes to stdout itself.
df.info()
# Last expression of the cell: rendered as the cell's output table.
df.describe()

In [None]:
# Show the last ten rows of the dataset as the cell output.
df.tail(n=10)

In [None]:
# Count the missing values in each column (isna is the same method as isnull).
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# There are no missing values in the dataset.

In [None]:
# Data visualization.
#
# Count plots for the categorical columns, drawn in the order listed below.
# Each entry is (column name, plot title):
#   sex     - gender, where female = 0 and male = 1
#   exng    - exercise induced angina, where 1 = yes and 0 = no
#   cp      - chest pain type
#   restecg - resting electrocardiographic results
#   output  - the target variable
count_plots = [
    ('sex', "Sex"),
    ('exng', 'Exercise induced angina (exng)'),
    ('cp', 'Chest pain type'),
    ('restecg', 'Resting electrocardiographic results'),
    ('output', 'Having heart attack'),
]

for column, title in count_plots:
    sns.countplot(x=column, data=df)
    plt.title(title)
    plt.show()

# Histogram of age.
sns.histplot(x='age', data=df)
plt.title('Distribution of age')
plt.show()



In [None]:
from sklearn.preprocessing import StandardScaler

# Columns that get standardized before modelling.
scaled_cols=['trtbps','chol','thalachh']

#assign target variable to y and features to X
y=df['output']
X=df.drop('output',axis=1)

#scale the needed scaling variables
# The scaling is applied to X (a new frame returned by drop) instead of being
# written back into df. df therefore keeps its original values, so the tables
# and plots shown earlier stay consistent with it and the raw data is not lost.
# X ends up with exactly the same values as when df was scaled first.
# NOTE(review): this scaler is fit on all rows, i.e. before any train/test
# split. Estimators that are sensitive to feature scale should have their
# scaling fit on the training rows only.
scaler=StandardScaler()
X[scaled_cols]=scaler.fit_transform(X[scaled_cols])

# Last expression of the cell: displays the shapes of the features and target.
X.shape,y.shape




In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

#divide dataset to training and test sets (70% / 30%, fixed seed for reproducibility)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

#model the data using different models and compare

#model the data using logistic regression
# The continuous columns were standardized earlier with statistics computed
# from ALL rows, so the test rows influenced the features this model is trained
# on (data leakage). Standardization is an affine transform, so re-standardizing
# inside a pipeline that is fit on X_train gives exactly the features that
# scaling the raw values with training-only statistics would give. The pipeline
# also applies the same training-derived scaling to X_test in score/predict.
train_only_scaler=ColumnTransformer(
    [('scale',StandardScaler(),['trtbps','chol','thalachh'])],
    remainder='passthrough',  # keep every other feature unchanged
)
log_reg=make_pipeline(train_only_scaler,LogisticRegression(max_iter=200))
log_reg.fit(X_train,y_train)

# Accuracy on the held-out test set.
print('score of logistic regression model:',log_reg.score(X_test,y_test))
y_pred_log_reg=log_reg.predict(X_test)
# Row-normalized confusion matrix: each row (true class) sums to 1.
confusion_matrix(y_test,y_pred_log_reg,normalize='true')

In [None]:
#Get the best parameters to use for a classification tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# random_state is a seed, not a hyperparameter: searching over it only picks
# the seed that happened to score best on the cross-validation folds. It is
# pinned on the estimator instead, so the search is reproducible and compares
# only real modelling choices (split criterion and tree depth).
dt=DecisionTreeClassifier(random_state=21)
param_dt={'criterion':['gini','entropy'],'max_depth':[2,3,4,5,6,7,8,9,10]}

# Exhaustive search over the grid using GridSearchCV's default cross-validation.
grid_dt=GridSearchCV(estimator=dt,param_grid=param_dt)
grid_dt.fit(X_train,y_train)
print('best paramaeters to use for decition tree are:',grid_dt.best_params_)


In [None]:
#build a classification tree model
# Hyperparameters are hard-coded; presumably they were copied from the grid
# search output of an earlier run - TODO confirm they still match it.
dt1=DecisionTreeClassifier(criterion='entropy',max_depth=3,random_state=21)
dt1.fit(X_train,y_train)
y_pred_dt=dt1.predict(X_test)

# Score on the held-out TEST set. Scoring on (X_train, y_train) reports how well
# the tree memorized its training data, which is optimistic and cannot be
# compared with the logistic regression score (computed on the test set).
print('score of decision tree model:',dt1.score(X_test,y_test))
# Row-normalized confusion matrix: each row (true class) sums to 1.
print('confusion matrix of decision tree model:',confusion_matrix(y_test,y_pred_dt,normalize='true'))

# NOTE(review): compare the test-set score printed above with the logistic
# regression test score before concluding which model predicts heart attacks
# more accurately.

In [None]:
#get the best parameters for random forest model
from sklearn.ensemble import RandomForestClassifier

# A fixed seed makes the search reproducible; without it every run may report
# different "best" parameters.
rf=RandomForestClassifier(random_state=42)

# 'auto' is no longer listed for max_features: for classifiers it was just an
# alias of 'sqrt' (so it duplicated a third of the grid), it was deprecated in
# scikit-learn 1.1, and it is rejected from 1.3 on, which makes those fits fail.
param_rf={
    'n_estimators':[100,200,300,400,500],
    'criterion':['gini', 'entropy'],
    'max_depth':[2,3,4,5,6,7,8,9,10],
    'max_features':['sqrt', 'log2'],
}

# n_jobs=-1 runs the candidate fits on all available cores; the selected
# parameters are the same, the search only finishes sooner.
grid_rf=GridSearchCV(estimator=rf,param_grid=param_rf,n_jobs=-1)
grid_rf.fit(X_train,y_train)
print(grid_rf.best_params_)


In [None]:
#create random forest model with best parameters
# Hyperparameters are hard-coded; presumably they were copied from the grid
# search output of an earlier run - TODO confirm they still match it.
# max_features='sqrt' is what the former 'auto' alias meant for classifiers;
# 'auto' itself is rejected by scikit-learn >= 1.3. random_state pins the
# bootstrap and feature sampling so the reported numbers are reproducible.
rf1=RandomForestClassifier(n_estimators=300,criterion='entropy',max_depth=2,max_features='sqrt',random_state=42)
rf1.fit(X_train,y_train)
y_pred_rf=rf1.predict(X_test)

# Score on the held-out TEST set so the figure is comparable with the other
# models; a score on the training data overstates real predictive accuracy.
print('score of random forset model:',rf1.score(X_test,y_test))
# Row-normalized confusion matrix: each row (true class) sums to 1.
print('confusion matrix of random forest model:',confusion_matrix(y_test,y_pred_rf,normalize='true'))

In [None]:
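# Random forest is the best model, with an accuracy of 86.8% and the best performance according to the confusion matrix.
# NOTE(review): confirm that the 86.8% figure is a held-out test-set accuracy (not a training-set score) and that all
# models are compared on the same test set before relying on this conclusion.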