In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in walk order.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the heart-attack CSV inside the Kaggle input bundle.
HEART_CSV = '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'

# Load the whole file into a single DataFrame used by every later cell.
df = pd.read_csv(HEART_CSV)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Preview the last rows of the loaded frame.
df.tail()

In [None]:
# Print column names, non-null counts and dtypes to spot missing values.
df.info()

**Dividing the columns into numeric and categorical (class) features**

In [None]:
# Column names the notebook treats as numeric measurements.
num_cols = [
    'age',
    'trtbps',
    'chol',
    'thalachh',
    'oldpeak',
]

# Column names the notebook treats as class (categorical) features.
class_cols = [
    'sex',
    'cp',
    'fbs',
    'restecg',
    'exng',
    'slp',
    'caa',
    'thall',
]

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()

plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')

**Checking the Distribution of Data**

In [None]:
# One histogram per column, on a canvas large enough to keep the panels legible.
HIST_FIGSIZE = (20, 20)

df.hist(figsize=HIST_FIGSIZE)
plt.show()

# Creating box plots to check for outliers in highly correlated features

In [None]:
# Box plot of each selected feature against the target column `output`.
# The panels differ only in the plotted column and the title, so they are
# generated from one table instead of eight copy-pasted stanzas.
boxplot_panels = [
    ('cp', 'Chest Pain vs Output'),
    ('thalachh', 'Maximum Heart Rate vs Output'),
    ('oldpeak', 'Old Peak vs Output'),
    ('exng', 'exercise induced angina vs Output'),
    ('sex', 'Sex vs Output'),
    ('age', 'Age vs Output'),
    ('trtbps', 'Blood Pressure vs Output'),
    ('slp', 'Slope vs Output'),
]

plt.figure(figsize=(20, 20))

# Panels fill a 4x2 grid row by row; subplot positions are 1-based.
for position, (column, title) in enumerate(boxplot_panels, start=1):
    plt.subplot(4, 2, position)
    sns.boxplot(x=df['output'], y=df[column])
    plt.title(title)

# Keep titles and tick labels of neighbouring panels from overlapping, and
# render the figure explicitly so the cell does not echo a Text(...) repr.
plt.tight_layout()
plt.show()



* Heart attack is strongly positively correlated with chest pain type (cp)
* Heart attack is positively correlated with maximum heart rate achieved (thalachh)
* Heart attack is negatively correlated with oldpeak and exng
* People aged 50-70 are more prone to heart attack
* People with chest pain type 0 have the highest risk of heart attack
* People with a cholesterol level of 200-300 have a very high risk
* People with a maximum heart rate of 150-175 have a high risk
* People with thall values 2 and 3 have a high risk

# Creating Models

In [None]:
# The target is the 'output' column; every remaining column is a feature.
y = df['output']
X = df.drop(columns='output')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [None]:
# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
    stratify=y,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both splits.
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [None]:
# Candidate models keyed by display name.
# Only classifiers are listed: the models are compared with accuracy_score on
# the categorical `output` target, which is not a meaningful metric for a
# regressor's continuous predictions.
# The tree-based models draw random numbers while fitting, so they get a fixed
# seed to make the comparison repeatable across runs.
models = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=100),
    'RandomForestClassifier': RandomForestClassifier(random_state=100),
    'KNeighborsClassifier': KNeighborsClassifier(),
}

In [None]:
# Display the name -> estimator mapping that the comparison loops iterate over.
models

In [None]:
from sklearn.pipeline import make_pipeline

# 5-fold cross-validated mean absolute error for every candidate model.
# Each model is wrapped in a pipeline with its own StandardScaler so that it
# is evaluated on standardized features, like the hold-out comparison, and so
# that the scaler is re-fitted on the training part of every fold.
# NOTE(review): assuming `output` is coded 0/1, the mean absolute error of a
# classifier's predictions equals its misclassification rate.
scores = []
for name, model in models.items():
    scaled_model = make_pipeline(StandardScaler(), model)
    # The scorer returns negated errors (higher is better); flip the sign back.
    score = -cross_val_score(scaled_model, X, y, cv=5, scoring='neg_mean_absolute_error')
    scores.append(score)
    print(name, score.mean())
    

In [None]:
# Fit every candidate on the training split and record its hold-out accuracy.
accuracy_scores = []
for model_name, estimator in models.items():
    estimator.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, estimator.predict(X_test))
    accuracy_scores.append(accuracy)
    print(model_name, accuracy)

**Logistic Regression and RandomForestClassifier are the best-performing models**

In [None]:
# Logistic regression: fit on the training split, then report the per-class
# metrics and the confusion matrix on the hold-out split.
lr = LogisticRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
for report in (classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred)):
    print(report)

In [None]:
# Random forest with a fixed seed: fit on the training split (fit returns the
# estimator itself), then report hold-out metrics.
rfc = RandomForestClassifier(random_state=50).fit(X_train, y_train)

y_pred = rfc.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

**IT IS CLEAR THAT RANDOMFORESTCLASSIFIER IS THE BEST MODEL FOR THIS PROBLEM**

# Now we do some Hyperparameter Tuning

In [None]:
# Search space for the random forest.
# `max_features` is expressed relative to the number of features so that every
# candidate is valid and distinct whatever the column count:
#   'sqrt' -> square root of the feature count
#   0.5    -> half of the features
#   None   -> all features
# NOTE(review): the frame appears to have 13 feature columns; absolute counts
# larger than that cannot act as distinct settings (depending on the sklearn
# version they are rejected at fit time or behave like "all features").
params = {
    'max_depth': list(range(2, 10)),
    'n_estimators': [10, 100, 200],
    'max_features': ['sqrt', 0.5, None],
}

# Exhaustive 5-fold search over the grid, starting from the seeded forest.
grid = GridSearchCV(rfc, param_grid=params, cv=5)
grid.fit(X_train, y_train)


In [None]:
# Show the parameter combination that the grid search selected.
grid.best_params_

In [None]:
# The search was created without a `refit` argument, so GridSearchCV's default
# (refit=True) has already retrained the best parameter combination on all of
# X_train; the tuned model can be used directly without fitting it again.
best_rfc = grid.best_estimator_
y_predcv = best_rfc.predict(X_test)

# Hold-out metrics of the tuned random forest.
print(classification_report(y_test, y_predcv))
print(confusion_matrix(y_test, y_predcv))


# Hence, we get an accuracy of 87% on the test set with RandomForestClassifier