In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import os

In [None]:
# Read the CSV through its full path instead of os.chdir()-ing into the
# dataset folder: chdir changes the working directory of the whole kernel,
# which silently affects every relative path used by later cells.
DATA_PATH = os.path.join(
    '/kaggle/input/heart-attack-analysis-prediction-dataset/', 'heart.csv'
)
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Keep the first occurrence of every row; the original index labels are retained.
df = df.drop_duplicates()

In [None]:
# Re-check after de-duplication; this should now be 0.
df.duplicated().sum()

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().T

In [None]:
import matplotlib.pyplot as plt

# Horizontal box plot of every column on one large canvas, to spot outliers.
fig, ax = plt.subplots(figsize=(15, 15))
df.boxplot(vert=False, ax=ax)

In [None]:
# How often each distinct value of the `fbs` column occurs, least frequent first.
fbs_counts = df["fbs"].value_counts()
print(fbs_counts.sort_values())

In [None]:
# Pairwise Pearson correlation between all columns.
df.corr(method = 'pearson')

In [None]:
def remove_outlier(col):
    """Return the 1.5 * IQR outlier fences for ``col``.

    Despite its name, this function removes nothing: it only computes the
    lower and upper bounds beyond which a value counts as an outlier. The
    caller decides what to do with values outside those bounds.

    Parameters
    ----------
    col : array-like of numbers
        Values to analyse (for example a pandas Series or a list).

    Returns
    -------
    tuple
        ``(lower_range, upper_range)`` where
        ``lower_range = Q1 - 1.5 * IQR`` and ``upper_range = Q3 + 1.5 * IQR``.
    """
    # np.percentile orders the data internally, so no explicit sort is needed.
    # (The earlier ``sorted(col)`` call threw its result away and only cost time.)
    Q1, Q3 = np.percentile(col, [25, 75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    return lower_range, upper_range

In [None]:

# Cap outliers in `chol` and `trtbps`: values above the upper fence are set to
# the upper fence and values below the lower fence to the lower fence.
# The fences are computed once per column, before any value is changed.
for col in ['chol', 'trtbps']:
    lr, ur = remove_outlier(df[col])
    # Cast to float first so the column dtype matches the fences, which are
    # generally fractional; clip() then applies both bounds in one pass.
    df[col] = df[col].astype(float).clip(lower=lr, upper=ur)

In [None]:
import matplotlib.pyplot as plt

# Same horizontal box plot as before, redrawn after `chol` and `trtbps` were capped.
fig, ax = plt.subplots(figsize=(15, 15))
df.boxplot(vert=False, ax=ax)

In [None]:
# One distribution plot (histogram with a KDE overlay) per listed column,
# each drawn on its own figure.
# sns.distplot is deprecated; sns.histplot is its documented replacement, and
# stat="density" keeps the normalised y-axis that distplot produced.
dist_columns = [
    'age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg',
    'thalachh', 'exng', 'oldpeak', 'slp', 'caa', 'thall',
]
for col in dist_columns:
    sns.histplot(df[col], kde=True, stat="density")
    plt.show()


In [None]:
# Scatter plot for every pair of columns; the diagonal shows a KDE of each column.
sns.pairplot(df,diag_kind = 'kde')

In [None]:
# Pearson correlation matrix again, now that `chol` and `trtbps` have been capped.
df.corr(method = 'pearson')

In [None]:
# How often each distinct value of the `fbs` column occurs, least frequent first.
# NOTE(review): an identical cell appears earlier and `fbs` is not modified in
# between, so this prints the same counts - consider deleting one of the two.
fbs_counts = df["fbs"].value_counts()
print(fbs_counts.sort_values())

In [None]:
# Annotated heat map of the correlation matrix, with two-decimal cell labels.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    linewidths=0.05,
    cmap="magma",
    ax=ax,
)
plt.show()

In [None]:
# Share of rows that fall into each value of the `output` column.
df["output"].value_counts(normalize=True)

In [None]:
# Separate the feature matrix from the target vector.
# The target is read with df["output"] rather than df.pop("output"): pop()
# deletes the column from df, so re-running this cell (or any earlier cell
# that reads df.output) would raise a KeyError.
X = df.drop("output", axis=1)

y = df["output"]

X.head()

In [None]:
# First few target values, to confirm y lines up with the X rows shown above.
y.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report , confusion_matrix,accuracy_score

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split identical on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, train_labels, test_labels = train_test_split(
    X,
    y,
    test_size=.30,
    random_state=1,
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values to try. GridSearchCV evaluates every combination
# (3 * 3 * 2 * 2 * 2 = 72 candidates), each with 3-fold cross-validation.
param_grid = {
    'max_depth': [4,5,6],
    'max_features': [4,5,6],
    'min_samples_leaf': [5,7],
    'min_samples_split': [80,90],
    'n_estimators': [100,150]
}

# Seed the forest so that the selected parameters, scores and feature
# importances are the same on every run; an unseeded forest builds different
# trees each time, which makes the reported metrics drift between runs.
rfcl = RandomForestClassifier(random_state=1)

grid_search = GridSearchCV(estimator = rfcl, param_grid = param_grid, cv = 3)

In [None]:
# Run the search on the training split only; the test split stays unseen.
grid_search.fit(X_train, train_labels)

In [None]:
# Parameter combination with the best mean cross-validation score.
grid_search.best_params_

In [None]:
# Estimator built with the best parameters found by the search.
best_grid = grid_search.best_estimator_

In [None]:
# Importance score of each feature in the selected forest, highest first.
importances = pd.DataFrame(
    best_grid.feature_importances_,
    index=X_train.columns,
    columns=["Imp"],
)
print(importances.sort_values('Imp', ascending=False))

In [None]:
# Class predictions for both splits; the reports and confusion matrices in
# the following cells reuse these.
ytrain_predict = best_grid.predict(X_train)
ytest_predict = best_grid.predict(X_test)

In [None]:
# Precision / recall / F1 per class on the training split.
print(classification_report(train_labels,ytrain_predict))

In [None]:
# Convert to a percentage before rounding: round(acc, 2) * 100 can print
# float noise such as 56.99999999999999 instead of 57.0.
train_accuracy = accuracy_score(train_labels, ytrain_predict)
print('Accuracy Score is', round(train_accuracy * 100, 2), '%')

In [None]:
# Confusion matrix for the training split (rows: actual class, columns: predicted class).
confusion_matrix(train_labels,ytrain_predict)

In [None]:
# Heat-map view of the training confusion matrix.
# The matrix is computed once and reused; the bare confusion_matrix(...) call
# that used to open this cell was discarded, because a notebook only displays
# the value of a cell's last expression.
train_cm = confusion_matrix(train_labels, ytrain_predict)
sns.heatmap(train_cm, annot=True, fmt='d', cbar=False, cmap='magma')
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.title('Confusion Matrix')
plt.show()

In [None]:
from sklearn.metrics import roc_curve,roc_auc_score

# Scores from the second column of predict_proba (the class at
# best_grid.classes_[1]), computed once and shared by the curve and the AUC.
train_proba = best_grid.predict_proba(X_train)[:, 1]
rf_fpr, rf_tpr, _ = roc_curve(train_labels, train_proba)

plt.figure(figsize=(12,7))
plt.plot(rf_fpr, rf_tpr, marker='x', label='Random Forest')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot(np.arange(0,1.1,0.1), np.arange(0,1.1,0.1))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
# Without legend() the label= passed to plot() above is never drawn.
plt.legend()
plt.show()
print('Area under Curve is', roc_auc_score(train_labels, train_proba))

In [None]:
# Precision / recall / F1 per class on the held-out test split.
print(classification_report(test_labels,ytest_predict))

In [None]:
# Convert to a percentage before rounding: round(acc, 2) * 100 can print
# float noise such as 56.99999999999999 instead of 57.0.
test_accuracy = accuracy_score(test_labels, ytest_predict)
print('Accuracy Score is', round(test_accuracy * 100, 2), '%')

In [None]:
# Heat-map view of the test confusion matrix.
# The matrix is computed once and reused; the bare confusion_matrix(...) call
# that used to open this cell was discarded, because a notebook only displays
# the value of a cell's last expression.
test_cm = confusion_matrix(test_labels, ytest_predict)
sns.heatmap(test_cm, annot=True, fmt='d', cbar=False, cmap='magma')
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# roc_curve and roc_auc_score are imported by the training-ROC cell above.
# Scores from the second column of predict_proba (the class at
# best_grid.classes_[1]), computed once and shared by the curve and the AUC.
test_proba = best_grid.predict_proba(X_test)[:, 1]
rf_fpr, rf_tpr, _ = roc_curve(test_labels, test_proba)

plt.figure(figsize=(12,7))
plt.plot(rf_fpr, rf_tpr, marker='x', label='Random Forest')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot(np.arange(0,1.1,0.1), np.arange(0,1.1,0.1))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
# Without legend() the label= passed to plot() above is never drawn.
plt.legend()
plt.show()
print('Area under Curve is', roc_auc_score(test_labels, test_proba))