In [None]:
#Importing required libraries
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost
from xgboost import XGBClassifier
import gc
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the heart-attack CSV into `df` and display it.
DATA_PATH = "../input/heart-attack-analysis-prediction-dataset/heart.csv"
df = pd.read_csv(DATA_PATH)
df

In [None]:
# Dimensions first, then per-column dtypes and non-null counts.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.info()
# Author's note: the data is numeric.

In [None]:
# Summary statistics for each numeric column of the dataset.
df.describe()

In [None]:
# Check for missing values: a single yes/no flag first, then the count per column.
null_mask = df.isnull()
print(null_mask.values.any())
null_mask.sum()
# Author's observation: no null values are present in the data set.

**Understanding the data by visualizing**



In [None]:
# Univariate analysis: for every column draw a count plot and print its
# distinct values and its frequency table (sorted by value).
for col_name in df.columns:
    plt.figure(figsize=(10,5))
    # Pass the column explicitly as `x=`: a bare positional argument is
    # interpreted as `data` (not `x`) by newer seaborn releases.
    sns.countplot(x=df[col_name])
    # tight_layout must run after the figure exists; calling it before
    # plt.figure() only creates a stray empty figure.
    plt.tight_layout()
    print(col_name,df[col_name].unique())
    print(df[col_name].value_counts().sort_index())
    print("----------------------------------------------------------------------")

***Observations:***
* The dataset covers people from different age groups.
* The `sex` column is assumed to encode 0 as female and 1 as male: males are generally reported to have a higher chance of heart attack, so in this dataset label 1 is treated as male and label 0 as female.
* Many people have type 1 chest pain and very few have type 4, as can be observed in the visualization above.
* As per my research, normal blood pressure is considered to be up to 120/80 mm Hg, but many people here have a blood pressure above 140 mm Hg, and one person even reaches 200 mm Hg, which is too high.
* As per my research, a cholesterol level below 200 mg/dl is considered normal, between 200 and 239 mg/dl is considered borderline, and 240 mg/dl or more is considered high; in the visualization above, one person has a level as high as 564 mg/dl.
* Very few people have diabetes, as is clear from the visualization above.
* Among the people who have had an ECG, more of them show an ST-T wave abnormality.
* A normal heart rate (pulse rate) is between 50 and 100, but in the visualization above one person has a pulse rate as high as 202.
* In the visualization above, fewer people had angina (chest pain caused by reduced blood flow to the heart).
* `oldpeak` is a measure of ST depression induced by exercise relative to rest.
* `slp` is the slope of the peak exercise ST segment (as per my research).
* `thall` also relates to a heart condition whose types are presumably given here; we can observe that type 2 has the highest count of people.



In [None]:
# Let's check the pairwise correlation matrix of the columns.
df.corr()

In [None]:
# Visualise the correlation matrix as an annotated heatmap.
plt.figure(figsize=(15,5))
# `cbar` is a boolean switch for the colour bar; the colour map itself is
# selected with `cmap` (the original passed a misspelt map name to `cbar`).
sns.heatmap(df.corr(),cmap="viridis",annot=True)
# Adjust the layout only after the figure has been drawn.
plt.tight_layout()
# Author's observation: the attributes are not very highly correlated.

In [None]:
# Bivariate analysis.
# The population is split into three age bands (<45, 45-60, >60) to compare the
# heart-attack outcome counts by sex; this cell covers people younger than 45.
age_less_45=df.loc[df["age"].lt(45)]
plt.figure(figsize=(10,5))
sns.countplot(data=age_less_45,x="output",hue="sex")

In [None]:
# Outcome counts by sex for people aged 45 to 60 (both bounds inclusive).
age_bw_45_60=df.loc[df["age"].between(45,60)]
plt.figure(figsize=(10,5))
sns.countplot(data=age_bw_45_60,x="output",hue="sex")

In [None]:
# Outcome counts by sex for people older than 60.
age_grtr_60=df.loc[df["age"].gt(60)]
plt.figure(figsize=(10,5))
sns.countplot(data=age_grtr_60,x="output",hue="sex")

**Observation:**
Comparing the results for the age groups above, people between 45 and 60 years old have a higher chance of heart attack, and the majority of them are male.

In [None]:
# Split people into cholesterol bands and plot the outcome counts by sex for
# each band.
chol_1=df[df["chol"]<200]
# Upper bound written as "< 240" so the three bands are contiguous: with
# "<= 239" any value strictly between 239 and 240 would fall into no band.
chol_2=df[(df["chol"]>=200) & (df["chol"]<240)]
chol_3=df[df["chol"]>=240]
col=[chol_1,chol_2,chol_3]
band_titles=["chol < 200","200 <= chol < 240","chol >= 240"]
for band,band_title in zip(col,band_titles):
    plt.figure(figsize=(10,5))
    sns.countplot(x="output",hue="sex",data=band)
    # Label each figure so it can be read on its own.
    plt.title(band_title)
  

**Observation:**
We can observe that males with a moderate cholesterol level, i.e. between 200 and 239 mg/dl, are more prone to heart attack.
     

In [None]:
# Summary statistics extended with the 98th and 99th percentiles to check for outliers.
# Author's observation: looking good.
df.describe(percentiles=[.25,.50,.75,.98,.99])

**Preparing the model**

In [None]:
# Scale the feature columns (everything except the "output" target) with
# StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so the test rows influence the scaling statistics. Consider fitting it
# on the training split only (e.g. inside a Pipeline).
features=df.drop("output",axis=1)
scaled=StandardScaler().fit_transform(features)
# Take the column names from the frame itself instead of a hard-coded list, so
# the labels cannot drift out of sync with the data.
scaled=pd.DataFrame(scaled,columns=features.columns)
scaled.head()

In [None]:
# Divide the dataset into input (X) and output (y) variables.
X=scaled
# Keep the target as a 1-D Series: scikit-learn estimators expect y of shape
# (n_samples,), and a single-column DataFrame makes them emit a
# DataConversionWarning on every fit.
y=df["output"]

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Run a garbage-collection pass to free memory; the value returned by
# gc.collect() is shown as the cell output.
gc.collect()

In [None]:
# Report the share of each class label in the train and test targets.
# Fixes: the original printed raw fractions next to a "%" sign and labelled the
# test-split lines as "train data".
for split_name, labels in (("train", y_train), ("test", y_test)):
    # ravel() accepts both a Series and a single-column DataFrame target.
    flat_labels = np.ravel(labels)
    for label in (0, 1):
        share = (flat_labels == label).mean() * 100
        print(round(share, 2), "% of label", label, "in", split_name, "data")

# Author's observation: the labels are distributed reasonably well across train and test.

In [None]:
# Shapes of the train and test data (inputs first, then targets, per split).
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)


In [None]:
# Fit every default-configured model, then for each one report the hold-out
# accuracy, the 10-fold cross-validation scores, the confusion matrix and the
# classification report, and collect the scores in a summary table.
model=[LogisticRegression(),DecisionTreeClassifier(),GaussianNB(),SVC(),KNeighborsClassifier(),
      RandomForestClassifier(),AdaBoostClassifier(),GradientBoostingClassifier()]
score_rows=[]
for m in model:
    m.fit(X_train,y_train)
    pred=m.predict(X_test)
    # Compute the hold-out accuracy once and reuse it for printing and the table.
    acc=accuracy_score(y_test,pred)
    print("Accuracy score of {} model is".format(m),acc,"\n")
    cv=cross_val_score(estimator=m,X=X,y=y,scoring="accuracy",cv=10)
    print("Cross validation score of {} model is ".format(m),list(cv),"\n")
    print("Mean score of cross validation of {} model is ".format(m),cv.mean(),"\n")
    print("confusion matrix for{} model".format(m),"\n",confusion_matrix(y_test,pred))
    print("\n",classification_report(y_test,pred))
    print("------------------------------------------------------------------------------------")
    print("\n")
    score_rows.append({"Model":m,"Accuracy":acc,"CV_Mean_Accuracy":cv.mean()})
# DataFrame.append was removed in pandas 2.0; build the table once from the
# collected rows instead of growing it row by row.
model_score=pd.DataFrame(score_rows,columns=["Model","Accuracy","CV_Mean_Accuracy"])

In [None]:
# Summary table of hold-out accuracy and mean CV accuracy for the default models.
model_score
# Author's observation: the best model in terms of CV score is random forest and the worst is the decision tree.

In [None]:
# Run a garbage-collection pass to free memory; the value returned by
# gc.collect() is shown as the cell output.
gc.collect()

**Observations:**
* Among the models above, the Random Forest model performed very well in terms of cross-validation accuracy, and the Decision Tree classifier turned out to be the worst performer.
* Considering the Logistic Regression, Support Vector Machine, Random Forest, KNN and AdaBoost models and finding their best hyperparameters.


**Lets tune the hyperparameters and see whether accuracy score is improved**

In [None]:
# Logistic regression: 10-fold grid search over C, penalty and max_iter.
# The default "lbfgs" solver does not support the l1 penalty, so every l1
# candidate would fail to fit and be scored as NaN (silently, because warnings
# are suppressed in this notebook). "liblinear" supports both l1 and l2.
log=LogisticRegression(solver="liblinear")
param_grid={"C":[0.001,0.01,0.1,0.5,1,2,10], "penalty":['l1', 'l2'],"max_iter":[50,100,200,300] }
grid_lr=GridSearchCV(estimator=log,param_grid=param_grid,
                     scoring="accuracy",cv=10,return_train_score=True)
grid_lr.fit(X_train,y_train)


In [None]:
# Best hyperparameter combination found by the logistic-regression grid search.
print(grid_lr.best_params_)
                 

In [None]:
# Fit logistic regression with the chosen hyperparameters, then report the
# hold-out accuracy and the mean 10-fold cross-validation accuracy.
lr=LogisticRegression(C=0.5,max_iter=50,penalty='l2',random_state=100).fit(X_train,y_train)
pred=lr.predict(X_test)
acclr=accuracy_score(y_test,pred)
print(acclr,"accuracy score")
cv_lr=cross_val_score(estimator=lr,X=X,y=y,cv=10)
print(cv_lr.mean(),"mean_cv_accuracy score")
# Author's observation: the CV accuracy score has decreased somewhat.

In [None]:
# SVM model: 10-fold grid search over C, kernel and gamma.
svm=SVC()
param_grid={
    "C":[0.001,0.01,0.1,0.5,1,2,10,20],
    "kernel":['linear', 'poly', 'rbf',],
    "gamma":[1e-1,1e-2,1e-4,1,2],
}
grid_svm=GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring="accuracy",
    cv=10,
    return_train_score=True,
)
grid_svm.fit(X_train,y_train)

In [None]:
# Best hyperparameter combination found by the SVC grid search.
print(grid_svm.best_params_)

In [None]:
# Fit the SVC with the chosen hyperparameters, then report the hold-out
# accuracy and the mean 10-fold cross-validation accuracy.
svm=SVC(C=10,gamma=0.1,kernel="linear",random_state=100).fit(X_train,y_train)
pred=svm.predict(X_test)
accsvm=accuracy_score(y_test,pred)
print(accsvm,"accuracy score")
cv_svm=cross_val_score(estimator=svm,X=X,y=y,cv=10)
print(cv_svm.mean(),"mean_cv_accuracy score")
# Author's observation: the CV accuracy has increased.

In [None]:
# KNN model: 10-fold grid search over the neighbour count and the leaf size.
knn=KNeighborsClassifier()
param_grid=dict(n_neighbors=range(1,50,5),leaf_size=range(1,50,5))
grid_knn=GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring="accuracy",
    cv=10,
    return_train_score=True,
)
grid_knn.fit(X_train,y_train)

In [None]:
# Best hyperparameter combination found by the KNN grid search.
print(grid_knn.best_params_)

In [None]:
# Fit KNN with the chosen hyperparameters, then report the hold-out accuracy
# and the mean 10-fold cross-validation accuracy.
knn=KNeighborsClassifier(leaf_size=1,n_neighbors=16).fit(X_train,y_train)
pred=knn.predict(X_test)
accknn=accuracy_score(y_test,pred)
print(accknn,"accuracy score")
cv_knn=cross_val_score(estimator=knn,X=X,y=y,cv=10)
print(cv_knn.mean(),"mean_cv_accuracy score")
# Author's observation: the CV accuracy has increased.

In [None]:
# Random forest: 10-fold grid search over the number of trees and the tree depth.
# The forest is seeded so the search result is reproducible between runs
# (an unseeded forest can select different "best" parameters each time);
# 42 is the same seed the tuned forest uses in this notebook.
rf=RandomForestClassifier(random_state=42)
param_grid={"n_estimators":[500,1000,2000],"max_depth":[2,3,5]}
grid_rf=GridSearchCV(estimator=rf,cv=10,param_grid=param_grid,scoring="accuracy",
                     return_train_score=True)
grid_rf.fit(X_train,y_train)
print(grid_rf.best_params_)

In [None]:
# Random forest: second 10-fold grid search, over the minimum samples required
# to split a node and to form a leaf. Note that this overwrites `grid_rf`.
# The forest is seeded so the search result is reproducible between runs;
# 42 is the same seed the tuned forest uses in this notebook.
rf=RandomForestClassifier(random_state=42)
param_grid={"min_samples_split":[20,30,50],
           "min_samples_leaf" : [20,30,50]}
grid_rf=GridSearchCV(estimator=rf,cv=10,param_grid=param_grid,scoring="accuracy",
                     return_train_score=True)
grid_rf.fit(X_train,y_train)
print(grid_rf.best_params_)

In [None]:
# Fit the random forest with the chosen hyperparameters, then report the
# hold-out accuracy and the mean 10-fold cross-validation accuracy.
rf=RandomForestClassifier(
    n_estimators=500,
    max_depth=3,
    min_samples_split=50,
    min_samples_leaf=20,
    random_state=42,
).fit(X_train,y_train)
pred=rf.predict(X_test)
accrf=accuracy_score(y_test,pred)
print(accrf,"accuracy score")
cv_rf=cross_val_score(estimator=rf,X=X,y=y,cv=10)
print(cv_rf.mean(),"mean_cv_accuracy score")


In [None]:
# XGBoost with default settings: hold-out accuracy and mean 10-fold CV accuracy.
xgb=XGBClassifier()
xgb.fit(X_train,y_train)
pred=xgb.predict(X_test)
accxgb=accuracy_score(y_test,pred)
print(accxgb)
cv_xgb=cross_val_score(estimator=xgb,X=X,y=y,cv=10)
mean_cv_xgb=cv_xgb.mean()
print(mean_cv_xgb,"mean_cv_accuracy score")



In [None]:
# XGBoost: 10-fold grid search over the learning rate and the tree depth.
params = dict(learning_rate=[0.2,.6,.8,1], max_depth=[2,5,8])
grid_xgb=GridSearchCV(
    estimator=xgb,
    param_grid=params,
    cv=10,
    scoring="accuracy",
)
grid_xgb.fit(X_train,y_train)
print(grid_xgb.best_params_)


In [None]:
# XGBoost: 10-fold grid search over the number of trees and the row subsample ratio.
params = dict(n_estimators=[200,400,600,1000], subsample=[0.3, 0.6, 0.9])
grid_xgb=GridSearchCV(
    estimator=xgb,
    param_grid=params,
    cv=10,
    scoring="accuracy",
)
grid_xgb.fit(X_train,y_train)
print(grid_xgb.best_params_)

In [None]:
# XGBoost: combined 10-fold grid search over a narrowed range of all four
# hyperparameters, to find the best parameters.
xgb=XGBClassifier()
params = {
    'n_estimators': [100,200],
    "subsample": [0.3,0.4,],
    "learning_rate":[0.2,0.3],
    "max_depth":[2,3],
}
grid_xgb=GridSearchCV(
    estimator=xgb,
    param_grid=params,
    cv=10,
    scoring="accuracy",
)
grid_xgb.fit(X_train,y_train)
print(grid_xgb.best_params_)

In [None]:
# Fit XGBoost with the chosen hyperparameters, then report the hold-out
# accuracy and the mean 10-fold cross-validation accuracy.
xgb=XGBClassifier(
    n_estimators =100,
    subsample=0.3,
    learning_rate=0.2,
    max_depth=3,
)
xgb.fit(X_train,y_train)
pred=xgb.predict(X_test)
accxgbf=accuracy_score(y_test,pred)
print(accxgbf)
cv_xgb=cross_val_score(estimator=xgb,X=X,y=y,scoring="accuracy",cv=10)
print(cv_xgb.mean(),"mean_cv_accuracy score")

In [None]:
# AdaBoost over depth-2 decision trees: 10-fold grid search over the number of
# estimators, the learning rate and the random seed.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed; the positional form works on both old and new releases.
# NOTE(review): recent scikit-learn releases also deprecate the `algorithm`
# argument - confirm against the installed version.
abc=AdaBoostClassifier(DecisionTreeClassifier(max_depth=2))
param_grid=[{'n_estimators':[50,100,150],"learning_rate":[0.01,0.1,0.2],
            "algorithm":['SAMME'],"random_state":[40,100]}]
abc_grid=GridSearchCV(estimator=abc,param_grid=param_grid,scoring="accuracy",cv=10,return_train_score=True)
abc_grid.fit(X_train,y_train)
print(abc_grid.best_params_)

In [None]:
# Fit AdaBoost (depth-2 decision trees as base learner) with the chosen
# hyperparameters, then report the hold-out accuracy and the mean 10-fold
# cross-validation accuracy.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed; the positional form works on both old and new releases.
# NOTE(review): recent scikit-learn releases also deprecate the `algorithm`
# argument - confirm against the installed version.
abc=AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),algorithm="SAMME",
                      learning_rate=0.1,n_estimators=100,random_state=40)
abc.fit(X_train,y_train)
pred=abc.predict(X_test)
accabc=accuracy_score(y_test,pred)
print(accabc)
cv_abc=cross_val_score(estimator=abc,X=X,y=y,scoring="accuracy",cv=10)
print(cv_abc.mean(),"mean_cv_accuracy score")

In [None]:
# Model scores after hyperparameter tuning: hold-out accuracy and mean
# cross-validation accuracy per model.
# The table is built in one step from a list of records, because
# DataFrame.append was removed in pandas 2.0.
tuned_results=[
    ("LR",acclr,cv_lr),
    ("SVC",accsvm,cv_svm),
    ("knn",accknn,cv_knn),
    ("RF",accrf,cv_rf),
    ("XGBoost",accxgbf,cv_xgb),
    ("AdaBoost",accabc,cv_abc),
]
model_score=pd.DataFrame(
    [{"Model":name,"Accuracy":acc,"CV_Mean_Accuracy":cv_scores.mean()}
     for name,acc,cv_scores in tuned_results],
    columns=["Model","Accuracy","CV_Mean_Accuracy"],
)

In [None]:
# Display the score table built in the previous cell.
model_score

**Observation:**
The random forest model performed quite well among all the models, reaching around 84% cross-validation accuracy, which is pretty good.

In [None]:
# Choosing Random Forest as the final model: refit it and report the hold-out
# accuracy, confusion matrix, classification report and 10-fold CV scores.
rf=RandomForestClassifier(
    n_estimators=500,
    max_depth=3,
    min_samples_split=50,
    min_samples_leaf=20,
    random_state=42,
).fit(X_train,y_train)
pred=rf.predict(X_test)
accrf=accuracy_score(y_test,pred)
print("Accuracy score of final model is",round(accrf*100,2),"%")
print("\nConfusion Matrix")
print(confusion_matrix(y_test,pred),"\n")
print("Classifiaction Report:","\n",classification_report(y_test,pred))
cv_rf=cross_val_score(estimator=rf,X=X,y=y,cv=10)
print("Cross Validation Scores are:",cv_rf)
print("\n")
mean_cv_pct=round(cv_rf.mean()*100,2)
print("Mean_Accuracy_Score of final model is",mean_cv_pct,"%")


In [None]:
# Visualise the confusion matrix of the final model on the hold-out set.
matrix=confusion_matrix(y_test,pred)
# fmt="d" prints the cells as plain integer counts; the default ".2g" format
# switches to scientific notation once a count reaches three digits.
sns.heatmap(matrix,annot=True,fmt="d")
# confusion_matrix puts the true labels on rows and the predictions on columns.
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion matrix of Random Forest")


**Visualising the important features of the final model to predict Heart Attack**

In [None]:
# Horizontal bar chart of the random forest's feature importances, smallest
# first, with the mean cross-validation accuracy printed alongside.
print("\t\t\t\t","Important features of RandomForest Model")
plt.figure(figsize=(15,6),dpi=100)
importances=pd.Series(rf.feature_importances_,index=X.columns)
importances.sort_values(ascending=True).plot.barh(width=0.8)
print('\t\t\t\t******************************************')
mean_cv_pct=round(cv_rf.mean()*100,2)
print("\t\t\t\t    Accuracy of the Model is ",mean_cv_pct,"%")

In [None]:
import joblib

In [None]:
# Persist the random-forest model object `rf` to a pickle file with joblib.
joblib.dump(rf,"RF_Heartattack_Classification.pkl")