In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings  
warnings.filterwarnings('ignore')


In [None]:
# Load the heart-attack dataset into a DataFrame. The path is relative to the
# notebook's working directory (looks like a Kaggle input layout — confirm
# when running elsewhere).
data = pd.read_csv("../input/heart-attack-analysis-prediction-dataset/heart.csv")

**Basic EDA**

In [None]:
# List the column names available in the dataset.
data.columns

In [None]:
# Preview the first rows to sanity-check the load.
data.head()


In [None]:
# Annotated heat map of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(10, 8))
corr_matrix = data.corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap="Oranges",
    ax=ax,
)

In [None]:
# Class balance of the target. The column is passed by keyword: seaborn only
# accepts `data` positionally from 0.12 on, so a positional Series is no
# longer interpreted as the x variable.
sns.countplot(x="output", data=data)

The two target classes are reasonably well balanced.

In [None]:
# Distribution of the `sex` column. The column is passed by keyword: seaborn
# only accepts `data` positionally from 0.12 on, so a positional Series is no
# longer interpreted as the x variable.
sns.countplot(x="sex", data=data)

This shows that the dataset contains more men than women.

In [None]:
# Box plots of every column on a single shared axis, to eyeball outliers.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1)
data.boxplot(ax=ax)

There are some outliers, but I have decided to keep them because I think they may logically be related to the heart attacks.

**Model**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import  BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [None]:
# Columns that are standardised; the columns listed in `get_dummies` below
# are one-hot encoded instead.
var_col = ["age", "trtbps", "chol", "thalachh", "oldpeak"]
data1 = pd.get_dummies(
    data,
    columns=["sex", "cp", "fbs", "restecg", "exng", "slp", "thall"],
    drop_first=True,  # drop the first level of each encoded column
)

# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split, so test-set statistics influence the scaled features.
# Fitting it on the training rows only would avoid that leakage.
scale = StandardScaler()
data1[var_col] = scale.fit_transform(data1[var_col])

X = data1.drop("output", axis=1)
# Keep the target as a 1-D Series: scikit-learn estimators expect a 1-D `y`
# and warn about (and silently ravel) a single-column DataFrame.
y = data1["output"]
X.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1001,
)

In [None]:
import inspect

# scikit-learn 1.2 renamed the ensembles' `base_estimator` argument to
# `estimator`, and 1.4 removed the old name. Pick whichever keyword the
# installed version accepts so this cell runs on old and new releases alike.
est_kw = (
    "estimator"
    if "estimator" in inspect.signature(AdaBoostClassifier).parameters
    else "base_estimator"
)

# Hyper-parameters of the wrapped decision tree, shared by both searches.
tree_grid = {
    f"{est_kw}__max_depth": list(range(2, 11, 2)),
    f"{est_kw}__min_samples_leaf": [5, 10],
}
ensemble_sizes = [10, 50, 250, 1000]

# AdaBoost over decision trees; seeded (same value as the train/test split)
# so repeated runs select the same configuration.
params_ada = {
    **tree_grid,
    "n_estimators": ensemble_sizes,
    "learning_rate": [0.01, 0.1],
}
ada = AdaBoostClassifier(**{est_kw: DecisionTreeClassifier()}, random_state=1001)
adaGSCV = GridSearchCV(ada, param_grid=params_ada, verbose=3, scoring="f1", n_jobs=-1)

# Bagging over decision trees, searched over the same tree settings.
bagc = BaggingClassifier(**{est_kw: DecisionTreeClassifier()}, random_state=1001)
params_bagc = {
    **tree_grid,
    "n_estimators": ensemble_sizes,
}
bagcGSCV = GridSearchCV(bagc, param_grid=params_bagc, verbose=3, scoring="f1", n_jobs=-1)

# (label, search) pairs consumed by the fitting loop.
model_list1 = [("ada", adaGSCV), ("bagging", bagcGSCV)]

In [None]:
# Run each grid search on the training split, then report the held-out score
# and the winning hyper-parameters.
for modelname, model in model_list1:
    model.fit(X_train, y_train)
    best_para = model.best_params_
    # `score` on a fitted search evaluates the best estimator with the
    # search's configured `scoring` metric; no separate predict call is needed.
    score_m = model.score(X_test, y_test)
    print(modelname, ":", score_m)
    print(modelname, ":", best_para)

After these results, I decided to try XGBClassifier as well.

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import *
# Grid search over an XGBoost classifier.
xgb_model = XGBClassifier()
parameters_xgb = {
    "objective": ["binary:logistic", "binary:hinge"],
    "learning_rate": [0.01, 0.1],  # so called `eta` value
    "max_depth": list(range(2, 11, 2)),
    # Same ensemble sizes as the AdaBoost/bagging grids; the first entry was
    # previously 510, which looks like a typo for 10.
    "n_estimators": [10, 50, 250, 1000],
}
# Scored with F1, the metric used by the AdaBoost/bagging searches, so the
# printed held-out scores of all three models are directly comparable.
clf = GridSearchCV(
    xgb_model,
    parameters_xgb,
    n_jobs=5,
    scoring="f1",
    verbose=3,
    refit=True,  # refit the best configuration on the whole training split
)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
print(clf.best_params_)

Using XGBClassifier really helped.

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the tuned XGBoost model on the held-out split.
prediction_XGB = clf.predict(X_test)
confusion_matrix_XGB = confusion_matrix(y_test, prediction_XGB)
# Rows of the matrix are the true classes and columns the predicted ones, so
# label the axes accordingly; `fmt="d"` prints the counts as plain integers.
cm_ax = sns.heatmap(confusion_matrix_XGB, annot=True, fmt="d")
cm_ax.set(
    xlabel="Predicted label",
    ylabel="True label",
    title="XGBClassifier confusion matrix (test split)",
);

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 of the tuned model on the test split.
report_XGB = classification_report(y_test, clf.predict(X_test))
print(report_XGB)