In [None]:
import pandas as pd
import numpy as np
# Read the heart-disease CSV from the relative input directory into `df`.
HEART_CSV = "../input/heart-disease-uci/heart.csv"
df = pd.read_csv(HEART_CSV)

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Summary statistics with one row per column, decorated for quick scanning:
# bars for the mean, a red gradient for the std and a diverging map for the median.
summary_stats = df.describe().T
(
    summary_stats.style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
%matplotlib inline

# Annotated heatmap of the pairwise column correlations.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(df.corr(), annot=True, ax=ax)

In [None]:
# Histogram grid over the columns of `df`, drawn on one large canvas.
HIST_FIGSIZE = (20, 16)
df.hist(figsize=HIST_FIGSIZE)
plt.show()

In [None]:
# Show the loaded DataFrame as the cell output.
df

In [None]:
# Separate the label column from the predictor columns.
TARGET_COL = 'target'
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable.
split = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_test, y_train, y_test = split

# The scaler is only instantiated here; fitting happens in a later cell.
sc = StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information reaches the scaler.
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
#imports
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
!pip install catboost
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

In [None]:
# Candidate classifiers to compare, stored as [display name, estimator] pairs.
# AdaBoost, ExtraTrees, GradientBoosting and SGD are seeded explicitly
# (random_state=0, matching the other seeded entries) so that repeated runs of
# the notebook produce the same comparison table.
models = [
    ['RidgeClassifier', RidgeClassifier()],
    ['XGBClassifier', XGBClassifier(use_label_encoder=False, objective='binary:logistic', random_state=0, eval_metric='logloss')],
    ['Logistic Regression', LogisticRegression(random_state=0)],
    ['SVM', SVC(random_state=0)],
    ['KNeighbors', KNeighborsClassifier()],
    ['GaussianNB', GaussianNB()],
    ['BernoulliNB', BernoulliNB()],
    ['DecisionTree', DecisionTreeClassifier(random_state=0)],
    ['AdaBoostClassifier', AdaBoostClassifier(random_state=0)],
    ['MLPClassifier', MLPClassifier(random_state=42, max_iter=1000)],
    ['ExtraTreesClassifier', ExtraTreesClassifier(random_state=0)],
    ['CatBoostClassifier', CatBoostClassifier(eval_metric='AUC', verbose=0)],
    ['GradientBoostingClassifier', GradientBoostingClassifier(random_state=0)],
    ['SGDClassifier', SGDClassifier(random_state=0)],
]

In [None]:
def _positive_class_scores(estimator, features):
    """Return continuous scores for the positive class of a fitted binary classifier.

    Uses the second column of ``predict_proba`` when the estimator exposes it,
    otherwise falls back to ``decision_function``. Either output ranks the
    samples, which is what ROC AUC needs.
    NOTE(review): assumes a binary target whose positive label sorts last in
    ``classes_`` (e.g. 0/1) - confirm against the dataset.
    """
    if hasattr(estimator, 'predict_proba'):
        return estimator.predict_proba(features)[:, 1]
    return estimator.decision_function(features)


# Fit every candidate on the training split, report hold-out and 10-fold
# metrics, and collect one [name, accuracy %, k-fold accuracy %, ROC AUC] row
# per model in lst_1.
lst_1 = []
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    test_accuracy = accuracy_score(y_test, y_pred)

    # k-fold validation on the training split (cross_val_score refits clones of the model)
    accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)

    # ROC AUC must be computed from continuous scores; hard 0/1 predictions
    # reduce the curve to a single operating point and understate the AUC.
    roc = roc_auc_score(y_test, _positive_class_scores(model, X_test))

    print(name, ':')
    print(cm)
    print('Accuracy Score: ', test_accuracy)
    print(' ')
    print('K-Fold Validation Mean Accuracy: {:.2f} %'.format(accuracies.mean()*100))
    print(' ')
    print('ROC AUC Score: {:.2f}'.format(roc))
    print('-'*40)
    print(' ')

    lst_1.append([name, test_accuracy*100, accuracies.mean()*100, roc])

In [None]:
# Collect the per-model metrics into a table ordered from highest to lowest ROC AUC.
metric_columns = ['Model', 'Accuracy', 'K-Fold Mean Accuracy', 'ROC_AUC']
df2 = pd.DataFrame(lst_1, columns=metric_columns).sort_values(by=['ROC_AUC'], ascending=False)

In [None]:
# Show the model comparison table as the cell output.
df2

In [None]:
# Horizontal bar chart of ROC AUC per model, in the row order of df2.
fig = plt.figure(figsize=(12, 12))
sns.barplot(x='ROC_AUC', y='Model', data=df2, color='r')
# Trailing semicolon suppresses the Text(...) repr in the cell output.
plt.title('Model Comparison');

In [None]:
# Estimators paired with the hyper-parameter grids that GridSearchCV explores.
# Each entry is (estimator, [param_grid_dict]).
grid_models = [
    (
        KNeighborsClassifier(),
        [{'n_neighbors': np.arange(1, 100),
          'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']}],
    ),
    (
        DecisionTreeClassifier(),
        [{'criterion': ['gini', 'entropy'],
          'max_depth': np.arange(1, 50),
          'min_samples_leaf': [1, 2, 3]}],
    ),
    (
        RandomForestClassifier(),
        [{'n_estimators': [100, 150, 200],
          'criterion': ['gini', 'entropy'],
          'min_samples_leaf': [2, 10, 30]}],
    ),
    (
        MLPClassifier(),
        [{'solver': ['lbfgs', 'sgd', 'adam'],
          'learning_rate': ['constant', 'invscaling', 'adaptive']}],
    ),
    (
        RidgeClassifier(),
        [{'alpha': [0.1, 0.5, 1],
          'solver': ['auto', 'svd', 'cholesky']}],
    ),
    (
        GaussianNB(),
        [{'var_smoothing': np.logspace(0, -9, num=100)}],
    ),
    (
        XGBClassifier(use_label_encoder=False),
        # The key must be spelled 'learning_rate'; a misspelled key is not a
        # real booster parameter, so the search would not vary the step size.
        [{'learning_rate': [0.01, 0.05, 0.1],
          'eval_metric': ['error', 'logloss']}],
    ),
]

In [None]:
# Run a 5-fold, ROC-AUC-scored grid search for every (estimator, grid) pair and
# report both the best cross-validated score and the parameters that achieved it.
for estimator, param_grid in grid_models:
    grid = GridSearchCV(estimator=estimator, param_grid=param_grid, scoring='roc_auc', cv=5)
    grid.fit(X_train, y_train)
    best_score = grid.best_score_
    best_param = grid.best_params_
    print(' {}: \n Best score: {:.1f} %'.format(estimator, best_score*100))
    # Without the winning parameters the score alone cannot be reproduced.
    print(' Best parameters: {}'.format(best_param))
    print('')
    print('-'*25)
    print(' ')