In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the heart-disease CSV from the input mount and report its (rows, columns) shape.
heart_csv_path = '../input/heart-disease-uci/heart.csv'
df = pd.read_csv(heart_csv_path)
print(df.shape)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Display names for the binary target, keyed by the encoded label (0, then 1).
classes = dict(enumerate(("Not Diseased", "Diseased")))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Class balance of the target column, shown with readable class names.
fig, ax = plt.subplots(1, figsize=(7, 7))
# Pass the series as `x=`: seaborn >= 0.12 only accepts `data` positionally, so a
# bare positional series is no longer interpreted as the x variable.
ax = sns.countplot(x=df['target'].map(classes), ax=ax)
_ = ax.set(yticks=list(range(0, 200, 15)))

In [None]:
# Class balance of the target, split by sex.
fig, ax = plt.subplots(1, figsize=(7, 7))
targets = df["target"].map(classes)
genders = df["sex"].map({0: "female", 1: "male"})
# `x=` must be a keyword: seaborn >= 0.12 treats the first positional argument
# as `data`, not as the x variable.
ax = sns.countplot(x=targets, hue=genders, ax=ax)
_ = ax.set(yticks=list(range(0, 150, 10)))

In [None]:
# Build the "magma" colour palette and render its swatches for reference.
palette = sns.color_palette("magma")
sns.palplot(palette)

In [None]:
# One distribution panel (histogram + KDE) per column, skipping the last
# column of the frame (presumably `target` -- confirm against the CSV layout).
features = df.columns[:-1]
fig, ax = plt.subplots(len(features), figsize=(15, 60))
fig.tight_layout(pad=5)
for i, feature in enumerate(features):
    # histplot replaces the deprecated distplot; `bw_method` is the current
    # name of the KDE bandwidth option that used to be passed as `bw`.
    sns.histplot(
        df[feature],
        kde=True,
        stat="density",
        kde_kws={'bw_method': .1},
        color=palette[0],
        ax=ax[i],
    )

In [None]:
# Separate the label vector from the feature matrix (every column but `target`).
y = df["target"]
X = df.drop(columns=["target"])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; stratify on the target so both parts
# keep the same class ratio, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=df["target"],
)

In [None]:
# Class counts in the training split.
y_train.value_counts()

In [None]:
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score

# F1 scorer object, usable as a `scoring=` argument.
f1_scorer = make_scorer(f1_score, greater_is_better=True)

# 10-fold stratified splitter used by auc_score.
cv = StratifiedKFold(n_splits=10)


def auc_score(model, X=None, y=None):
    """Return the mean cross-validated ROC AUC of ``model``.

    Parameters
    ----------
    model : estimator
        scikit-learn estimator or pipeline to evaluate.
    X, y : array-like, optional
        Data to evaluate on. When omitted, the notebook-level ``X_train`` and
        ``y_train`` are used, which is the original behaviour.

    Returns
    -------
    float
        Mean of the per-fold ``roc_auc`` scores over the module-level ``cv``.
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    fold_scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    return fold_scores.mean()

In [None]:
# List the scorer names accepted by `scoring=`.
# `SCORERS` was removed in scikit-learn 1.3, so prefer `get_scorer_names` and
# only fall back to the old dict on releases that do not provide it.
try:
    from sklearn.metrics import get_scorer_names
    scorer_names = get_scorer_names()
except ImportError:
    from sklearn.metrics import SCORERS
    scorer_names = sorted(SCORERS.keys())
scorer_names

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,ExtraTreesClassifier
import xgboost as xgb

scores = []
# Candidate classifier classes, keyed by the name used in the printed report.
models = {
    'logistic_regression': LogisticRegressionCV,
    'decision_tree': DecisionTreeClassifier,
    'random_forest': RandomForestClassifier,
    'gbm_classifier': GradientBoostingClassifier,
    'ext_classifier': ExtraTreesClassifier,
    'xgb_classifier': xgb.XGBClassifier
}

# Cross-validated ROC AUC of each candidate behind a StandardScaler.
# Every model gets a fixed random_state so the ranking is repeatable between
# runs instead of depending on each estimator's internal randomness.
for model_name, model in models.items():
    model_pipeline = make_pipeline(StandardScaler(), model(random_state=42))
    print(f"{model_name}  :{auc_score(model_pipeline)}")

In [None]:
# Continue with the ExtraTrees classifier: a baseline pipeline of scaled
# inputs followed by a default-parameter model with a fixed seed.
base_ext = make_pipeline(
    StandardScaler(),
    ExtraTreesClassifier(random_state=42),
)

In [None]:
# Mean cross-validated ROC AUC of the baseline pipeline on the training split.
auc_score(base_ext)

In [None]:
# Fit the baseline pipeline on the training split.
base_ext.fit(X_train,y_train)

In [None]:
# Hard class predictions of the baseline pipeline on the held-out split.
y_pred = base_ext.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, auc

try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix was removed in scikit-learn 1.2. Provide a thin
    # stand-in with the same call shape so cells that call it keep working.
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Draw the confusion matrix of ``estimator`` evaluated on ``(X, y_true)``."""
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

# Per-class precision / recall / F1 of the current predictions on the test split.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the current predictions against the test labels.
confusion_matrix(y_test, y_pred)

In [None]:
# Accuracy on the held-out split versus on the data the pipeline was fitted on.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"This is testing score : {test_accuracy}")
train_accuracy = accuracy_score(y_train, base_ext.predict(X_train))
print(f"This is training score : {train_accuracy}")

# A training score well above the testing score indicates that the model overfits.

In [None]:
# lets tune the model by using optuna 


import optuna
from optuna import Trial, visualization

from optuna.samplers import TPESampler

In [None]:


def objective(trial):
    """Optuna objective: score one ExtraTrees hyper-parameter configuration.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Value of ``auc_score`` for a StandardScaler + ExtraTreesClassifier
        pipeline built with the sampled parameters.
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 40, 500),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        # 'auto' is left out: for classifiers it was only an alias of 'sqrt'
        # (a duplicate search option) and scikit-learn 1.3 rejects it.
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
    }

    model = make_pipeline(StandardScaler(), ExtraTreesClassifier(**param, random_state=42))
    return auc_score(model)

In [None]:
# Run the hyper-parameter search, maximising the objective over 300 trials.
# The TPE sampler is seeded so repeated runs propose the same sequence of
# trials and the tuned parameters are reproducible.
study1 = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study1.optimize(objective, n_trials=300, show_progress_bar=True)

In [None]:
# Hyper-parameters of the best trial found by the study.
study1.best_params

In [None]:
# Objective value per trial over the course of the search.
optuna.visualization.plot_optimization_history(study1)


In [None]:
# Objective value against each individual hyper-parameter.
optuna.visualization.plot_slice(study1)

In [None]:
# ExtraTrees classifier rebuilt from the best trial's parameters, same seed as in tuning.
# NOTE(review): the objective scored a StandardScaler + ExtraTrees pipeline, while this
# model is used without the scaler -- confirm that dropping the scaler is intended.
model_tuned = ExtraTreesClassifier(**study1.best_params,random_state = 42)

In [None]:
# Mean cross-validated ROC AUC of the tuned model on the training split.
auc_score(model_tuned)

In [None]:
# Fit the tuned model on the training split.
model_tuned.fit(X_train,y_train)

In [None]:
# Replace y_pred with the tuned model's hard predictions on the held-out split.
y_pred = model_tuned.predict(X_test)

In [None]:
# Accuracy of the tuned model on the held-out split versus on its training data.
tuned_test_accuracy = accuracy_score(y_test, y_pred)
print(f"This is testing score : {tuned_test_accuracy}")
tuned_train_accuracy = accuracy_score(y_train, model_tuned.predict(X_train))
print(f"This is training score : {tuned_train_accuracy}")


In [None]:
# Per-class precision / recall / F1 of the current predictions on the test split.
print(classification_report(y_test,y_pred))

In [None]:
# Confusion matrix of the current predictions against the test labels.
print(confusion_matrix(y_test,y_pred))

In [None]:
# Confusion matrix of the tuned model on the test split, drawn with the
# readable class names instead of raw 0/1 ticks. Labels are listed in the
# sorted class order (0, then 1) that the plot uses.
plot_confusion_matrix(model_tuned, X_test, y_test, display_labels=[classes[0], classes[1]])

In [None]:
# Second column of predict_proba: the tuned model's score for the second class, per test row.
positive_proba  = model_tuned.predict_proba(X_test)[:,1]

In [None]:
from sklearn.metrics import roc_curve
# ROC operating points (false-positive rate, true-positive rate, score threshold) on the test split.
fpr,tpr,threshold  = roc_curve(y_test,positive_proba)

In [None]:
# Tabulate the ROC operating points and add, per row, the geometric mean of
# the true-positive rate and the true-negative rate (1 - fpr).
temp = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'threshold': threshold}).assign(
    gmeans=np.sqrt(tpr * (1 - fpr))
)


In [None]:
# Show the ROC table with the gmeans column.
temp

In [None]:
import matplotlib.pyplot as plt

# ROC curve of the tuned model on the test split, against the chance diagonal.
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
# The curve comes from the tuned ExtraTrees model, so it is labelled as such
# (it was previously labelled 'Logistic').
ax.plot(fpr, tpr, marker='.', label='Extra Trees')
# axis labels and title
ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate', title='ROC curve')
ax.legend()
# show the plot
plt.show()

In [None]:
# Second predict_proba column for each test row (same expression as positive_proba).
class_1_probabilites= model_tuned.predict_proba(X_test)[:,1]

In [None]:
# Take the cut-off from the ROC table row with the highest G-mean instead of a
# hard-coded number, so the threshold follows whichever model was tuned.
# NOTE(review): the threshold is selected on the test split, so the metrics
# computed from these predictions are optimistic -- consider a validation split.
best_threshold = temp.loc[temp['gmeans'].idxmax(), 'threshold']
# roc_curve counts a score equal to the threshold as positive, hence `>=`.
y_pred = np.where(class_1_probabilites >= best_threshold, 1, 0)

In [None]:
# Confusion matrix of the threshold-adjusted predictions against the test labels.
confusion_matrix(y_test,y_pred)

In [None]:
# Per-class precision / recall / F1 of the threshold-adjusted predictions.
print(classification_report(y_test, y_pred))