In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn import metrics
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from sklearn import tree

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def get_preprocessed_data(location="../input/heart-attack-analysis-prediction-dataset/heart.csv"):
    """Load the heart dataset CSV and return model-ready features and target.

    Steps, in order: read the CSV, drop exact duplicate rows, one-hot encode
    the categorical columns (dropping the first level of each), split off the
    ``output`` column as the target, and rescale the continuous columns with
    ``StandardScaler``.

    Parameters
    ----------
    location : str or path-like, optional
        CSV file to read. Defaults to the Kaggle path of the dataset, so
        existing zero-argument calls behave exactly as before.

    Returns
    -------
    dict
        ``{"X": features, "y": target}``. ``X`` is a DataFrame of features;
        ``y`` is a one-column DataFrame holding ``output``.

    Notes
    -----
    The scaler is fitted on every loaded row, i.e. before any train/test
    split that a caller performs afterwards.
    """
    categorical = ['sex', 'exng', 'caa', 'cp', 'fbs', 'restecg', 'slp', 'thall']
    continuous = ["age", "trtbps", "chol", "thalachh", "oldpeak"]
    target = ["output"]

    # Reassign rather than mutate in place so the frame's lineage stays explicit.
    df = pd.read_csv(location).drop_duplicates()

    df = pd.get_dummies(df, columns=categorical, drop_first=True)

    X = df.drop(target, axis=1)
    # `target` is a list, so this selects a one-column DataFrame, not a Series.
    y = df[target]

    X[continuous] = StandardScaler().fit_transform(X[continuous])

    return {
        "X": X,
        "y": y
    }

In [None]:
# Run the preprocessing once and expose features/target to the cells below.
data = get_preprocessed_data()
X, y = data["X"], data["y"]

In [None]:
### ROC and feature importances - default settings

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Fit every classifier on the training split (only the seed is set explicitly).
xg_boost = xgb.XGBClassifier(random_state=0, use_label_encoder=False)
xg_boost.fit(X_train, y_train)

random_forest = RandomForestClassifier(random_state=0)
random_forest.fit(X_train, y_train)

ada_boost = AdaBoostClassifier(random_state=0)
ada_boost.fit(X_train, y_train)

dec_tree = tree.DecisionTreeClassifier(random_state=0)
dec_tree.fit(X_train, y_train)

# Report each model's score on the held-out split, followed by a blank line.
for score_label, fitted_clf in (
    ("\nXGB:", xg_boost),
    ("Random Forest:", random_forest),
    ("Ada Boost:", ada_boost),
    ("Decision Tree:", dec_tree),
):
    print(score_label, fitted_clf.score(X_test, y_test))
print()

def get_roc_values(classifier, x_data, y_data):
    """Return ROC curve points and the AUC of ``classifier`` on the given data.

    The scores fed into the ROC computation are column 1 of
    ``classifier.predict_proba(x_data)``.

    Returns
    -------
    tuple
        ``(false_positive_rate, true_positive_rate, auc_score)``; the
        thresholds returned by ``roc_curve`` are discarded.
    """
    class1_scores = classifier.predict_proba(x_data)[:, 1]
    fpr_values, tpr_values, _thresholds = metrics.roc_curve(y_data, class1_scores)
    return fpr_values, tpr_values, metrics.roc_auc_score(y_data, class1_scores)

plt.figure(1)

plt.title("\nROC Vergleich")
plt.xlabel('False Positive Rate\n')
plt.ylabel('True Positive Rate')

# One curve per model; each legend entry carries the AUC rounded to two decimals.
for curve_prefix, fitted_clf in (
    ("XGB=", xg_boost),
    ("rF=", random_forest),
    ("AdaB=", ada_boost),
):
    fpr, tpr, auc = get_roc_values(fitted_clf, X_test, y_test)
    plt.plot(fpr, tpr, label=curve_prefix + str(round(auc, 2)))

plt.legend(loc=4)
plt.savefig("ROC Default.png", dpi=300)


# Feature-importance charts for the default-configured models (RF, AdaB, XGB).
# Each chart reads the importances from its OWN fitted model; previously the
# AdaBoost and XGB charts re-plotted the Random Forest importances.
features = X.columns.values

for figure_number, fitted_clf, chart_title, output_file in (
    (2, random_forest,
     'Feature Importances Random Forest (Default Konfiguration)',
     "Feature Importances - RF - Default.png"),
    (3, ada_boost,
     'Feature Importances AdaBoost (Default Konfiguration)',
     "Feature Importances - AdaB - Default.png"),
    (4, xg_boost,
     'Feature Importances XGB (Default Konfiguration)',
     "Feature Importances - XGB - Default.png"),
):
    importances = fitted_clf.feature_importances_
    # Ascending order, so bars are drawn from least to most important.
    indices = np.argsort(importances)

    plt.figure(figure_number)
    plt.title(chart_title)
    plt.barh(range(len(indices)), importances[indices], color='b', align='center')
    plt.yticks(range(len(indices)), [features[i] for i in indices])
    plt.xlabel('Relative Importance')
    plt.savefig(output_file, dpi=300)

In [None]:
### Hyperparameter tuning

# RF
# Exhaustive grid: every combination below is cross-validated.
params = dict(
    n_estimators=[10, 50, 100, 150],
    min_samples_leaf=[1, 5, 10],
    max_depth=[2, 10, 20, None],
    min_samples_split=[2, 3, 5],
    max_leaf_nodes=[10, 25, 50, None],
    bootstrap=[True, False],
    max_features=["sqrt", "log2", None],
)

grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=params,
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid.fit(X, y)

print("\nRandom Forest")
print(f"Best parameters: {grid.best_params_}")
print(f"Best cross-validation score: {grid.best_score_:.2f}\n")

# Chart the importances of the best estimator found by the search, in ascending order.
features = X.columns.values
importances = grid.best_estimator_.feature_importances_
indices = importances.argsort()
bar_positions = range(len(indices))

plt.figure(5)
plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, [features[i] for i in indices])
plt.title('Feature Importances Random Forest')
plt.xlabel('Relative Importance')
plt.savefig("Feature Importances - RF - Tuned.png", dpi=300)

In [None]:
# AdaB
# Grid-search AdaBoost over ensemble size and learning rate with 5-fold CV.
params = {
    "n_estimators": [10, 50, 100, 150],
    "learning_rate": [0.1, 0.5, 1, 2, 3]
    }

grid = GridSearchCV(AdaBoostClassifier(random_state=0), params, cv=5, verbose=1, n_jobs=-1)
grid.fit(X, y)

print("\nAdaBoost")
print("Best parameters: {}".format(grid.best_params_))
print("Best cross-validation score: {:.2f}\n".format(grid.best_score_))

# Defined here so the cell does not depend on `features` leaking in from an
# earlier cell.
features = X.columns.values
importances = grid.best_estimator_.feature_importances_
indices = np.argsort(importances)

# Use a figure number no other chart uses: 5 and 6 belong to the tuned Random
# Forest and XGB charts, and reusing one would draw this chart's bars on top of
# theirs whenever figures persist between cells.
plt.figure(7)
plt.title('Feature Importances AdaBoost')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.savefig("Feature Importances - AdaB - Tuned.png", dpi=300)

In [None]:
# XGB
# Grid-search XGBoost over tree depth with 5-fold CV.
params = {
    "max_depth": [2, 6, 10]
    }

grid = GridSearchCV(xgb.XGBClassifier(random_state=0), params, cv=5, verbose=1, n_jobs=-1)
grid.fit(X, y)

print("\nXGBoost")
print("Best parameters: {}".format(grid.best_params_))
print("Best cross-validation score: {:.2f}\n".format(grid.best_score_))

# Defined here so the cell does not depend on `features` leaking in from an
# earlier cell.
features = X.columns.values
# Read the importances from the tuned XGB model itself; the chart previously
# plotted the default Random Forest's importances under an XGB title.
importances = grid.best_estimator_.feature_importances_
indices = np.argsort(importances)

plt.figure(6)
plt.title('Feature Importances XGB')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.savefig("Feature Importances - XGB - Tuned.png", dpi=300)