In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the heart-disease CSV from the read-only input directory into the working frame.
heart_csv = '../input/heart-disease-uci/heart.csv'
df = pd.read_csv(heart_csv)

In [None]:
# Preview ten random rows. The fixed seed keeps the displayed sample identical
# across "Restart & Run All" executions.
df.sample(10, random_state=0)

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Columns explored with box plots below (treated as continuous measurements).
num_features = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]

# Columns explored with bar charts below (treated as discrete codes).
cat_features = [
    'sex',
    'cp',
    'fbs',
    'restecg',
    'exang',
    'slope',
    'ca',
    'thal',
]

# Name of the column the models are trained to predict.
label = 'target'

In [None]:
import matplotlib.pyplot as plt

# For every numeric column, keep only the rows whose value lies between the
# column's 1st and 95th percentiles (both ends inclusive) and draw one box
# plot per value of the label column.
for col in num_features:
    lower_cut = df[col].quantile(.01)
    upper_cut = df[col].quantile(.95)
    trimmed = df[df[col].between(lower_cut, upper_cut)]
    trimmed.boxplot(column=col, by=label, figsize=(12,4))
    plt.title(col)
    plt.show()

In [None]:
# One bar chart per categorical column: how many rows carry each level,
# with the levels ordered by their value rather than by frequency.
for col in cat_features:
    level_counts = df[col].value_counts().sort_index()
    fig, ax = plt.subplots()
    level_counts.plot.bar(ax=ax)
    ax.set_title(col+' counts')
    plt.show()

In [None]:
# Model inputs: the numeric columns followed by the categorical ones, with
# 'trestbps' left out (the notebook does not record why it is excluded).
# The order matters: later cells address these columns by position.
relevant_features = [*num_features, *cat_features]
relevant_features.remove('trestbps')

In [None]:
# Pull the model inputs and the target out of the frame as NumPy arrays.
X = df[relevant_features].to_numpy()
y = df[label].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions close to equal in both parts; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

print('Train Set Size', len(X_train), '\nTest Set Size:'+str(len(X_test)))

In [None]:
# Preview ten random rows of the model inputs, in the column order used to
# build X. The fixed seed keeps the displayed sample identical across re-runs.
df[relevant_features].sample(10, random_state=0)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
import numpy as np

# Positions of the scaled (numeric) and one-hot-encoded (categorical) columns
# inside X. X was built from `relevant_features`, so the positions are looked
# up by name instead of being hard-coded; for the feature lists defined earlier
# in this notebook this yields [0, 1, 2, 3] and [4, ..., 11].
numeric_features = [relevant_features.index(name) for name in num_features
                    if name in relevant_features]
numeric_transformer = Pipeline(steps=[
    ('scaler',StandardScaler())
])

categorical_features = [relevant_features.index(name) for name in cat_features
                        if name in relevant_features]
# handle_unknown='ignore': a category level that never occurs in the training
# rows but shows up at predict time is encoded as all zeros instead of raising
# a ValueError (the encoder's default behaviour).
categorical_transformer = Pipeline(steps=[
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# L1-penalised logistic regression on top of the preprocessing step.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regression', LogisticRegression(penalty='l1', solver='liblinear',random_state=42))
])

model = pipeline.fit(X_train,y_train)
print(model)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve

# Score the fitted pipeline on the held-out rows: hard labels feed the
# confusion matrix and threshold metrics, probabilities feed the ROC analysis.
y_pred = model.predict(X_test)
y_scores = model.predict_proba(X_test)
# Column index 1 of predict_proba is the score passed to the AUC / ROC calls.
positive_scores = y_scores[:, 1]

cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n',cm,'\n')
for caption, metric in (
    ('Accuracy:', accuracy_score),
    ('Overall Precision:', precision_score),
    ('Overall Recall:', recall_score),
):
    print(caption, metric(y_test, y_pred))
auc = roc_auc_score(y_test, positive_scores)
print('AUC:',auc)

fpr, tpr, thresh = roc_curve(y_test, positive_scores)

# ROC curve drawn over the dashed diagonal reference line.
fig, ax = plt.subplots(figsize=(6,6))
ax.plot([0,1],[0,1],'k--')
ax.plot(fpr,tpr)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
plt.show()

In [None]:
# Same preprocessing as before, now with an L2-penalised logistic regression.
l2_logreg = LogisticRegression(penalty='l2', solver='liblinear', random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regression', l2_logreg),
])

model = pipeline.fit(X_train, y_train)
print(model)

In [None]:
# Held-out evaluation of the current `model`: labels for the confusion matrix
# and threshold metrics, probabilities for AUC and the ROC curve.
y_pred = model.predict(X_test)
y_scores = model.predict_proba(X_test)
# Column index 1 of predict_proba is the score passed to the AUC / ROC calls.
positive_scores = y_scores[:, 1]

cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n',cm,'\n')
for caption, metric in (
    ('Accuracy:', accuracy_score),
    ('Overall Precision:', precision_score),
    ('Overall Recall:', recall_score),
):
    print(caption, metric(y_test, y_pred))
auc = roc_auc_score(y_test, positive_scores)
print('AUC:',auc)

fpr, tpr, thresh = roc_curve(y_test, positive_scores)

# ROC curve drawn over the dashed diagonal reference line.
fig, ax = plt.subplots(figsize=(6,6))
ax.plot([0,1],[0,1],'k--')
ax.plot(fpr,tpr)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Same preprocessing, with a seeded random forest as the final step.
# The step keeps the name 'regression' used by the other pipelines.
forest = RandomForestClassifier(random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regression', forest),
])

model = pipeline.fit(X_train, y_train)
print(model)

In [None]:
# Held-out evaluation of the current `model`: labels for the confusion matrix
# and threshold metrics, probabilities for AUC and the ROC curve.
y_pred = model.predict(X_test)
y_scores = model.predict_proba(X_test)
# Column index 1 of predict_proba is the score passed to the AUC / ROC calls.
positive_scores = y_scores[:, 1]

cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n',cm,'\n')
for caption, metric in (
    ('Accuracy:', accuracy_score),
    ('Overall Precision:', precision_score),
    ('Overall Recall:', recall_score),
):
    print(caption, metric(y_test, y_pred))
auc = roc_auc_score(y_test, positive_scores)
print('AUC:',auc)

fpr, tpr, thresh = roc_curve(y_test, positive_scores)

# ROC curve drawn over the dashed diagonal reference line.
fig, ax = plt.subplots(figsize=(6,6))
ax.plot([0,1],[0,1],'k--')
ax.plot(fpr,tpr)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Same preprocessing, with a seeded gradient-boosting classifier as the final
# step. The step keeps the name 'regression' used by the other pipelines.
booster = GradientBoostingClassifier(random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regression', booster),
])

model = pipeline.fit(X_train, y_train)
print(model)

In [None]:
# Held-out evaluation of the current `model`: labels for the confusion matrix
# and threshold metrics, probabilities for AUC and the ROC curve.
y_pred = model.predict(X_test)
y_scores = model.predict_proba(X_test)
# Column index 1 of predict_proba is the score passed to the AUC / ROC calls.
positive_scores = y_scores[:, 1]

cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n',cm,'\n')
for caption, metric in (
    ('Accuracy:', accuracy_score),
    ('Overall Precision:', precision_score),
    ('Overall Recall:', recall_score),
):
    print(caption, metric(y_test, y_pred))
auc = roc_auc_score(y_test, positive_scores)
print('AUC:',auc)

fpr, tpr, thresh = roc_curve(y_test, positive_scores)

# ROC curve drawn over the dashed diagonal reference line.
fig, ax = plt.subplots(figsize=(6,6))
ax.plot([0,1],[0,1],'k--')
ax.plot(fpr,tpr)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# Positions of the scaled (numeric) and one-hot-encoded (categorical) columns
# inside X. X was built from `relevant_features`, so the positions are looked
# up by name instead of being hard-coded; for the feature lists defined earlier
# in this notebook this yields [0, 1, 2, 3] and [4, ..., 11].
numeric_features = [relevant_features.index(name) for name in num_features
                    if name in relevant_features]
numeric_transformer = Pipeline(steps=[
    ('scaler',StandardScaler())
])

categorical_features = [relevant_features.index(name) for name in cat_features
                        if name in relevant_features]
# handle_unknown='ignore': during cross-validation a rare category level can be
# missing from a training fold yet present in its validation fold; encode it
# as all zeros instead of letting the encoder raise and void that fold's score.
categorical_transformer = Pipeline(steps=[
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# random_state matches the earlier logistic-regression cells so that the
# liblinear fits are repeatable.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regression', LogisticRegression(random_state=42))
])

# Hyper-parameters searched for every solver.
shared_grid = {
    'regression__C' : np.logspace(-4, 4, 20),
    'regression__max_iter' : [10, 50, 100],
}

# One sub-grid per solver: lbfgs does not support the l1 penalty, so crossing
# every penalty with every solver would schedule fits that can only fail.
param_grid = [
    {**shared_grid,
     'regression__solver' : ['liblinear'],
     'regression__penalty' : ['l1', 'l2']},
    {**shared_grid,
     'regression__solver' : ['lbfgs'],
     'regression__penalty' : ['l2']},
]

clf = GridSearchCV(pipeline, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)

In [None]:
# Run the grid search on the training split. `best_clf` is the fitted search
# object itself (fit() hands back the estimator it was called on).
clf.fit(X_train, y_train)
best_clf = clf

In [None]:
# Display the hyper-parameter combination the grid search selected.
best_clf.best_params_

In [None]:
# Held-out evaluation of the fitted grid search: labels for the confusion
# matrix and threshold metrics, probabilities for AUC and the ROC curve.
y_pred = best_clf.predict(X_test)
y_scores = best_clf.predict_proba(X_test)
# Column index 1 of predict_proba is the score passed to the AUC / ROC calls.
positive_scores = y_scores[:, 1]

cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n',cm,'\n')
for caption, metric in (
    ('Accuracy:', accuracy_score),
    ('Overall Precision:', precision_score),
    ('Overall Recall:', recall_score),
):
    print(caption, metric(y_test, y_pred))
auc = roc_auc_score(y_test, positive_scores)
print('AUC:',auc)

fpr, tpr, thresh = roc_curve(y_test, positive_scores)

# ROC curve drawn over the dashed diagonal reference line.
fig, ax = plt.subplots(figsize=(6,6))
ax.plot([0,1],[0,1],'k--')
ax.plot(fpr,tpr)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
plt.show()