In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found there
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import all the tools we need

# Regular EDA (exploratory data analysis) and plotting libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# we want our plots to appear inside the notebook
%matplotlib inline 

# Models from Scikit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Model Evaluations
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import RocCurveDisplay

# plot_roc_curve was removed in scikit-learn 1.2; guard the import so this
# cell still runs on current releases (RocCurveDisplay is the replacement).
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    plot_roc_curve = None

In [None]:
df = pd.read_csv("/kaggle/input/heart-disease-uci/heart.csv")
df.shape # (rows, columns)

### Data Exploration (exploratory data analysis or EDA)

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
# Let's find out how many samples there are of each class
df["target"].value_counts()

In [None]:
# Normalized value counts
df.target.value_counts(normalize=True)

In [None]:
df["target"].value_counts().plot(kind = "bar", color = ["salmon", "lightblue"]);

In [None]:
df.info()

In [None]:
# Are there any missing data
df.isna().sum()

In [None]:
df.describe()

### Heart Disease Frequency according to Sex

In [None]:
df.sex.value_counts()

In [None]:
# Compare target column with sex column

pd.crosstab(df.target, df.sex)

In [None]:
# Grouped bar chart of the target-by-sex crosstab
sex_ax = pd.crosstab(df["target"], df["sex"]).plot.bar(
    figsize=(10, 6),
    color=["salmon", "lightblue"],
)

# Label the axes the chart was drawn on
sex_ax.set_title(" Heart Disease Frequency for Sex")
sex_ax.set_xlabel("0 = No Disease, 1 = Disease")
sex_ax.set_ylabel("Amount")
sex_ax.legend(["Female", "Male"])

# Keep the x tick labels horizontal
plt.xticks(rotation=0);


In [None]:
df.head()

In [None]:
df["thalach"].value_counts()

### Age vs. Max Heart Rate for Heart Disease

In [None]:
# Boolean row masks for the two outcome groups
with_disease = df["target"] == 1
without_disease = df["target"] == 0

# New 10x6 figure with a single set of axes
fig, ax = plt.subplots(figsize=(10, 6))

# Positive examples first, then negative examples
ax.scatter(df.loc[with_disease, "age"],
           df.loc[with_disease, "thalach"],
           c="salmon")
ax.scatter(df.loc[without_disease, "age"],
           df.loc[without_disease, "thalach"],
           c="lightblue")

# Title, axis labels and legend (legend order follows the draw order above)
ax.set_title("Heart Disease in function of Age and Max Heart Rate")
ax.set_xlabel("Age")
ax.set_ylabel("Max Heart Rate")
ax.legend(["Disease", "No Disease"]);

In [None]:
# Check the distribution of the age column with a histogram

df.age.plot.hist();

### Heart Disease Frequency per Chest Pain Type

cp - chest pain type

    * 0: Typical angina: chest pain related to decreased blood supply to the heart
    * 1: Atypical angina: chest pain not related to heart
    * 2: Non-anginal pain: typically esophageal spasms (non heart related)    
    * 3: Asymptomatic: chest pain not showing signs of disease

In [None]:
pd.crosstab(df.cp, df.target)

In [None]:
# Grouped bar chart of chest pain type against target
cp_ax = pd.crosstab(df["cp"], df["target"]).plot.bar(
    figsize=(10, 6),
    color=["salmon", "lightblue"],
)

# Title, axis labels and legend for the chart
cp_ax.set_title("Heart Disease Frequency Per Chest Pain Type")
cp_ax.set_xlabel("Chest Pain Type")
cp_ax.set_ylabel("Amount")
cp_ax.legend(["No Disease", "Disease"])

# Keep the x tick labels horizontal
plt.xticks(rotation=0);

In [None]:
df.head(2)

In [None]:
# Make a correlation matrix
df.corr()

In [None]:
# Annotated heatmap of the pairwise column correlations
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    corr_matrix,
    annot=True,        # print each coefficient in its cell
    fmt=".2f",         # two decimal places
    linewidths=0.5,
    cmap="YlGnBu",
    ax=ax,
);

In [None]:
df.head()

In [None]:
# Feature matrix: every column other than the label
X = df.drop(columns=["target"])

# Label vector as a NumPy array
y = df["target"].to_numpy()

In [None]:
# Independent variables (no target column)
X.head()

In [None]:
# Targets
y

In [None]:
# Seed NumPy's global RNG before splitting so the split is reproducible
np.random.seed(42)

# Hold out 20% of the rows as the test set; the rest is used for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
X_train.head()

In [None]:
y_train, len(y_train)

In [None]:
X_test.head()

In [None]:
y_test, len(y_test)

In [None]:
# Baseline estimators (default hyperparameters) keyed by display name
models = dict([
    ("KNN", KNeighborsClassifier()),
    ("Logistic Regression", LogisticRegression()),
    ("Random Forest", RandomForestClassifier()),
])

# Create function to fit and score models
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Train every model on the training split and score it on the test split.

    models : dict mapping a display name to an object exposing
             fit(X, y) and score(X, y)
    X_train : training features
    X_test : test features
    y_train : labels associated with the training features
    y_test : labels associated with the test features

    Returns a dict mapping each model name to the value returned by its
    score(X_test, y_test) call, in the same order as `models`.
    Each model is fitted in place.
    """
    def _train_then_evaluate(estimator):
        # Fit first, then score, without relying on fit()'s return value
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    # Re-seed NumPy's global RNG so repeated calls give reproducible results
    np.random.seed(42)
    return {label: _train_then_evaluate(estimator)
            for label, estimator in models.items()}
    

In [None]:
# Fit every baseline model and collect its test-set score
model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)
model_scores

### Model Comparison

In [None]:
model_compare = pd.DataFrame(model_scores, index=['accuracy'])

In [None]:
model_compare

In [None]:
model_compare.T

In [None]:
model_compare.T.plot.bar();

In [None]:
# Candidate values for n_neighbors: 1 through 20
neighbors = range(1, 21)

# One estimator instance, re-parameterised on every pass of the loop
knn = KNeighborsClassifier()

# Scores on the training and test splits, one entry per neighbor count
train_scores = []
test_scores = []

for i in neighbors:
    # Apply the neighbor count and refit on the training split
    knn.set_params(n_neighbors=i).fit(X_train, y_train)

    # Record the score on both splits
    train_scores += [knn.score(X_train, y_train)]
    test_scores += [knn.score(X_test, y_test)]

In [None]:
train_scores

In [None]:
# Train and test score curves against the number of neighbors
knn_fig, knn_ax = plt.subplots()
knn_ax.plot(neighbors, train_scores, label="Train score")
knn_ax.plot(neighbors, test_scores, label="Test score")

# One tick per candidate neighbor count
knn_ax.set_xticks(np.arange(1, 21, 1))
knn_ax.set_xlabel("Number of neighbors")
knn_ax.set_ylabel("Model score")
knn_ax.legend()

print(f"Maximum KNN score on the test data: {max(test_scores)*100:.2f}%")

Looking at the graph, n_neighbors = 11 seems best.

Even knowing this, the KNN's model performance didn't get near what LogisticRegression or the RandomForestClassifier did.

Because of this, we'll discard KNN and focus on the other two.

We've tuned KNN by hand, but let's see how we can tune LogisticRegression and RandomForestClassifier using RandomizedSearchCV.

### Tuning models with RandomizedSearchCV

In [None]:
# Different LogisticRegression hyperparameters
# C: 20 values spaced evenly on a log scale from 1e-4 to 1e4; solver fixed to liblinear
log_reg_grid = {"C": np.logspace(-4, 4, 20),
                "solver": ["liblinear"]}

# Different RandomForestClassifier hyperparameters
# n_estimators: 10, 60, ..., 960; max_depth: None or 3/5/10;
# min_samples_split: even values 2..18; min_samples_leaf: odd values 1..19
rf_grid = {"n_estimators": np.arange(10, 1000, 50),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2)}

In [None]:
# Seed NumPy's global RNG before running the randomized search
np.random.seed(42)

# Randomized search over log_reg_grid: 20 sampled candidates, 5-fold CV each
rs_log_reg = RandomizedSearchCV(
    estimator=LogisticRegression(),
    param_distributions=log_reg_grid,
    n_iter=20,
    cv=5,
    verbose=True,
)

# Run the search on the training split
rs_log_reg.fit(X_train, y_train);

In [None]:
rs_log_reg.best_params_

In [None]:
rs_log_reg.score(X_test, y_test)

In [None]:
# Seed NumPy's global RNG before running the randomized search
np.random.seed(42)

# Randomized search over rf_grid: 20 sampled candidates, 5-fold CV each
rs_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=rf_grid,
    n_iter=20,
    cv=5,
    verbose=True,
)

# Run the search on the training split
rs_rf.fit(X_train, y_train);

In [None]:
# Find the best parameters
rs_rf.best_params_

In [None]:
# Evaluate the randomized search random forest model
rs_rf.score(X_test, y_test)

### Now Tuning a model with GridSearchCV

In [None]:
# Search space for LogisticRegression: 20 log-spaced C values, liblinear solver
log_reg_grid = dict(C=np.logspace(-4, 4, 20),
                    solver=["liblinear"])

# Exhaustive grid search over log_reg_grid with 5-fold cross-validation
gs_log_reg = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=log_reg_grid,
    cv=5,
    verbose=True,
)

# Run the search on the training split
gs_log_reg.fit(X_train, y_train);

In [None]:
# Check the best parameters
gs_log_reg.best_params_

In [None]:
# Evaluate the model
gs_log_reg.score(X_test, y_test)

### Evaluating a classification model, beyond accuracy

In [None]:
# Make predictions on test data
y_preds = gs_log_reg.predict(X_test)

In [None]:
y_preds

In [None]:
y_test

In [None]:
# Import the ROC curve display class from the metrics module.
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its replacement.
from sklearn.metrics import RocCurveDisplay

# Plot ROC curve and calculate AUC metric for the tuned model on the test split
RocCurveDisplay.from_estimator(gs_log_reg, X_test, y_test);

In [None]:
# Display confusion matrix
print(confusion_matrix(y_test, y_preds))

In [None]:
# Import Seaborn
import seaborn as sns
sns.set(font_scale=1.5) # Increase font size

def plot_conf_mat(y_test, y_preds):
    """
    Plots a confusion matrix using Seaborn's heatmap().

    y_test : true labels
    y_preds : labels predicted by the model for the same samples

    confusion_matrix(y_true, y_pred) returns a matrix whose rows are the
    true labels and whose columns are the predicted labels. heatmap() draws
    rows along the y-axis and columns along the x-axis, so the y-axis is the
    true label and the x-axis is the predicted label.
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    ax = sns.heatmap(confusion_matrix(y_test, y_preds),
                     annot=True, # Annotate the boxes
                     cbar=False,
                     ax=ax) # Draw on the axes created above
    # Columns of the matrix (x-axis) are predictions, rows (y-axis) are truth
    ax.set_xlabel("predicted label")
    ax.set_ylabel("true label")
    
plot_conf_mat(y_test, y_preds)

In [None]:
# Show classification report
print(classification_report(y_test, y_preds))

In [None]:
# Check best hyperparameters
gs_log_reg.best_params_

In [None]:
# Import cross_val_score
from sklearn.model_selection import cross_val_score

# Instantiate a fresh model with the best hyperparameters found with GridSearchCV.
# They are read from gs_log_reg.best_params_ (the "C" and "solver" entries of
# log_reg_grid) instead of being pasted in by hand, so the model always matches
# the search result.
clf = LogisticRegression(**gs_log_reg.best_params_)

In [None]:
# Accuracy of clf on each fold of a 5-fold cross-validation over the full data
cv_acc = cross_val_score(clf, X, y, cv=5, scoring="accuracy")
cv_acc

In [None]:
cv_acc = np.mean(cv_acc)
cv_acc

In [None]:
# Mean precision of clf across a 5-fold cross-validation over the full data
cv_precision = cross_val_score(clf, X, y, cv=5, scoring="precision").mean()
cv_precision

In [None]:
# Mean recall of clf across a 5-fold cross-validation over the full data
cv_recall = cross_val_score(clf, X, y, cv=5, scoring="recall").mean()
cv_recall

In [None]:
# Mean F1 score of clf across a 5-fold cross-validation over the full data
cv_f1 = cross_val_score(clf, X, y, cv=5, scoring="f1").mean()
cv_f1

In [None]:
# One-row frame holding the four cross-validated metrics, one column each
cv_metrics = pd.DataFrame(
    dict(Accuracy=cv_acc,
         Precision=cv_precision,
         Recall=cv_recall,
         F1=cv_f1),
    index=[0],
)

# Transpose so each metric becomes its own bar
cv_metrics.T.plot(kind="bar", title="Cross-Validated Metrics", legend=False);

In [None]:
# Fit an instance of LogisticRegression (taken from above)
clf.fit(X_train, y_train);

In [None]:
# Now Check coef_
clf.coef_

In [None]:
# Match each coefficient to the feature column it was fitted on.
# Use the training frame's columns rather than df.columns: df also contains
# "target", which is not a feature, and zip() would silently mislabel the
# coefficients if that column were not the last one.
features_dict = dict(zip(X_train.columns, clf.coef_[0]))
features_dict

In [None]:
# Visualize feature importance
features_df = pd.DataFrame(features_dict, index=[0])
features_df.T.plot.bar(title="Feature Importance", legend=False);

In [None]:
pd.crosstab(df["sex"], df["target"])

In [None]:
# Contrast slope (positive coefficient) with target
pd.crosstab(df["slope"], df["target"])