# Imports

In [None]:
# %reload_ext nb_black
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import plotly
import seaborn as sns
import plotly.express as px

%matplotlib inline

# plt.style.use(["dark_background"])
# %matplotlib ipympl
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, GridSearchCV
from category_encoders import LeaveOneOutEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import warnings

# from functions_pkg import print_vif, predictions_df
from sklearn.metrics import (
    plot_confusion_matrix,
    plot_roc_curve,
    plot_precision_recall_curve,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
)
from sklearn.calibration import calibration_curve

In [None]:
# functions from package that won't import into notebook
def print_vif(feature_df):
    """
    Print the variance inflation factor (VIF) for each column of a feature frame.

    ``sm.add_constant`` is applied to the features first, and a VIF is computed
    for every column of the resulting frame, so any constant column it adds is
    included in the printed output.

    :param feature_df: input features to check using VIF. This is assumed to be a pandas.DataFrame
    :return: nothing is returned; the VIFs are printed as a pandas series
    """
    # Silence numpy FutureWarning about .ptp
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        with_const = sm.add_constant(feature_df)

    n_columns = with_const.shape[1]
    vifs = [
        variance_inflation_factor(with_const.values, col_idx)
        for col_idx in range(n_columns)
    ]
    report = pd.Series(vifs, index=with_const.columns)

    print("VIF results\n-------------------------------")
    print(report)
    print("-------------------------------\n")


def predictions_df(X_test, y_test, y_preds):
    """
    Build a predictions data frame and a true-vs-predicted scatter plot.

    The returned frame is a copy of ``X_test`` with ``y_true``, ``y_preds``,
    ``residuals`` (``y_preds - y_true``) and ``abs_residuals`` columns added.
    It is sorted by ``abs_residuals`` in descending order, so its row order
    is NOT the row order of ``X_test``; align by index when attaching further
    columns to it.

    :param X_test: test features
    :param y_test: true target values for X_test
    :param y_preds: X_test predictions; model.predict(X_test)
    :return pred_df, fig: returns predictions data frame and plotly express fig object
    """
    pred_df = (
        X_test.copy()
        .assign(y_true=y_test, y_preds=y_preds)
        .assign(residuals=lambda frame: frame.y_preds - frame.y_true)
        .assign(abs_residuals=lambda frame: frame.residuals.abs())
        .sort_values("abs_residuals", ascending=False)
    )

    # Diagonal reference line spanning the observed range of the true values
    lo, hi = y_test.min(), y_test.max()
    fig = px.scatter(data_frame=pred_df, x="y_true", y="y_preds")
    fig.add_shape(type="line", x0=lo, y0=lo, x1=hi, y1=hi)

    return pred_df, fig

* Categorical or continuous
* 10,000+ samples
* 20+ features

Note: `age` is recorded in days (int).

In [None]:
# Semicolon-separated file; the "id" column becomes the frame's index
path = "../input/cardiovascular-disease-dataset/cardio_train.csv"
df = pd.read_csv(filepath_or_buffer=path, delimiter=";", index_col="id")

In [None]:
# Replace the abbreviated source column names with more readable ones
mapping = dict(
    ap_hi="bp_hi",
    ap_lo="bp_lo",
    gluc="glucose",
    alco="alcohol",
    cardio="disease",
)

df = df.rename(mapping, axis="columns")

In [None]:
# class counts of the target: dataset is well balanced
display(df["disease"].value_counts())

# gender is a bit unbalanced in dataset
display(df["gender"].value_counts())

In [None]:
# no null values in the data
df.isna().mean().sort_values(ascending=False)

In [None]:
df.head()

# Data Cleaning

In [None]:
# change gender to 0-1 binary by shifting the raw codes down by one
# (presumably the raw codes are 1 and 2 — TODO confirm against the data dictionary)
# NOTE: not idempotent — re-running this cell subtracts 1 again
df.loc[:, "gender"] = df["gender"].sub(1)

In [None]:
# reduce interval in cholesterol & glucose from 1-3 to 0-2
# NOTE: not idempotent — re-running this cell shifts the codes again
df.loc[:, "cholesterol"] = df["cholesterol"].sub(1)
df.loc[:, "glucose"] = df["glucose"].sub(1)

# Exploration

In [None]:
num_cols = ["age", "bp_hi", "bp_lo"]


In [None]:
# one violin plot per numeric column, grouped by disease status
for col in num_cols:
    sns.violinplot(data=df, x="disease", y=col)
    plt.show()

## BP value errors

In [None]:
# extreme values in bp_hi need to be corrected:
# plot both blood pressure columns by disease status to show them
bp_cols = ["bp_hi", "bp_lo"]
for col in bp_cols:
    sns.violinplot(data=df, x="disease", y=col)
    plt.show()

In [None]:
# 993 samples with extreme values for bp_hi or bp_lo
# (|bp_hi| above 300 or |bp_lo| above 200 is treated as an error and removed)
extreme_bp = df["bp_hi"].abs().gt(300) | df["bp_lo"].abs().gt(200)
idx = df.index[extreme_bp]
df = df.drop(index=idx)

In [None]:
# drop samples with negative bp_values (either column below zero)
negative_bp = df[["bp_hi", "bp_lo"]].lt(0).any(axis=1)
idx = df.index[negative_bp]
df = df.drop(index=idx)

In [None]:
# drop samples with bp_hi or bp_lo values less than 50; data entry error
too_low_bp = df[["bp_lo", "bp_hi"]].lt(50).any(axis=1)
idx = df.index[too_low_bp]
df = df.drop(index=idx)

## Height value errors

In [None]:
# create column for height in ft (height / 30.48)
df["height_ft"] = df["height"] / 30.48

# drop samples with heights below 4.5 feet or above 7 feet
height_out_of_range = df["height_ft"].lt(4.5) | df["height_ft"].gt(7)
idx = df.index[height_out_of_range]
df = df.drop(index=idx)

---
# Features

In [None]:
df = df.assign(
    # blood pressure difference column
    bp_diff=df["bp_hi"] - df["bp_lo"],
    # BMI column to replace height and weight
    # bmi = weight (kgs) / (height (m))^2
    bmi=df["weight"] / (df["height"] / 100) ** 2,
    # added some more common measurement unit columns for better understanding
    yrs=df["age"] / 365,
    height_ft=df["height"] / 30.48,  # same formula as the earlier height_ft column
    weight_lbs=df["weight"] * 2.205,
)

In [None]:
# distributions of the engineered columns by disease status
# (the list keeps the name bp_cols even though only bp_diff is a blood pressure column)
bp_cols = ["bp_diff", "bmi", "height_ft", "weight_lbs"]
for col in bp_cols:
    sns.violinplot(data=df, x="disease", y=col)
    plt.show()

In [None]:
# 68,621 samples after dropping errors
df.shape[0]

In [None]:
# Overlaid weight distributions for the disease and no-disease groups
feat = df["weight"]
feat1 = df.loc[df.disease == 1, "weight"]
feat0 = df.loc[df.disease == 0, "weight"]

fig, ax = plt.subplots()
for group_values, group_color, group_label in (
    (feat1, "#b51616", "Disease"),
    (feat0, "#0bbd1a", "No Disease"),
):
    sns.distplot(group_values, color=group_color, label=group_label)

ax.set_xlabel("weight")
ax.set_title("weight Distribution")
ax.legend()
plt.show()

In [None]:
import plotly.figure_factory as ff

# One sample per group, listed in the same order as the labels.
# Relies on feat1 / feat0 from the previous cell (weight by disease status).
group_labels = ["Disease", "No Disease"]
hist_data = [feat1, feat0]

# No bin_size is passed, so the library default applies
fig = ff.create_distplot(hist_data=hist_data, group_labels=group_labels)
fig.update_layout(title_text="weight Distribution", xaxis_title="weight")
fig.show()

In [None]:
# Grouped bar chart of cholesterol level counts, split by disease status
feat = df["cholesterol"]
feat1 = df.loc[df.disease == 1, "cholesterol"].value_counts()
feat0 = df.loc[df.disease == 0, "cholesterol"].value_counts()

fig, ax = plt.subplots()
width = 0.25

# Each group is offset half a bar width to either side of the level's tick
cd = ax.bar(
    x=feat1.index - width / 2,
    height=feat1,
    width=width,
    color="#e60909",
    label="Disease",
)
no_cd = ax.bar(
    x=feat0.index + width / 2,
    height=feat0,
    width=width,
    color="#09e648",
    label="No Disease",
)

# Attach a text label above each bar of both groups, displaying its height
for rect in [*cd, *no_cd]:
    height = rect.get_height()
    bar_center = rect.get_x() + rect.get_width() / 2
    ax.annotate(
        f"{height}",
        xy=(bar_center, height),
        xytext=(0, 3),  # 3 points vertical offset
        textcoords="offset points",
        ha="center",
        va="bottom",
    )

ax.set_xlabel("cholesterol")
ax.set_xticks(feat.unique())
ax.set_title("cholesterol")
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
import plotly.graph_objects as go

# Relies on feat / feat1 / feat0 from the previous cell
# (cholesterol column and its per-class value_counts()).
stat = "Cholesterol"

# value_counts() orders its result by count, while unique() orders by first
# appearance, so the counts must be re-aligned to one shared, sorted list of
# levels; otherwise bars end up labelled with the wrong level.
intervals = sorted(feat.unique())
disease_counts = feat1.reindex(intervals, fill_value=0)  # 0 for a level absent from a group
no_disease_counts = feat0.reindex(intervals, fill_value=0)

fig = go.Figure(
    data=[
        go.Bar(name="Disease", x=intervals, y=list(disease_counts)),
        go.Bar(name="No Disease", x=intervals, y=list(no_disease_counts)),
    ]
)
# Change the bar mode
fig.update_layout(
    barmode="group", title_text=f"{stat} Distribution", xaxis_title=f"{stat} Values"
)
fig.show()

In [None]:
# Split violins (disease vs. no disease) per gender for each blood pressure measure
for bp_measure in ("bp_hi", "bp_lo", "bp_diff"):
    sns.catplot(
        data=df, x="gender", y=bp_measure, hue="disease", kind="violin", split=True
    )
    plt.show()

In [None]:
import plotly.graph_objects as go

# Row masks for the two halves of the split violin
without_disease = df["disease"] == 0
with_disease = df["disease"] == 1

fig = go.Figure()

# Left half: samples without disease
# NOTE(review): the scalegroup id "Yes" differs from this trace's legend group
# name and looks like a template leftover — confirm it is intended.
fig.add_trace(
    go.Violin(
        x=df.loc[without_disease, "gender"],
        y=df.loc[without_disease, "bp_hi"],
        legendgroup="No Disease",
        scalegroup="Yes",
        name="No Disease",
        side="negative",
        line_color="#09e648",
    )
)

# Right half: samples with disease
fig.add_trace(
    go.Violin(
        x=df.loc[with_disease, "gender"],
        y=df.loc[with_disease, "bp_hi"],
        legendgroup="Disease",
        scalegroup="Disease",
        name="Disease",
        side="positive",
        line_color="#e60909",
    )
)

fig.update_traces(meanline_visible=True)
fig.update_layout(violingap=0, violinmode="overlay")
fig.show()

# Gradient Boosting Model

In [None]:
df.head()

In [None]:
# Columns excluded from the feature matrix: the target, the unit-converted
# duplicates, bp_diff, and weight (bmi and height are kept)
drop_cols = ["disease", "yrs", "height_ft", "bp_diff", "weight_lbs", "weight"]

y = df["disease"]
X = df.drop(columns=drop_cols)

# 50/50 split, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=28,
    stratify=y,
)

In [None]:
# checking the variance inflation factor to identify any redundancy in the feature vars
# weight and bmi understandably show high vif which is why weight was dropped from the feature vars
print_vif(X_train)

> Categorical columns encoded; decision tree class models don't require numerical scaling

In [None]:
# categorical columns that could be encoded, and the level to drop per column
# (both are unused while no encoder is active below)
cat_cols = ["cholesterol", "glucose"]
drop_cat = [0, 0]

# data preprocessing: no transformer is registered, so every column is passed
# through unchanged. Previously tried encoders for cat_cols:
#   ("encode_cats", OneHotEncoder(drop=drop_cat), cat_cols)
#   ("encode_cats", LeaveOneOutEncoder(), cat_cols)
preprocessing = ColumnTransformer(transformers=[], remainder="passthrough")

In [None]:
# Baseline XGBoost classifier with default hyperparameters
xgb_steps = [
    ("processing", preprocessing),
    ("model", XGBClassifier(use_label_encoder=False)),
]
pipeline = Pipeline(steps=xgb_steps)
pipeline.fit(X_train, y_train)

# Score returned by the pipeline on each split
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

print(f"\ntrain score: {train_score}")
print(f"test score: {test_score}")

---
## XGB hyperparameter tuning

XGBoost

In [None]:
# grid search values other than optimal hyperparameters removed to lower notebook run time
# fmt: off
grid = {
    # NOTE(review): only 1 or 2 boosting rounds are searched — confirm this is intended
    "model__n_estimators": np.arange(1, 3),
    # learning_rate (eta) must lie in [0, 1]; the previous np.arange(0, 50, 10)
    # searched 0, 10, 20, 30 and 40, none of which is a usable step size
    "model__learning_rate": [0.1, 0.2, 0.3, 0.4],
    #     "model__subsample": [],
    # explicit values instead of np.arange with a float step, whose number of
    # elements is not numerically stable and could overshoot the intended range
    "model__colsample_bytree": [0.7, 0.8, 0.9],
    "model__max_depth": np.arange(4, 7),
}
# fmt: on
pipeline_cv = GridSearchCV(pipeline, grid, cv=2, verbose=2, n_jobs=-1)
pipeline_cv.fit(X_train, y_train)

best_params = pipeline_cv.best_params_
best_params

In [None]:
# Scores of the refit best estimator on each split
train_score = pipeline_cv.score(X_train, y_train)
test_score = pipeline_cv.score(X_test, y_test)

for split_label, split_score in (
    ("train_score", train_score),
    ("test_score", test_score),
):
    print(f"{split_label} {split_score}")

---
## XGB feature importance

In [None]:
# Importances of the best model found by the grid search, most important first.
# The preprocessing step passes columns through unchanged, so X_train's column
# names are used as the feature labels.
best_xgb_model = pipeline_cv.best_estimator_["model"]
feature_importances = pd.DataFrame(
    {
        "feature": X_train.columns,
        "importance": best_xgb_model.feature_importances_,
    }
).sort_values("importance", ascending=False)
feature_importances

---
## XGB predictions

In [None]:
# predicted class labels for the test set
y_preds = pipeline_cv.predict(X_test)
preds_df, fig = predictions_df(X_test, y_test, y_preds)

# confusion matrix
cm = confusion_matrix(y_test, y_preds)
display(cm)

# classification report
class_report = classification_report(y_test, y_preds)
print(class_report)

# prediction probabilities (column 1 is the second class)
pred_prob = pipeline_cv.predict_proba(X_test)

# add prediction probs to preds_df
# predictions_df() returns its rows sorted by abs_residuals, not in X_test order,
# so the probabilities are wrapped in a Series keyed by X_test's index and
# aligned by label; assigning the bare array would pair each probability with
# the wrong row.
preds_df["pred_prob"] = pd.Series(pred_prob[:, 1], index=X_test.index)

# classification target, residuals not needed
preds_df = preds_df.drop(columns=["residuals", "abs_residuals"])
# preds_df.head()

In [None]:
# Calibration curve of the XGBoost model, 10 bins
positive_prob = pred_prob[:, 1]
prob_true, prob_pred = calibration_curve(y_test, positive_prob, n_bins=10)
plt.plot(prob_pred, prob_true, linestyle="-", marker="o")
plt.show()

In [None]:
# # changing the prediction percentage threshold to 45%
# adj_preds.loc[adj_preds.pred_prob > 0.49, "y_preds"] = 1

# # classification report with new threshold
# print(classification_report(adj_preds.y_true, adj_preds.y_preds))

---
## XGB error analysis

> False negatives

In [None]:
# dataframe for false negatives (true 1, predicted 0)
# sorted by prediction probability descending
is_false_negative = (preds_df["y_true"] == 1) & (preds_df["y_preds"] == 0)
f_negs = preds_df.loc[is_false_negative].sort_values("pred_prob", ascending=False)
f_negs

In [None]:
f_negs.mean()

> prediction probability distribution

In [None]:
preds_df.pred_prob.hist()

> The predictive value of the XGBoost model is limited

---
---
# Logistic Regression Model

In [None]:
# drop columns for testing sets: the target, the unit-converted duplicates,
# bp_diff, and weight (smoke, active, alcohol, bmi and height are kept)
drop_cols = ["disease", "yrs", "height_ft", "bp_diff", "weight_lbs", "weight"]

# train test split of data
y = df["disease"]
X = df.drop(columns=drop_cols)

# NOTE(review): this split is stratified on gender, not on the target, and uses
# an 80/20 split where the XGBoost section uses 50/50 stratified on disease —
# confirm both differences are intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=28,
    stratify=df["gender"],
)

In [None]:
print_vif(X_train)

In [None]:
# categorical columns to be encoded
cat_cols = ["cholesterol", "glucose"]

# numeric columns to be standardised (weight is excluded from the features)
num_cols = ["age", "height", "bp_hi", "bp_lo"]

# level to drop per categorical column; only used by the OneHotEncoder alternative
drop_cat = [0, 0]

# Alternatives tried previously:
#   ("encode_cats", OneHotEncoder(drop=drop_cat), cat_cols)
#   ("scaler", MinMaxScaler(), num_cols)
encode_step = ("encode_cats", LeaveOneOutEncoder(), cat_cols)
scale_step = ("scaler", StandardScaler(), num_cols)

# Columns not listed in either step are passed through unchanged
preprocessing = ColumnTransformer(
    transformers=[encode_step, scale_step],
    remainder="passthrough",
)

In [None]:
# Unpenalised logistic regression on the preprocessed features
lr_model = LogisticRegression(
    solver="lbfgs",
    penalty="none",
    max_iter=1000,
    random_state=28,
)
lr_pipeline = Pipeline(steps=[("processing", preprocessing), ("model", lr_model)])
lr_pipeline.fit(X_train, y_train)

# Score returned by the pipeline on each split
lr_train_score = lr_pipeline.score(X_train, y_train)
lr_test_score = lr_pipeline.score(X_test, y_test)

print(f"train score: {lr_train_score}")
print(f"test score: {lr_test_score}")

---
## LR hyperparameter tuning

In [None]:
# Compare an L2-penalised model against an unpenalised one, with a single C value
lr_grid = dict(
    model__solver=["lbfgs"],
    model__penalty=["l2", "none"],
    model__C=[0.75],
)

lr_pipeline_cv = GridSearchCV(
    estimator=lr_pipeline,
    param_grid=lr_grid,
    cv=5,
    verbose=1,
    n_jobs=2,
)
lr_pipeline_cv.fit(X_train, y_train)

lr_best_params = lr_pipeline_cv.best_params_
lr_best_params

In [None]:
# Scores of the refit best logistic regression on each split
lr_train_score = lr_pipeline_cv.score(X_train, y_train)
lr_test_score = lr_pipeline_cv.score(X_test, y_test)

for split_label, split_score in (
    ("train_score", lr_train_score),
    ("test_score", lr_test_score),
):
    print(f"{split_label} {split_score}")

In [None]:
# Class probabilities for the test set; column 1 is used for the calibration curve
lr_pred_prob = lr_pipeline_cv.predict_proba(X_test)
lr_positive_prob = lr_pred_prob[:, 1]

# Calibration curve of the logistic regression model, 10 bins
lr_prob_true, lr_prob_pred = calibration_curve(y_test, lr_positive_prob, n_bins=10)
plt.plot(lr_prob_pred, lr_prob_true, linestyle="-", marker="o")
plt.show()

In [None]:
import plotly.graph_objects as go

In [None]:
# Interactive version of the calibration curve, using lr_prob_pred / lr_prob_true
# from the calibration cell above
calibration_trace = go.Scatter(
    x=lr_prob_pred,
    y=lr_prob_true,
    mode="lines",
    name="Calibration Curve",
)
fig = go.Figure(data=[calibration_trace])

fig.show()

---
## LR predictions

In [None]:
# predicted class labels for the test set
lr_preds = lr_pipeline_cv.predict(X_test)

# df created from predictions
lr_preds_df, _ = predictions_df(X_test, y_test, lr_preds)

# add prediction probs to preds_df
# predictions_df() returns its rows sorted by abs_residuals, not in X_test order,
# so the probabilities are wrapped in a Series keyed by X_test's index and
# aligned by label; assigning the bare array would pair each probability with
# the wrong row.
lr_preds_df["pred_prob"] = pd.Series(lr_pred_prob[:, 1], index=X_test.index)

# classification target, residuals not needed
lr_preds_df = lr_preds_df.drop(columns=["residuals", "abs_residuals"])

# confusion matrix
lr_cm = confusion_matrix(y_test, lr_preds)
display(lr_cm)

# classification report
print(classification_report(y_test, lr_preds))

---
## LR error analysis

In [None]:
# dataframe for false negatives (true 1, predicted 0)
# sorted by prediction probability descending
lr_is_false_negative = (lr_preds_df["y_true"] == 1) & (lr_preds_df["y_preds"] == 0)
lr_f_negs = lr_preds_df.loc[lr_is_false_negative].sort_values(
    "pred_prob", ascending=False
)

In [None]:
sns.distplot(lr_preds_df.pred_prob)
plt.show()

> The overall classification ability of the logistic regression model isn't quite as good as that of the XGBoost model; however, the predictive range of the logistic regression model is more versatile and has better predictive validity.

In [None]:
X_train.mean()

In [None]:
lr_preds_df[lr_preds_df.y_true == 1].mean()

In [None]:
preds_df[preds_df.y_true == 1].mean()

In [None]:
lr_preds_df.head()