Data description
There are 3 types of input features:

Objective: factual information;
Examination: results of medical examination;
Subjective: information given by the patient.
Features:

Age | Objective Feature | age | int (days) |
Height | Objective Feature | height | int (cm) |
Weight | Objective Feature | weight | float (kg) |
Gender | Objective Feature | gender | categorical code |
Systolic blood pressure | Examination Feature | ap_hi | int |
Diastolic blood pressure | Examination Feature | ap_lo | int |
Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal |
Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal |
Smoking | Subjective Feature | smoke | binary |
Alcohol intake | Subjective Feature | alco | binary |
Physical activity | Subjective Feature | active | binary |
Presence or absence of cardiovascular disease | Target Variable | cardio | binary |
All of the dataset values were collected at the moment of medical examination.

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
from sklearn.model_selection import train_test_split,cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, \
    roc_auc_score, confusion_matrix, classification_report, plot_roc_curve

In [None]:
df = pd.read_csv("../input/cardiovascular-disease-dataset/cardio_train.csv", sep = ";")

In [None]:
# Show every column and every row when a frame is displayed, and render
# floats with three decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [None]:
df.head()

First look at the data

In [None]:
def check_df(dataframe, head=5):
    """Print a quick structural overview of ``dataframe``.

    Prints, in order: the shape, the column dtypes, the first and last
    ``head`` rows, the per-column null counts and selected quantiles of the
    numeric columns.

    Args:
        dataframe: pandas DataFrame to summarise.
        head: number of rows to show from the top and from the bottom.

    Returns:
        None. Everything is written to stdout.
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Tail #####################")
    print(dataframe.tail(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    # numeric_only=True keeps the summary usable when the frame also holds
    # object/string columns, for which quantiles cannot be computed.
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1], numeric_only=True).T)

In [None]:
check_df(df)

In [None]:
df = df.drop('id', axis=1)
df.head()

In [None]:
# Express age in whole years instead of days
df["age"] = (df["age"] / 365).round()

In [None]:
df.head()

**DATA PREP & EDA**

Adjust thresholds and check the outliers

In [None]:
def outlier_thresholds(dataframe, col_name, q1=0.10, q3=0.90):
    """Return the ``(low_limit, up_limit)`` outlier fences for one column.

    The fences sit 1.5 times the inter-quantile range below the ``q1``
    quantile and above the ``q3`` quantile of ``dataframe[col_name]``.

    Args:
        dataframe: pandas DataFrame holding the column.
        col_name: name of the column to analyse.
        q1: lower quantile used as the base of the range.
        q3: upper quantile used as the base of the range.

    Returns:
        Tuple ``(low_limit, up_limit)``.
    """
    column = dataframe[col_name]
    lower_q = column.quantile(q1)
    upper_q = column.quantile(q3)
    margin = 1.5 * (upper_q - lower_q)
    return lower_q - margin, upper_q + margin

In [None]:
def check_outlier(dataframe, col_name, q1=0.10, q3=0.90):
    """Report whether ``dataframe[col_name]`` has values outside the fences.

    The fences come from ``outlier_thresholds(dataframe, col_name, q1, q3)``.

    Args:
        dataframe: pandas DataFrame holding the column.
        col_name: name of the column to check.
        q1: lower quantile forwarded to ``outlier_thresholds``.
        q3: upper quantile forwarded to ``outlier_thresholds``.

    Returns:
        True if at least one value lies below the low limit or above the
        up limit, otherwise False.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name, q1, q3)
    values = dataframe[col_name]
    outside = (values > up_limit) | (values < low_limit)
    # Test the mask itself rather than the truthiness of the selected rows:
    # an outlying row whose cells are all zero must still count as an outlier.
    return bool(outside.any())

The helper below separates the variables in the data set into categorical and numerical columns.

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical and numerical groups.

    A column is treated as:
      * categorical when its dtype is object, or when it is non-object with
        fewer than ``cat_th`` distinct values;
      * categorical-but-cardinal when its dtype is object and it has more
        than ``car_th`` distinct values (these are excluded from the
        categorical list);
      * numerical when it is non-object and has at least ``cat_th`` distinct
        values.

    A short summary of the counts is printed.

    Args:
        dataframe: pandas DataFrame to inspect.
        cat_th: distinct-value threshold below which a non-object column
            counts as categorical.
        car_th: distinct-value threshold above which an object column
            counts as cardinal.

    Returns:
        Tuple ``(cat_cols, num_cols, cat_but_car)`` of column-name lists.
    """
    object_cols = []
    num_but_cat = []
    cat_but_car = []
    num_cols = []

    # Single pass over the columns, sorting each one into its group.
    for name in dataframe.columns:
        series = dataframe[name]
        distinct = series.nunique()
        if series.dtypes == "O":
            object_cols.append(name)
            if distinct > car_th:
                cat_but_car.append(name)
        elif distinct < cat_th:
            num_but_cat.append(name)
        else:
            num_cols.append(name)

    cat_cols = [name for name in object_cols + num_but_cat
                if name not in cat_but_car]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car

Which features have outliers?

In [None]:
cat_cols, num_cols, cat_but_car = grab_col_names(df)

for col in num_cols:
    print(col, check_outlier(df, col))

Replacing outliers with upper and lower limit

In [None]:
def replace_with_thresholds(dataframe, variable):
    """Clamp the outliers of one column to its outlier fences, in place.

    Values below the low limit returned by ``outlier_thresholds`` are set to
    that low limit; values above the up limit are set to the up limit.

    Args:
        dataframe: pandas DataFrame to modify.
        variable: name of the column to clamp.

    Returns:
        None. ``dataframe`` is modified in place.
    """
    lower_bound, upper_bound = outlier_thresholds(dataframe, variable)
    too_low = dataframe[variable] < lower_bound
    dataframe.loc[too_low, variable] = lower_bound
    too_high = dataframe[variable] > upper_bound
    dataframe.loc[too_high, variable] = upper_bound

In [None]:
for col in num_cols:
    replace_with_thresholds(df, col)

In [None]:
for col in num_cols:
    print(col, check_outlier(df, col))

In [None]:
df.isnull().values.any()

In [None]:
df.head()

**Feature Engineering**

In [None]:
# Bucket age (in years) into coarse groups. The bounds are contiguous so
# every age, including exactly 18, receives a label instead of NaN.
df.loc[(df["age"] < 18), "NEW_AGE"] = "Young"
df.loc[(df["age"] >= 18) & (df["age"] < 56), "NEW_AGE"] = "Mature"
df.loc[(df["age"] >= 56), "NEW_AGE"] = "Old"

In [None]:
cols1 = df["weight"]
cols2 = df["height"] / 100

In [None]:
df["bmi"] = (cols1) / (cols2)**2

In [None]:
df.head()

In [None]:
# Bucket BMI into categories. Half-open intervals are used so that values
# such as 24.995 or 29.995 fall into a bucket instead of being left as NaN.
df.loc[(df["bmi"] < 18.5), "NEW_BMI"] = "under"
df.loc[(df["bmi"] >= 18.5) & (df["bmi"] < 25), "NEW_BMI"] = "healthy"
df.loc[(df["bmi"] >= 25) & (df["bmi"] < 30), "NEW_BMI"] = "over"
df.loc[(df["bmi"] >= 30), "NEW_BMI"] = "obese"

In [None]:
df["ap_hi"].max()

In [None]:
# Flag a row as "hyper" when either reading reaches its threshold
# (systolic ap_hi >= 140 or diastolic ap_lo >= 90); everything else is
# "normal". Both conditions are combined in one mask so that the diastolic
# rule is not overwritten by a later systolic assignment.
df["BLOOD_PRESSURE"] = "normal"
df.loc[(df["ap_hi"] >= 140) | (df["ap_lo"] >= 90), "BLOOD_PRESSURE"] = "hyper"

In [None]:
df.head()

In [None]:
df.groupby('age')['cardio'].mean()

In [None]:
df.groupby("smoke")["cardio"].mean()

In [None]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler

Using the label encoder for data types object and unique values equal to 2

In [None]:
# label encoder
def label_encoder(dataframe, binary_col):
    """Replace one column with integer label codes, in place.

    The column is cast to ``str`` and passed through a fresh
    ``LabelEncoder``; the encoded values overwrite the original column.

    Args:
        dataframe: pandas DataFrame to modify.
        binary_col: name of the column to encode.

    Returns:
        The same ``dataframe`` object, with ``binary_col`` encoded.
    """
    as_text = dataframe[binary_col].astype(str)
    dataframe[binary_col] = LabelEncoder().fit_transform(as_text)
    return dataframe

binary_cols = [col for col in df.columns if df[col].dtypes == "O"
               and len(df[col].unique()) == 2]

# Encode only the string-valued (object dtype) columns. Running the encoder
# over every column would cast the numeric features to strings and replace
# them with lexicographically ordered label codes, destroying their values.
# All object columns are encoded, not just the binary ones, so that no
# string-valued column is left in the frame.
object_cols = [col for col in df.columns if df[col].dtypes == "O"]

for col in object_cols:
    label_encoder(df, col)

Using one-hot-encoder for categorical cols

In [None]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """Return a copy of ``dataframe`` with the given columns one-hot encoded.

    Thin wrapper around ``pd.get_dummies``; the input frame is not modified.

    Args:
        dataframe: pandas DataFrame to encode.
        categorical_cols: list of column names to expand into dummy columns.
        drop_first: forwarded to ``pd.get_dummies``; when True the first
            level of each encoded column is dropped.

    Returns:
        A new DataFrame containing the dummy columns.
    """
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)

# Columns with more than 2 and at most 10 distinct values are one-hot encoded.
ohe_cols = [col for col in df.columns if 10 >= len(df[col].unique()) > 2]

# one_hot_encoder returns a new frame, so the result must be assigned back;
# otherwise the encoding is only displayed and never reaches the model.
df = one_hot_encoder(df, ohe_cols, drop_first=True)
df.head()

Scale the data,

I used RobustScaler because it is robust to outliers.

In [None]:
# Scale the columns listed in num_cols with RobustScaler.
# NOTE(review): the scaler is fitted on the whole frame, before the
# train/test split further down, so test rows influence the scaling
# statistics — consider fitting on the training split only.
rs = RobustScaler()
df[num_cols] = rs.fit_transform(df[num_cols])

Define the target variable and the features,
then split the data into train and test sets. The test size is 20% and the train size is 80%.

In [None]:
y = df["cardio"]
X = df.drop(["cardio"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20, random_state=1)

Build the model and train

In [None]:
log_model = LogisticRegression().fit(X_train, y_train)

Evaluate the success of the model with train set

In [None]:
# Train Accuracy
y_pred = log_model.predict(X_train)
accuracy_score(y_train, y_pred)

In [None]:
y_prob = log_model.predict_proba(X_test)[:, 1]

y_pred = log_model.predict(X_test)

In [None]:
def plot_confusion_matrix(y, y_pred):
    """Draw the confusion matrix of ``y`` against ``y_pred`` as a heatmap.

    The plot title shows the accuracy score rounded to two decimals.

    Args:
        y: true labels.
        y_pred: predicted labels.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    matrix = confusion_matrix(y, y_pred)
    accuracy = round(accuracy_score(y, y_pred), 2)
    sns.heatmap(matrix, annot=True, fmt=".0f")
    plt.xlabel('y_pred')
    plt.ylabel('y')
    plt.title(f'Accuracy Score: {accuracy}', size=10)
    plt.show()

plot_confusion_matrix(y_test, y_pred)

Evaluate the model accuracy with test set

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
precision_score(y_test, y_pred)

In [None]:
recall_score(y_test, y_pred)

In [None]:
f1_score(y_test, y_pred)

In [None]:
# Plot the ROC curve of the fitted model on the test split, with the
# diagonal (dashed red) as the chance-level reference line.
# NOTE(review): plot_roc_curve is no longer available in recent scikit-learn
# releases — confirm the installed version, or switch to
# RocCurveDisplay.from_estimator(log_model, X_test, y_test).
plot_roc_curve(log_model, X_test, y_test)
plt.title('ROC Curve')
plt.plot([0, 1], [0, 1], 'r--')
plt.show()

In [None]:
roc_auc_score(y_test, y_prob)

In [None]:
print(classification_report(y_test, y_pred))