# 1. Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import warnings
warnings.simplefilter(action='ignore', category = FutureWarning)

sns.set_style("darkgrid")

In [None]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb
import xgboost as xgb

# 2. General Infos About Data

In [None]:
# Load the heart-disease CSV from the relative input directory and display it.
df = pd.read_csv("../input/heart-disease-uci/heart.csv")
df

In [None]:
# Show the rows that DataFrame.duplicated() flags as repeats of an earlier row.
df[df.duplicated()]

In [None]:
# Report the number of rows before and after dropping exact duplicate rows.
print(f"Total Observations: {len(df)}")

df = df.drop_duplicates()

print(f"Total Observations After Removing Duplicates: {len(df)}")

In [None]:
# Column dtypes and non-null counts of the deduplicated frame.
df.info()

In [None]:
# Summary statistics of the columns.
df.describe()

Our categorical features were already encoded, either with a label encoder or as ordinal features. I will map some of the codes back to their meanings and then re-encode those features with a one-hot encoder.


## 2.1 Features

**age**: age in years

**sex**: sex (1 = male; 0 = female)

**cp**: chest pain type
    Value 1: typical angina
    Value 2: atypical angina
    Value 3: non-anginal pain
    Value 4: asymptomatic
    
**trestbps**: resting blood pressure (in mm Hg on admission to the hospital)

**chol**: serum cholesterol in mg/dl

**fbs**: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

**restecg**: resting electrocardiographic results
    
    Value 0: normal
    Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
    
**thalach**: maximum heart rate achieved

**exang**: exercise induced angina (1 = yes; 0 = no)

**oldpeak** = ST depression induced by exercise relative to rest

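**slope**: the slope of the peak exercise ST segment (the values listed here follow the UCI documentation; the remapping cell below treats the codes in this CSV as 0 = downsloping, 1 = flat, 2 = upsloping)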
    
    Value 1: upsloping
    Value 2: flat
    Value 3: downsloping
    
**ca**: number of major vessels (0-3) colored by fluoroscopy

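**thal**: 3 = normal; 6 = fixed defect; 7 = reversible defect (these are the codes from the UCI documentation; in this CSV the column is coded 0-3, and the remapping cell below assigns the labels used in this notebook)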

https://archive.ics.uci.edu/ml/datasets/Heart+Disease

In [None]:
# Replace the integer codes of the two nominal features with readable labels so
# that they can be one-hot encoded later.
# Plain column assignment is used instead of `df.loc[:, col] = ...`: the `.loc`
# form tries to write the strings into the existing integer column in place,
# which newer pandas versions deprecate as setting an incompatible dtype.
# NOTE: `.map` turns every code that is missing from the dictionary into NaN,
# so this cell must run exactly once on a freshly loaded `df`.
df["slope"] = df["slope"].map({0: "downsloping", 1: "flat", 2: "upsloping"})
# "fixed_defect" (previously misspelled "fixed_effect") matches the feature list above.
df["thal"] = df["thal"].map({1: "fixed_defect", 2: "normal", 3: "reversable_defect", 0: "else"})

# 3. Target Distribution

In [None]:
# Pie chart of the class balance.
# value_counts() orders the classes by frequency, not by value, so the wedge
# labels are read from its index; hard-coded ["0", "1"] labels end up on the
# wrong wedges whenever class 1 is the more frequent class.
target_counts = df["target"].value_counts()

fig, ax = plt.subplots(figsize = (8, 8))
ax.pie(target_counts, labels = target_counts.index.astype(str), autopct = '%1.2f%%', startangle = 180)
ax.set_title("target")
plt.show()

The target distribution is 54.3% - 45.7%, so the classes are balanced. We therefore don't have to use stratification for cross validation or for splitting the data, and we don't need to apply any resampling to the data.

In [None]:
# Feature groups used by the EDA loops and the scaling cell below.
cat_cols = [
    "sex", "cp", "fbs", "restecg",
    "exang", "slope", "ca", "thal",
]
num_cols = [
    "age", "trestbps", "chol",
    "thalach", "oldpeak",
]

# 4. EDA

## 4.1 Categorical Features

In [None]:
def count_percentage(df, col, hue):
    """Plot counts and within-category percentages of `hue` for a categorical column.

    The left panel is a count plot of `col` split by `hue`. The right panel
    shows, for every category of `col`, the share of each `hue` value (computed
    with a normalised value count per category) and annotates each bar with its
    percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both `col` and `hue`.
    col : str
        Name of the categorical column placed on the x axis.
    hue : str
        Name of the column used to split the bars.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(22, 6))
    order = sorted(df[col].unique())

    # `x` is passed by keyword: newer seaborn releases accept only `data`
    # positionally, so a positional column name collides with `data = df`.
    count_ax = sns.countplot(x = col, data = df, hue = hue, ax = ax1, order = order)
    count_ax.set_title("Counts For Feature:\n" + col)

    # Share of every hue value inside each category of `col`.
    df_temp = (
        df.groupby(col)[hue]
        .value_counts(normalize = True)
        .rename("percentage")
        .reset_index()
    )

    pct_ax = sns.barplot(x = col, y = "percentage", hue = hue, data = df_temp, ax = ax2, order = order)
    pct_ax.set_ylim(0, 1)

    # Smaller annotations when there are many categories on the axis.
    fontsize = 14 if len(order) <= 10 else 10
    for p in pct_ax.patches:

        txt = "{:.1f}".format(p.get_height() * 100) + "%"
        txt_x = p.get_x()
        txt_y = p.get_height()
        # Place the label slightly to the right of the bar's left edge, just above its top.
        pct_ax.text(txt_x + 0.125, txt_y + 0.02, txt, fontsize = fontsize)

    ax2.set_title("Percentages For Feature: \n" + col)

In [None]:
# Draw the count / percentage figure of every categorical feature against the target.
for col in cat_cols:
    count_percentage(df, col, "target")

### 4.1.1 Takeaways - Categorical Features

**sex**: Really effective feature. Male's target value is 75%, female's target value is 45%

**cp**: If a person doesn't have asymptomatic chest pain (encoded as 0), target value is at least 70%

**fbs**: This feature looks ineffective. The target rates are 55% and 51% for its two values.

**restecg**: Target values are 46%, 63% and 25%. This variable could be useful

**exang**: Really effective feature. 69% - 23%

**slope**: 43, 35 and 75 percent. It could be also useful.

**thal**: 50-33-78-24. It could be also useful.

## 4.2 Numerical Features

### 4.2.1 Numerical vs Target

In [None]:
def feature_dist_clas(df, col, hue):
    """Compare the distribution of a numerical column across the values of `hue`.

    Draws four panels side by side for `col`, each split by `hue`: a histogram,
    a filled KDE plot, a box plot and a violin plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both `col` and `hue`.
    col : str
        Name of the numerical column to plot.
    hue : str
        Name of the column whose values split every panel.
    """
    fig, axes = plt.subplots(1, 4, figsize = (25, 5))

    # A single empty-string category on the x axis puts the hue groups of the
    # box and violin plots next to each other.
    single_group = [""] * len(df)

    sns.histplot(x = col, hue = hue, data = df, ax = axes[0])
    sns.kdeplot(x = col, hue = hue, data = df, fill = True, ax = axes[1])
    sns.boxplot(y = col, hue = hue, data = df, x = single_group, ax = axes[2])
    sns.violinplot(y = col, hue = hue, data = df, x = single_group, ax = axes[3])

    fig.suptitle("For Feature:  " + col)
    axes[0].set_title("Histogram For Feature " + col)
    axes[1].set_title("KDE Plot For Feature " + col)
    axes[2].set_title("Boxplot For Feature " + col)
    axes[3].set_title("Violinplot For Feature " + col)

In [None]:
# Draw the four-panel comparison of every numerical feature against the target.
for col in num_cols:
    feature_dist_clas(df, col, "target")

In [None]:
def feature_distribution(df, col):
    """Summarise the distribution of one numerical column in three panels.

    Draws a filled KDE plot (titled with the column's skewness and kurtosis,
    both rounded to three decimals), a box plot and a probability plot of
    `df[col]` side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing `col`.
    col : str
        Name of the numerical column to plot.
    """
    skew_txt = str(np.round(df[col].skew(), 3))
    kurt_txt = str(np.round(df[col].kurtosis(), 3))

    fig, (kde_ax, box_ax, prob_ax) = plt.subplots(1, 3, figsize = (18, 6))
    fig.suptitle("For Feature:  " + col)

    sns.kdeplot(data = df, x = col, fill = True, ax = kde_ax, color = "orangered")
    kde_ax.set_title("Distribution \nSkewness: " + skew_txt + "\nKurtosis: " + kurt_txt)

    sns.boxplot(data = df, y = col, ax = box_ax, color = "orangered")
    box_ax.set_title("Boxplot")

    # The title is set after probplot has drawn on the axes, so it is applied last.
    stats.probplot(df[col], plot = prob_ax)
    prob_ax.set_title("Probability Plot")

In [None]:
# Draw the KDE / box / probability-plot summary of every numerical feature.
for col in num_cols:
    feature_distribution(df, col)

### 4.2.2 Takeaways - Numerical Features

age, thalach and oldpeak would be useful variables.

trestbps, chol and oldpeak have a few outliers. thalach has just a single outlier.

Our numerical variables are roughly normally distributed, except oldpeak. Also, the outliers in chol hurt this feature's normality.

In [None]:
def heatmap(df):
    """Draw an annotated correlation heatmap of the numeric columns of `df`.

    Non-numeric columns (such as label-mapped string features) are left out
    before the correlation matrix is computed, because `DataFrame.corr()` on
    string columns raises on pandas >= 2.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose numeric and boolean columns are correlated pairwise.
    """
    fig, ax = plt.subplots(figsize = (15, 15))

    numeric_df = df.select_dtypes(include = ["number", "bool"])

    # Draw on the axes created above instead of relying on the implicit current axes.
    sns.heatmap(numeric_df.corr(), cmap = "coolwarm", annot = True, fmt = ".2f", annot_kws = {"fontsize": 9},
                vmin = -1, vmax = 1, square = True, linewidths = 0.8, cbar = False, ax = ax)
    
# Plot the correlation heatmap of the current data frame.
heatmap(df)

# 5. Preprocessing

One-hot encoding for two features,

Splitting the data, 

Defining cross validations, 

Scaling data with using Standard Scaler

In [None]:
# One-hot encode the two label-mapped features (first level dropped) and swap
# the original columns for the resulting indicator columns.
encode_cols = ["slope", "thal"]

dummies = pd.get_dummies(df[encode_cols], drop_first = True)

fin = pd.concat([df, dummies], axis = "columns").drop(columns = encode_cols)
fin

We have 302 observations, which means we have a small dataset. A 25-75 or 30-70 test-train proportion is ideal.

In [None]:
# Separate the label column from the predictors and hold out 25% of the rows
# for testing (fixed seed for reproducibility).
target = "target"
predictors = [name for name in fin.columns if name != target]

X_train, X_test, y_train, y_test = train_test_split(
    fin[predictors], fin[target], test_size = 0.25, random_state = 42
)

# Shuffled K-fold splitters with 3, 5 and 10 folds, all sharing the same seed.
cv3, cv5, cv10 = (
    KFold(n_splits = n_folds, shuffle = True, random_state = 42)
    for n_folds in (3, 5, 10)
)

def cv_model(model, X = X_train, y = y_train, cv = cv5):
    """Return the mean cross-validated accuracy of `model`.

    Parameters
    ----------
    model : estimator
        Estimator handed to `cross_val_score`.
    X, y : features and labels
        Data to cross-validate on. The defaults are the `X_train` / `y_train`
        objects that exist when this function is defined.
    cv : cross-validation splitter
        Splitting strategy; defaults to the 5-fold splitter `cv5`.

    Returns
    -------
    The mean of the per-fold accuracy scores.
    """
    fold_scores = cross_val_score(model, X, y, cv = cv, scoring = "accuracy", n_jobs = -1)
    return fold_scores.mean()

In [None]:
# Standardise the numerical features with a single scaler that is fitted on the
# training split only and then applied to the test split. StandardScaler works
# column by column, so fitting it once on all numerical columns gives the same
# scaling as one scaler per column, while leaving one fitted `scaler` that can
# be reused on new data.
# The frames are modified in place (not replaced by copies) so that anything
# already holding a reference to `X_train` / `X_test` sees the scaled values.
scaler = StandardScaler()

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

I remove only the variable **fbs**. It is a binary categorical feature whose two values have almost the same target rate (55% - 51%), and it has the lowest correlation with the target, as we can see from the heatmap.

In [None]:
# Second feature set: the scaled splits without the `fbs` column.
X_train2 = X_train.drop(columns = "fbs")
X_test2 = X_test.drop(columns = "fbs")

# 6. Models

I just use basic classification algorithms with their default parameters.

I also use Voting Classifier to construct ensemble models with choosing a couple of them. I won't tune hyperparameters.

In [None]:
# Base estimators, all with default hyper-parameters apart from the seed,
# parallelism and the settings needed for soft voting / quiet XGBoost output.
logreg = LogisticRegression(random_state = 42)
svc = SVC(random_state=42, probability = True)  # probability=True enables predict_proba for soft voting
gnb = GaussianNB()
rfc = RandomForestClassifier(random_state = 42)
knnc = KNeighborsClassifier(n_jobs = -1)
lgbc = lgb.LGBMClassifier(random_state = 42, n_jobs = -1)
dtc = DecisionTreeClassifier(random_state = 42)
xgbc = xgb.XGBClassifier(random_state = 42, n_jobs = -1, use_label_encoder = False, eval_metric = "logloss")

# Soft-voting ensembles built from different subsets of the base estimators.
vc_logreg_svc_rfc_knn = VotingClassifier([("logreg", logreg), ("svc", svc), ("rfc", rfc), ("knn", knnc)],
                                         voting = "soft")
vc_logreg_svc_knn = VotingClassifier([("logreg", logreg), ("svc", svc), ("knn", knnc)],
                                         voting = "soft")
vc_logreg_svc = VotingClassifier([("logreg", logreg), ("svc", svc)],
                                         voting = "soft")
vc_all = VotingClassifier([("logreg", logreg), ("svc", svc), ("gnb", gnb), ( "rfc", rfc), ("knn", knnc),
                           ("lgb", lgbc), ("dtc", dtc), ("xgb", xgbc)],
                          voting = "soft")

# One dictionary per metric, keyed by the display name of the model.
train_accuracy = {}
test_accuracy = {}
cv_score3 = {}
cv_score5 = {}
cv_score10 = {}

models = {
    "LogisticRegression": logreg,
    "SupportVectorMachine": svc,
    "GaussianNaiveBayes": gnb,
    "RandomForest": rfc,
    "KNN": knnc,
    "LightGBM": lgbc,
    "DecisionTree": dtc,
    "XGBoost": xgbc,
    "VotingClassifier (All Models)": vc_all,
    "VotingClassifier (Logreg-SVC)": vc_logreg_svc,
    "VotingClassifier (Logreg-SVC-KNN)": vc_logreg_svc_knn,
    "VotingClassifier (Logreg-SVC-RFC-KNN)": vc_logreg_svc_rfc_knn
}

for name, model in models.items():

    model.fit(X_train2, y_train)
    train_preds = model.predict(X_train2)
    test_preds = model.predict(X_test2)

    # accuracy_score takes (y_true, y_pred) in that order. The built-in round()
    # is used instead of the `.round()` method so the code works whether the
    # installed scikit-learn returns a NumPy scalar or a plain Python float.
    train_accuracy[name] = round(accuracy_score(y_train, train_preds), 4)
    test_accuracy[name] = round(accuracy_score(y_test, test_preds), 4)
    cv_score3[name] = round(cv_model(model, X_train2, y_train, cv = cv3), 4)
    cv_score5[name] = round(cv_model(model, X_train2, y_train, cv = cv5), 4)
    cv_score10[name] = round(cv_model(model, X_train2, y_train, cv = cv10), 4)

# One row per model, one column per metric.
scores = pd.DataFrame([train_accuracy, test_accuracy, cv_score3, cv_score5, cv_score10],
                      index = ["TrainAccuracy", "TestAccuracy", "3FoldCVScore", "5FoldCVScore", "10FoldCVScore"]).T

In [None]:
# Display the metric table (one row per model).
scores

### 6.1. Takeaways - Models

If we look at above table;

Linear classifiers achieve better results, i.e. Logistic Regression and SVM.

Tree-based algorithms have an overfitting problem because we didn't tune their hyperparameters.

Cross validation scores are unstable. Our cv scores are changing between 0.79 - 0.83 but our scores on testing data are 0.88 - 0.90

For individual models, **Logistic Regression** has best cv scores with avg **0.82** accuracy.

Also, **Support Vector Classifier** has best test accuracy with nearly **90%**

For ensemble models,

   If we use all models, the overfitted models hurt the ensemble's performance.
   
   If we build a Voting Classifier from Logistic Regression, SVM, and KNN, we get almost **91% test set accuracy** and an average 83% cv score.