In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

from catboost import CatBoostClassifier

%matplotlib inline

In [None]:
# Load the heart-disease CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/heart-disease-uci/heart.csv")

In [None]:
# Preview the first rows to sanity-check the parse.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

The dataframe was parsed correctly. All fields have a numeric data type (int64/float64), and there is no missing data. <br>
Next, find out which features are numeric and which are categorical.

In [None]:
# Number of distinct values per column; a low count hints at a categorical feature.
for col, n_unique in df.nunique().items():
    print(f"{col}: {n_unique}")

In [None]:
# Features treated as numeric.
num_cols = ["age", "trestbps", "chol", "thalach", "oldpeak", "ca"]

# Features treated as categorical.
cat_cols = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]

# Full feature list (numeric first, then categorical) and the label column.
X_cols = [*num_cols, *cat_cols]
target_col = "target"

In [None]:
# Feature matrix and label vector used by all models below.
X, y = df[X_cols], df[target_col]

In [None]:
# Check class balance by counting samples per target class: classes are balanced.
df[target_col].value_counts()

# Feature overview

In [None]:
# One 10-bin histogram per numeric feature, laid out on a 2x3 grid.
f, axs = plt.subplots(2, 3, figsize=(12, 4))
for ax, col in zip(axs.ravel(), num_cols):
    df[col].hist(ax=ax, bins=10)
    ax.set_title(col)

f.tight_layout()

In [None]:
# Box plot of every numeric feature split by target class.
f, axs = plt.subplots(2, 3, figsize=(10, 8))
for ax, col in zip(axs.flat, num_cols):
    sns.boxplot(x="target", y=col, data=df, orient='v', ax=ax)

f.tight_layout()

In [None]:
# Pairwise scatter plots of the numeric features, coloured by target class.
# The ordered list is passed through `vars`: wrapping the names in `set(...)`
# made the row/column order depend on string hashing, so the grid layout could
# change between kernel restarts.
sns.pairplot(df, vars=num_cols, hue=target_col);

"ca", "oldpeak" and "thalach" seem to be significant features.

In [None]:
# Count plot of every categorical feature, split by target class.
f, axs = plt.subplots(3, 3, figsize=(10, 8))
flat_axes = axs.reshape(-1)
for ax, col in zip(flat_axes, cat_cols):
    sns.countplot(x=col, data=df, hue=target_col, ax=ax)
# The 3x3 grid has more panels than there are categorical features;
# hide the unused ones so no empty frames are drawn.
for ax in flat_axes[len(cat_cols):]:
    ax.set_visible(False)
f.tight_layout()

"cp" and "thal" seem to be significant features.

In [None]:
# Annotated heatmap of the pairwise correlations, rounded to two decimals.
corr_matrix = df.corr().round(2)
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corr_matrix, annot=True, ax=ax);

The "trestbps", "chol", "fbs" and "restecg" columns seem to be useless for classification: they are not correlated with the target, and the pairplot showed no correlations with other features.

"ca", "oldpeak", "thalach", "cp", "thal" are correlated with target

# Preprocessing

In [None]:
# Categorical columns that will be one-hot encoded.
# The categorical columns "sex", "fbs", "exang" are already encoded, so they are left out.
ohe_cols = ["cp", "restecg", "slope", "thal"]

class OHE_Transformer(BaseEstimator, TransformerMixin):
    """One-hot encode selected DataFrame columns and return a DataFrame.

    Parameters
    ----------
    ohe_cols : list of str
        Names of the columns to one-hot encode.
    """

    def __init__(self, ohe_cols):
        self.ohe_cols = ohe_cols
        # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed
        # the old keyword; try the new name first and fall back for old releases.
        try:
            self.ohe = OneHotEncoder(sparse_output=False)
        except TypeError:
            self.ohe = OneHotEncoder(sparse=False)

    def fit(self, X, y=None):
        """Fit the wrapped encoder on the configured columns of ``X``; ``y`` is ignored."""
        self.ohe.fit(X[self.ohe_cols])
        return self

    def transform(self, X):
        """Replace the configured columns of ``X`` with their dummy columns.

        Returns a new DataFrame: the non-encoded columns first (in the sorted
        order produced by ``Index.difference``), followed by the dummy columns.
        The index of ``X`` is preserved.
        """
        encodedX = self.ohe.transform(X[self.ohe_cols])
        # `get_feature_names` was removed in scikit-learn 1.2 in favour of
        # `get_feature_names_out`; use whichever this installation provides.
        if hasattr(self.ohe, "get_feature_names_out"):
            encoded_cols = self.ohe.get_feature_names_out(self.ohe_cols)
        else:
            encoded_cols = self.ohe.get_feature_names(self.ohe_cols)
        encodedX_df = pd.DataFrame(encodedX, columns=encoded_cols, index=X.index)
        newX = pd.concat([X[X.columns.difference(self.ohe_cols)], encodedX_df], axis=1)
        return newX

class ColumnNameSaver(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that remembers the column names it sees."""

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Store the column names in ``self.cols`` and return ``X`` unchanged.

        Inputs without a ``columns`` attribute get the synthetic names
        ``col_0``, ``col_1``, ... based on ``X.shape[1]``.
        """
        self.cols = (
            X.columns
            if hasattr(X, "columns")
            else [f"col_{idx}" for idx in range(X.shape[1])]
        )
        return X

def ColumnRemoveFunc(X, remove_cols=None):
    if remove_cols is None:
        return X
    
    newX = X[X.columns.difference(remove_cols)]
    return newX
    
def ColumnRemoveTransformer(remove_cols):
    """Build a FunctionTransformer that applies ColumnRemoveFunc with ``remove_cols``."""
    drop_kwargs = dict(remove_cols=remove_cols)
    return FunctionTransformer(ColumnRemoveFunc, kw_args=drop_kwargs)
    
def ShowLogregCoef(pipe):
    """Draw an annotated heatmap of the absolute logistic-regression coefficients.

    ``pipe`` must expose a "cols" step carrying a ``cols`` attribute (the row
    labels) and a fitted "logreg" step carrying ``coef_``. Coefficients are
    rounded to 4 decimals and shown without their sign.
    """
    feature_names = pipe["cols"].cols
    abs_coefs = np.abs(np.round(pipe["logreg"].coef_.reshape(-1), 4))
    coef_frame = pd.DataFrame(abs_coefs, index=feature_names)

    fig, ax = plt.subplots(figsize=(4, 8))
    sns.heatmap(coef_frame, annot=True, ax=ax)

def PrintRocAucScore(model, X, y):
    """Print the per-fold, mean and standard deviation of 10-fold CV ROC AUC."""
    fold_scores = cross_val_score(model, X, y, scoring="roc_auc", cv=10, n_jobs=-1)
    per_fold = ' '.join(fold_scores.round(5).astype('str'))
    print(f"scores: {per_fold}")
    print(f"avg: {fold_scores.mean().round(5)}")
    print(f"std: {fold_scores.std().round(5)}")

def ShowRocCurvePlot(model, X_train, X_test, y_train, y_test):
    """Plot the train and test ROC curves of an already fitted classifier.

    ``model`` must provide ``predict_proba``; the probability of the second
    class (column 1) is used as the score. Each curve's legend entry carries
    its ROC AUC.
    """
    plt.figure(figsize=(10, 7))
    splits = (("train", X_train, y_train), ("test", X_test, y_test))
    for split_name, X_split, y_split in splits:
        proba = model.predict_proba(X_split)[:, 1]
        auc = metrics.roc_auc_score(y_split, proba)
        fpr, tpr, _ = metrics.roc_curve(y_split, proba)
        plt.plot(fpr, tpr, label='{} AUC={:.4f}'.format(split_name, auc))
    # Label the axes so the figure can be read on its own.
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.title("ROC curve")
    plt.legend(fontsize='large', framealpha=1)
    plt.show()


# Cluster analysis

In [None]:
# Preprocessing for clustering: one-hot encode, drop one dummy column per
# encoded feature, record the column names, then standardize.
clustering_pipeline = Pipeline(steps=[
    ("ohe", OHE_Transformer(ohe_cols)),
    ("rmcol", ColumnRemoveTransformer(["cp_1", "restecg_0", "slope_0", "thal_1"])),
    ("cols", ColumnNameSaver()),
    ("scaler", StandardScaler()),
])
# The scaler returns a plain array; wrap it back into a DataFrame using the saved names.
XPreprocessed = pd.DataFrame(
    clustering_pipeline.fit_transform(X),
    columns=clustering_pipeline["cols"].cols,
)

In [None]:
# Ward-linkage hierarchical clustering of the preprocessed features.
Z = linkage(XPreprocessed, method='ward')
fig = plt.figure(figsize=(25, 10))
dn = dendrogram(Z, color_threshold=23)
# Horizontal cut lines at the two distance thresholds used for the flat clusters.
for height, colour, caption in ((35, 'b', "main clusters"), (23, 'r', "outliers")):
    plt.axhline(y=height, linestyle='--', color=colour, label=caption)
plt.legend()
plt.show()

There are 2 small outlier clusters and 2 main clusters

In [None]:
# Flat cluster labels obtained by cutting the linkage tree at distance 35 ...
df["main_cluster"] = fcluster(Z, 35, criterion="distance")
# ... and at the lower distance 23, which is used to look for outlier clusters.
df["outlier_cluster"] = fcluster(Z, 23, criterion="distance")
# Size of each cluster at the lower threshold.
df["outlier_cluster"].value_counts()

Clusters 1 and 4 are outliers. Remove them from the data.

In [None]:
# Drop the rows labelled as outlier clusters 1 and 4, then the helper column itself.
outlier_rows = df.loc[df["outlier_cluster"].isin([1, 4])].index
df.drop(index=outlier_rows, inplace=True)
df.drop(columns="outlier_cluster", inplace=True)
# update X and y
X, y = df[X_cols], df[target_col]

In [None]:
# Mean of every column within each main cluster.
df.groupby("main_cluster").mean()

In [None]:
# Per-cluster means stacked on top of the overall summary statistics, so each
# cluster can be compared against the whole data set in one table.
a = df.groupby("main_cluster").mean()
b = df.describe()
# Reuse the two frames computed above instead of recomputing them inside concat.
clusters_df = pd.concat([a, b], axis=0)
clusters_df

Clusters could be separated by features: cp, exang, oldpeak, slope, ca, thalach, thal. <br>
Cluster1 tends to be target class 0 (target mean = 0.21) <br>
Cluster2 tends to be target class 1 (target mean = 0.77) <br>
Maybe there are more than 2 clusters but we don't have enough data to find them.

In [None]:
# Narrow the comparison table to a subset of features plus the target.
clusters_df[["cp", "exang", "oldpeak", "slope", "ca", "thalach", "thal", "target"]]

# LogisticRegression

In [None]:
# Baseline: one-hot encode, record column names, standardize, then fit an
# L1-penalized logistic regression (saga solver).
logreg_pipeline = Pipeline(steps=[
    ("ohe", OHE_Transformer(ohe_cols)),
    ("cols", ColumnNameSaver()),
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(penalty="l1", solver="saga", max_iter=1000)),
])

logreg_pipeline.fit(X, y)

In [None]:
# Heatmap of the absolute coefficients of the fitted model.
ShowLogregCoef(logreg_pipeline)

First of all, we should remove the redundant columns produced by one-hot encoding (one per encoded categorical feature):
each of them can be computed from the other dummy columns of the same feature, so it carries no additional information.
So we remove "cp_1", "restecg_0", "slope_0", "thal_1" and rerun the model.

In [None]:
# L1 logistic regression with one dummy column per encoded feature dropped
# before the column names are recorded and the data is standardized.
logreg_pipeline = Pipeline(steps=[
    ("ohe", OHE_Transformer(ohe_cols)),
    ("rmcol", ColumnRemoveTransformer(["cp_1", "restecg_0", "slope_0", "thal_1"])),
    ("cols", ColumnNameSaver()),
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(penalty="l1", solver="saga", max_iter=1000)),
])

logreg_pipeline.fit(X, y)
ShowLogregCoef(logreg_pipeline)

There are still several features with low coefficients: "age", "fbs", "slope_2", "thal_2". Let's use SVD to double-check.

In [None]:
# One-hot encode (keeping every dummy column), record names, and standardize.
preprocessing_pipeline = Pipeline(
    [("ohe", OHE_Transformer(ohe_cols)),
     ("cols", ColumnNameSaver()),
     ("scaler", StandardScaler())]
)
XPreprocessed = preprocessing_pipeline.fit_transform(X, y)

# SVD decomposition. Only the singular values S are used below, so the reduced
# form is requested to avoid building the full n_samples x n_samples U matrix;
# the singular values themselves are unaffected. Note that numpy returns the
# third factor already transposed.
U, S, V = np.linalg.svd(XPreprocessed, full_matrices=False)
print(np.round(S, 2))
plt.grid()
plt.axis([0, len(S) + 1, -1, 38])
plt.plot(range(1, len(S) + 1), S, '--o')
plt.axvline(x=14, color='g', linestyle='--', label="little info threshold")
plt.axvline(x=17, color='r', linestyle='--', label="useless threshold")
plt.legend()
plt.show()

SVD analysis confirms that the removed features were totally useless. There is also a small step around N=14, so 3 more features could be removed too. Moreover, we can see that there is one feature which seems to be the most important ("ca"). The optimal number of features is between 13 and 17.

In [None]:
# Encode, drop one dummy column per encoded feature, standardize, project onto
# 14 principal components, then fit the L1 logistic regression. The name saver
# sits after PCA, so it records synthetic component names.
logreg_pipeline = Pipeline(steps=[
    ("ohe", OHE_Transformer(ohe_cols)),
    ("rmcol", ColumnRemoveTransformer(["cp_1", "restecg_0", "slope_0", "thal_1"])),
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=14)),
    ("cols", ColumnNameSaver()),
    ("logreg", LogisticRegression(penalty="l1", solver="saga", max_iter=1000)),
])

logreg_pipeline.fit(X, y)
ShowLogregCoef(logreg_pipeline)

In [None]:
# Testing logreg params
# Settings shared by every penalty type.
shared_grid = {
    "logreg__C": [100, 10, 1, 0.1, 0.01, 0.001],
    "logreg__class_weight": ["balanced", None],
    "pca__n_components": [13, 14, 15, 16, 17],
}

# One sub-grid per penalty; only elasticnet searches over l1_ratio.
params = [
    {**shared_grid, "logreg__penalty": ["l2"], "logreg__l1_ratio": [None]},
    {**shared_grid, "logreg__penalty": ["l1"], "logreg__l1_ratio": [None]},
    {
        **shared_grid,
        "logreg__penalty": ["elasticnet"],
        "logreg__l1_ratio": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    },
]

logreg_grid = GridSearchCV(
    logreg_pipeline,
    params,
    scoring='roc_auc',
    cv=10,
    refit=True,
    n_jobs=-1,
    verbose=4,
)
logreg_grid.fit(X, y)
print(logreg_grid.best_params_)

In [None]:
# Hard-coded parameter set applied to the pipeline before re-scoring it
# (presumably copied from the grid-search output above; confirm after re-running the search).
best_params = {
    'logreg__C': 0.01,
    # Must be the None object. The string 'None' is not a valid class_weight
    # (only a dict, 'balanced' or None are accepted).
    'logreg__class_weight': None,
    'logreg__l1_ratio': None,
    'logreg__penalty': 'l2',
    'pca__n_components': 13}

logreg_pipeline.set_params(**best_params)
PrintRocAucScore(logreg_pipeline, X, y)

In [None]:
# Hold out 20% of the rows (shuffled, fixed seed) as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
# Refit on the training part only, then compare train vs. test ROC curves.
logreg_pipeline.fit(X_train, y_train)
ShowRocCurvePlot(logreg_pipeline, X_train, X_test, y_train, y_test)

In [None]:
# Refit on all rows and show the absolute coefficients of the fitted model.
logreg_pipeline.fit(X, y)
ShowLogregCoef(logreg_pipeline)

The mean ROC AUC score on CV is ~0.91. The weights show that there is 1 main feature; the others don't have much influence on the target.

# Catboost

In [None]:
# Categorical features are not preprocessed here: they are passed to CatBoost
# through `cat_features`, because CatBoost is able to handle them better itself.
# The pipeline has a single step, named "cb".
catboost_pipeline = Pipeline(
    [("cb", CatBoostClassifier(loss_function='Logloss',
                               verbose=False,
                               cat_features=cat_cols,
                               random_seed = 42,
                               eval_metric='AUC'))]
)

# 10-fold cross-validated ROC AUC with the settings given above.
PrintRocAucScore(catboost_pipeline, X, y)

In [None]:
# CatBoost hyper-parameter grid: 3 * 3 * 4 * 3 = 108 combinations.
params = {
    "cb__depth": [4, 6, 8],
    "cb__learning_rate": [0.01, 0.1, 1],
    "cb__l2_leaf_reg": [0.1, 1, 10, 50],
    "cb__iterations": [100, 200, 400],
}

# Exhaustive 5-fold search scored by ROC AUC, using all available cores.
cb_grid = GridSearchCV(catboost_pipeline, params, cv=5, verbose=4, scoring='roc_auc', refit=True, n_jobs=-1)
cb_grid.fit(X, y)
print(cb_grid.best_params_)

In [None]:
# Hard-coded CatBoost settings applied before re-scoring the pipeline.
best_params = dict(
    cb__depth=4,
    cb__iterations=100,
    cb__l2_leaf_reg=10,
    cb__learning_rate=0.1,
)

catboost_pipeline.set_params(**best_params)
PrintRocAucScore(catboost_pipeline, X, y)

Mean ROC AUC score on cv is ~0.91

In [None]:
# Hold out 20% of the rows (shuffled, fixed seed) as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
# Fit CatBoost on the training part only, then compare train vs. test ROC curves.
catboost_pipeline.fit(X_train, y_train)
ShowRocCurvePlot(catboost_pipeline, X_train, X_test, y_train, y_test)

Since we don't have much data (only 303 samples), it is hard to estimate the quality of the models. Depending on the split, we could get underfitting or overfitting ROC curves. The average ROC AUC of both models is about 0.91.