In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Read the heart-disease CSV into a DataFrame; the relative path must exist in the runtime environment.
data = pd.read_csv('../input/heart-disease-uci/heart.csv')

In [None]:
# Report the dataset dimensions (DataFrame.shape is ordered rows, columns).
row_count, column_count = data.shape
print("data has %.i columns, %.i rows" % (column_count, row_count))

In [None]:

# Histogram of the label column to eyeball the class balance.
n, bins, patches = plt.hist(data["target"], alpha=0.5, color="b")
plt.grid()
plt.title("Distribution of target")
plt.show()

In [None]:
# Descriptive statistics for the numeric columns of the dataset.
data.describe()

In [None]:
def explain_na(data):
    """Count the missing values in every column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` (in the original column order) with a
        single column ``'# of NA values'`` holding that column's NA count.
    """
    # Vectorised: isna() flags each cell, sum() totals the flags per column.
    return data.isna().sum().to_frame(name='# of NA values')

# Print the per-column missing-value counts for the loaded dataset.
print(explain_na(data))

In [None]:
def distribution_visualization(data):
    """Plot, for every column, overlaid histograms split by the ``target`` class.

    Rows with ``target == 1`` are drawn in green and rows with ``target == 0``
    in red, one panel per column of ``data`` (the ``target`` column included).
    The figure is shown with ``plt.show()``; nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing a ``target`` column with the values 0 and 1.
    """
    columns = list(data.columns)
    n_cols = 3
    # Ceiling division so every column gets its own panel (at least one row),
    # instead of a fixed 5x3 grid that overflows once there are more than 15 columns.
    n_rows = max(1, -(-len(columns) // n_cols))
    # 1.6 in of height per row reproduces the 15x8 figure for a 5-row grid.
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 1.6 * n_rows), squeeze=False)
    ax = axs.ravel()

    target_one = data.loc[data.target == 1]
    target_zero = data.loc[data.target == 0]

    for i, col in enumerate(columns):
        # Labels are attached to the artists themselves so the legend cannot
        # drift out of sync with the drawing order.
        ax[i].hist(target_one[col], color='g', alpha=.5, label='target_one')
        ax[i].hist(target_zero[col], color='r', alpha=.5, label='target_zero')
        ax[i].set_title(str(col))
        ax[i].legend(loc='best', fontsize=8)

    # Hide panels left over when the grid has more cells than columns.
    for unused in ax[len(columns):]:
        unused.set_visible(False)

    plt.tight_layout()
    plt.show()

# Draw the per-class histograms for every column of the loaded dataset.
distribution_visualization(data)

In [None]:

# NOTE(review): repeats the data.describe() summary already shown in an earlier cell.
data.describe()



In [None]:
# Largest value in the age column.
max(data.age)

In [None]:
# List the column names of the dataset.
data.columns


In [None]:
#### START OF LOGISTIC REGRESSION ESTIMATOR #############

# Stand-alone estimator instances.
# NOTE(review): the pipeline cell further down constructs its own StandardScaler / PCA /
# LogisticRegression objects, and nothing in the visible notebook reads these three names.
scaler, pca, logit = StandardScaler(), PCA(), LogisticRegression()



In [None]:
# Features are every column except the last one; the label is the last column.
X, y = data.iloc[:, :-1], data.iloc[:, -1]


In [None]:

# Number of distinct values observed in each feature column, keyed by column name.
categorical_dict = {col: len(np.unique(data[col])) for col in X.columns}

# Heuristic split: fewer than 10 distinct values -> categorical, otherwise numeric.
categorical_features = [
    name for name, n_unique in categorical_dict.items() if n_unique < 10
]
numeric_features = [
    name for name, n_unique in categorical_dict.items() if n_unique >= 10
]



In [None]:
# Sanity check: every feature column must appear in exactly one of the two lists.
# The comparison is against X.columns rather than data.columns, because the label
# column is not a feature and including it would make the check always False.
sorted(categorical_features + numeric_features) == sorted(X.columns)



In [None]:
# Show which columns were classified as categorical.
categorical_features

In [None]:

# Per-type preprocessing: numeric columns are standardised, categorical columns are
# one-hot encoded (categories unseen during fit are encoded as all zeros).
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Route each column list through its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Full prediction pipeline: preprocessing -> PCA -> logistic regression.
pipeline_logit = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('reduce_dim', PCA()),
        ('classifier', LogisticRegression()),
    ]
)

# Stratified hold-out split (33% test) with a fixed seed; reused by the later model cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1, stratify=y
)

# Baseline fit with default hyper-parameters, scored on the held-out split.
pipeline_logit.fit(X_train, y_train)
print("model score: %.3f" % pipeline_logit.score(X_test, y_test))


In [None]:
# Search space for the logistic-regression pipeline.
# PCA rejects an integer n_components larger than min(n_samples, n_features), and the
# width of the one-hot-encoded matrix can differ between CV folds. The component count
# is therefore given as the fraction of variance to retain, which is valid for any width.
param_grid = {
    'reduce_dim__n_components': [0.70, 0.80, 0.90, 0.95, 0.99],
    # Inverse regularisation strength, four values log-spaced from 1e-4 to 1e4.
    'classifier__C': np.logspace(-4, 4, 4),
}






In [None]:
# 5-fold cross-validated grid search on the training split, selecting by accuracy;
# the refit best pipeline then predicts the held-out split.
grid_logit = GridSearchCV(
    estimator=pipeline_logit,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_logit.fit(X_train, y_train)
y_pred = grid_logit.predict(X_test)




In [None]:
# Per-class precision / recall / F1 for the most recent y_pred against the held-out labels.
print(classification_report(y_test, y_pred))


#### END OF LOGISTIC REGRESSION ESTIMATOR #############

In [None]:
#### START OF NEAREST NEIGHBORS ESTIMATOR #############


# k-nearest-neighbours classifier behind the shared preprocessing + PCA front end.
knn = KNeighborsClassifier()
pipe_knn = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA()),
    ("KNN", knn),
])

# Candidate values for the search.
n_neighbors = [2, 3, 5, 10]
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
# PCA component counts from 1 up to the number of raw feature columns.
n_components = list(range(1, X.shape[1] + 1))

# Keys follow the "<step name>__<parameter>" convention of sklearn pipelines.
parametrs = {
    'pca__n_components': n_components,
    'KNN__n_neighbors': n_neighbors,
    'KNN__algorithm': algorithm,
}

# 5-fold grid search on the training split, then predict the held-out split.
knn_grid = GridSearchCV(
    estimator=pipe_knn,
    param_grid=parametrs,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
knn_grid.fit(X_train, y_train)
y_pred = knn_grid.predict(X_test)


In [None]:
# Per-class precision / recall / F1 for the most recent y_pred against the held-out labels.
print(classification_report(y_test, y_pred))

#### END OF NEAREST NEIGHBORS ESTIMATOR #############

In [None]:
#### START OF SVM ESTIMATOR #############
# Support-vector classifier behind the shared preprocessing + PCA front end.
# The step keeps the name 'SVN' so the grid keys and best_params_ keys are unchanged.
pipe_svc = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('pca', PCA()),
        ('SVN', SVC())
    ]
)

# Candidate kernels and regularisation strengths (C = 1, 10, 100, 1000).
kernel = ['rbf', 'linear', 'poly']
parametr_c = [10**i for i in range(0, 4)]

# n_components is the PCA component-count list defined in the nearest-neighbours cell.
parametrs = dict(pca__n_components=n_components,
                 SVN__kernel=kernel,
                 SVN__C=parametr_c)

# 5-fold grid search on the training split, then predict the held-out split.
svm_grid = GridSearchCV(pipe_svc, parametrs, scoring='accuracy', cv=5, n_jobs=-1)
svm_grid.fit(X_train, y_train)
y_pred = svm_grid.predict(X_test)


print(classification_report(y_test, y_pred))

#### END OF SVM ESTIMATOR #############

In [None]:
#### START OF GRADIENT BOOSTING ESTIMATOR #############


# Same preprocessing front end as the other estimators, so PCA sees standardised numeric
# columns and one-hot-encoded categoricals instead of raw values on different scales.
pipe_gbt = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('pca', PCA()),
        # Fixed seed (same value as the train/test split) for repeatable results.
        ('GBT', GradientBoostingClassifier(random_state=1))
    ]
)

# The loss is left at the estimator default (logistic loss). The legacy "deviance"
# spelling is rejected by current scikit-learn releases, while the default works on
# both old and new versions.
parameters = {
    "GBT__learning_rate": [0.01, 0.075, 0.1, 0.2],
    # Floats are interpreted by scikit-learn as fractions of the training samples.
    "GBT__min_samples_split": np.linspace(0.1, 0.5, 4),
    "GBT__min_samples_leaf": np.linspace(0.1, 0.5, 4),
    "GBT__max_depth":[3,5,8],
    "GBT__n_estimators":[10]
    }

# 5-fold grid search on the training split, selecting by accuracy like the other models.
gbt_grid = GridSearchCV(pipe_gbt, parameters, scoring='accuracy', cv=5, n_jobs=-1)
gbt_grid.fit(X_train, y_train)

y_pred = gbt_grid.predict(X_test)

print(classification_report(y_test, y_pred))

#### END OF GRADIENT BOOSTING ESTIMATOR #############

In [None]:
#### START OF NN ESTIMATOR #############

# Feed the network the same scaled / one-hot-encoded features the other models use;
# raw columns on very different scales make gradient-based training unreliable.
# The preprocessor is fitted on the training split only, so no test data leaks in.
X_train_nn = preprocessor.fit_transform(X_train)
X_test_nn = preprocessor.transform(X_test)
# ColumnTransformer can return a scipy sparse matrix; Dense layers need dense arrays.
if hasattr(X_train_nn, "toarray"):
    X_train_nn, X_test_nn = X_train_nn.toarray(), X_test_nn.toarray()

# Small fully connected binary classifier: 12 -> 8 -> 1 (sigmoid output).
# The input width is taken from the encoded matrix rather than hard-coded.
model = Sequential()
model.add(Dense(12, input_dim=X_train_nn.shape[1], activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_nn, y_train.to_numpy(), epochs=300, verbose=0)


# evaluate() returns [loss, accuracy] for the metrics configured in compile().
_, accuracy = model.evaluate(X_test_nn, y_test.to_numpy())
print('Accuracy: %.2f' % (accuracy*100))


#### END OF NN ESTIMATOR #############