In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout   # In case you want to use dropout, but I don't use it because overfitting is pretty hard
from keras import optimizers

from sklearn.model_selection import train_test_split
from sklearn import metrics

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the '/kaggle/input' tree and echo the full path of every file found,
# so the available input files are visible in the cell output.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Seed NumPy's global RNG so NumPy-driven randomness in this notebook
# (such as the random row preview below) is repeatable on a fresh run.
np.random.seed(0)

HEART_CSV = "/kaggle/input/heart-disease-uci/heart.csv"
data = pd.read_csv(HEART_CSV)

# Preview five randomly chosen rows as a quick sanity check of the load.
data.sample(5)

In [None]:
# Display the dtype of every column in the loaded frame.
data.dtypes

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
data.info()

In [None]:
import seaborn as sns
# Select matplotlib's 'ggplot' style sheet, then apply seaborn's 'dark' axes
# style; both affect every figure drawn afterwards.
plt.style.use('ggplot')
sns.set_style('dark')

In [None]:
# Overlay, per `thal` category, how many rows have target == 1 (green)
# and target == 0 (red).
plt.figure()

# value_counts() sorts by frequency, and that order differs between the whole
# column and each target subset. Align both subsets on one explicit, sorted
# list of categories so every bar sits over its own label; categories missing
# from a subset count as 0 instead of causing a length mismatch.
thal_levels = sorted(data.thal.unique())
thal_target1 = data.thal[data.target == 1].value_counts().reindex(thal_levels, fill_value=0)
thal_target0 = data.thal[data.target == 0].value_counts().reindex(thal_levels, fill_value=0)

ax = sns.barplot(x=thal_levels, y=thal_target1.values, color='g', alpha=0.5)
ax2 = sns.barplot(x=thal_levels, y=thal_target0.values, color='r', alpha=0.5)
ax.set(xlabel='thal', ylabel='count');

In [None]:
# Pairwise scatter plots / histograms of all columns (lower triangle only).
# pairplot creates and returns its own figure grid, so no plt.figure() call is
# made beforehand -- doing so would only leave an extra, empty figure behind.
ax = sns.pairplot(data, corner=True)
plt.show()

In [None]:
# Distribution of `oldpeak` restricted to rows where target == 1.
oldpeak_target1 = data.loc[data.target == 1, 'oldpeak']

plt.figure()
# NOTE(review): distplot is deprecated in newer seaborn releases; consider
# histplot/displot when the environment is upgraded.
ax = sns.distplot(oldpeak_target1, bins=15)

In [None]:
# Distribution of `oldpeak` restricted to rows where target == 0.
oldpeak_target0 = data.loc[data.target == 0, 'oldpeak']

plt.figure()
# NOTE(review): distplot is deprecated in newer seaborn releases; consider
# histplot/displot when the environment is upgraded.
ax = sns.distplot(oldpeak_target0, bins=15)

In [None]:
# Mean of `target` for each `slope` category, drawn on an explicitly created axes.
fig, ax = plt.subplots()
sns.barplot(data=data, x='slope', y='target', ax=ax);

In [None]:
# Mean of `target` for each `ca` category, drawn on an explicitly created axes.
fig, ax = plt.subplots()
sns.barplot(data=data, x='ca', y='target', ax=ax);

In [None]:
# Mean of `target` for each `thal` category, drawn on an explicitly created axes.
fig, ax = plt.subplots()
sns.barplot(data=data, x='thal', y='target', ax=ax);

The step below is highly important for increasing accuracy.
It creates more features/columns for the algorithms to learn from. It does require a data scientist to look through the data and see the correlations, though.

In [None]:
# One-hot encode the categorical columns. Each call gets a prefix so the
# generated columns are uniquely named strings (e.g. 'cp_0', 'thal_2').
# Without a prefix every frame would use bare integer labels (0, 1, 2, ...),
# which repeat across the five frames and mix int and str column names in
# the merged frame -- something recent scikit-learn versions reject.
dummy1 = pd.get_dummies(data.cp, prefix='cp')
dummy2 = pd.get_dummies(data.thal, prefix='thal')
dummy3 = pd.get_dummies(data.restecg, prefix='restecg')
dummy4 = pd.get_dummies(data.slope, prefix='slope')
dummy5 = pd.get_dummies(data.ca, prefix='ca')

# Append the indicator columns to the original frame; the original categorical
# columns are still present here and are expected to be dropped before modelling.
merge = pd.concat([data, dummy1, dummy2, dummy3, dummy4, dummy5], axis=1)   # This turns the categorical data into binary indicator columns, easier for algorithms to understand

In [None]:
# Separate the label from the features. The raw categorical columns are
# removed because their one-hot encoded counterparts are already in `merge`.
encoded_sources = ['cp', 'thal', 'restecg', 'slope', 'ca']
y = merge['target']
X = merge.drop(columns=['target'] + encoded_sources)

# Hold out 20% of the rows as the test set (fixed seed for repeatability).
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Preview the first five rows of the training features.
train_X.head(5)

In [None]:
# Preview the first five rows of the test features.
test_X.head(5)

In [None]:
# Summarise the training features (columns, non-null counts, dtypes).
train_X.info()

In [None]:
# Summarise the test features (columns, non-null counts, dtypes).
test_X.info()

We can now rescale every feature to the [0, 1] range, using the minimum and maximum learned from the training data only, so that the neural network can "understand" it better

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only, then apply the
# same scaling to both splits so no test-set information leaks into the fit.
# Note: after this step train_X / test_X are NumPy arrays, not DataFrames.
feature_scaler = MinMaxScaler().fit(train_X)
train_X = feature_scaler.transform(train_X)
test_X = feature_scaler.transform(test_X)

In [None]:
# Wrap the scaled array in a DataFrame purely to preview its first rows.
pd.DataFrame(train_X).head()

We also have to create our validation data

In [None]:
# Carve a validation set (20% of the training split) out of the scaled
# training data; the seed keeps the split repeatable.
Xtrain, Xval, Ytrain, Yval = train_test_split(
    train_X,
    train_y,
    test_size=0.2,
    random_state=5,
)

First will be the neural network with Keras

In [None]:
# Number of labels in the training portion.
Ytrain.shape

In [None]:
# Number of labels in the validation portion.
Yval.shape

In [None]:
# Fully connected binary classifier: an input-sized linear layer, two ReLU
# hidden layers, and a single sigmoid unit producing a probability.
n_features = train_X.shape[1]

model = Sequential([
    Dense(n_features, input_dim=n_features),
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# Compile with Adam (default settings) and binary cross-entropy, tracking accuracy.
opt = optimizers.Adam()
model.compile(
    optimizer=opt,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Train for 40 epochs, evaluating on the validation split after each epoch.
history = model.fit(
    Xtrain,
    Ytrain,
    validation_data=(Xval, Yval),
    epochs=40,
)

Now we can plot the history of the training

In [None]:
# Plot training and validation accuracy per epoch on a shared axis.
curves = (('accuracy', 'acc'), ('val_accuracy', 'val_acc'))
for history_key, legend_label in curves:
    plt.plot(history.history[history_key], label=legend_label)
plt.ylim((0.6, 1.1))
plt.legend()

We can now get our prediction based on the test data and see its accuracy

In [None]:
# Score the network on the held-out test set.
# model.predict returns one sigmoid probability per row; threshold it once at
# 0.5 to get hard 0/1 labels and flatten to 1-D to match test_y.
probabilities = model.predict(test_X)
prediction = (probabilities > 0.5).astype(int).ravel()

# Accuracy expressed as a percentage.
accuracy_nn = metrics.accuracy_score(test_y, prediction) * 100
print(accuracy_nn)

Now we can try to use LogisticRegression and see how well it works in comparison to the neural network.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Tune the logistic-regression parameter C with a cross-validated grid search
# on the (scaled) training data.
L = LogisticRegression()

parameters = {'C': [.1, .2, .3, .4, .5, 1, 2, 5, 10]}

# Score candidates with classification accuracy, the metric that matches the
# task. For 0/1 labels a negative-MSE score would rank candidates identically
# (it equals accuracy - 1), but accuracy reads directly as a score.
logreg = GridSearchCV(L, parameters, scoring='accuracy')

logreg.fit(train_X, train_y)
logreg.best_params_

In [None]:
# Refit logistic regression with the parameters chosen by the grid search
# (instead of a hard-coded copy that can silently go stale), then evaluate
# on the held-out test set.
model1 = LogisticRegression(**logreg.best_params_)
model1.fit(train_X, train_y)
accuracy1 = model1.score(test_X, test_y)

print('Logistic Regression Accuracy -->',((accuracy1)*100))

In [None]:
# Test-set accuracy of the logistic regression as a percentage, two decimals.
test_accuracy_fraction = model1.score(test_X, test_y)
acc_test_log = round(test_accuracy_fraction * 100, 2)
print(acc_test_log)

In [None]:
# coeff_df = pd.DataFrame(train_X.columns.delete(0))
# coeff_df.columns = ['Feature']
# coeff_df["Correlation"] = pd.Series(logreg_model.coef_[0])

# coeff_df.sort_values(by='Correlation', ascending=False)


### This block no longer works because MinMaxScaler turns train_X and test_X into ndarrays, which have no `.columns`
### It wasn't that important anyway

The RandomForestClassifier is actually quite nice as well, especially on the binary indicator columns that have been created with the pd.get_dummies function.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline: 250 entropy-split trees, log2(n_features) candidate
# features per split. random_state is fixed so that re-running this cell
# reproduces the same forest and the same accuracy.
model6 = RandomForestClassifier(
    criterion='entropy',
    max_features='log2',
    n_estimators=250,
    random_state=0,
)
model6.fit(train_X, train_y)
accuracy6 = model6.score(test_X, test_y)

print('Random Forest Classifier Accuracy -->',((accuracy6)*100))

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes baseline, fitted on the scaled training features
# and scored on the held-out test set.
multiNB = MultinomialNB().fit(train_X, train_y)

accuracy5 = multiNB.score(test_X, test_y)
print('Multinomial NB Accuracy -->', accuracy5 * 100)