# Load data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
# Define `stats.chisqprob` as the chi-square survival function.
# NOTE(review): presumably a compatibility shim for statsmodels versions that
# still call the removed `scipy.stats.chisqprob` -- confirm it is still needed.
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
import warnings
# Silence every warning for the rest of the session; note that this also hides
# deprecation and convergence warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Read the heart-disease CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../input/heart-disease-uci/heart.csv')
data.head(n=5)

**Let's explore each column**
1. 'age' - age in years
1. 'sex' - sex (1 = male; 0 = female)
1. 'cp' - chest pain type (0,1,2,3)
1. 'trestbps' - resting blood pressure (in mm Hg on admission to the hospital)
1. 'chol' - serum cholesterol in mg/dl
1. 'fbs' - fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
1. 'restecg' - resting electrocardiographic results
1. 'thalach' - maximum heart rate achieved
1. 'exang' - exercise induced angina (1 = yes; 0 = no)
1. 'oldpeak' - ST depression induced by exercise relative to rest
1. 'slope' - the slope of the peak exercise ST segment
1. 'ca' - number of major vessels (0-3) colored by fluoroscopy
1. 'thal' - 3 = normal; 6 = fixed defect; 7 = reversible defect
1. 'target' - (Yes = 1, No = 0)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# Cast the integer-coded categorical columns to strings so that they are no
# longer stored as numeric values, then print the resulting column dtypes.
for coded_col in ('sex', 'cp', 'fbs', 'exang', 'ca'):
    data[coded_col] = data[coded_col].astype(str)

data.info()

# Explore the data

In [None]:
def plot_box(data, cols, col_x='target'):
    """Draw one box plot per column in ``cols``, grouped by ``col_x``.

    Parameters
    ----------
    data : DataFrame
        Table handed to seaborn; it must contain ``col_x`` and every name in
        ``cols``.
    cols : iterable of str
        Column names; each one is drawn on the y axis of its own figure.
    col_x : str, default 'target'
        Column drawn on the x axis of every figure.

    Returns
    -------
    None
        Each figure is displayed with ``plt.show()``.
    """
    # The style setting is global, so applying it once before the loop suffices.
    sns.set_style("whitegrid")
    for col in cols:
        # x and y must be passed by keyword: seaborn >= 0.12 no longer accepts
        # them as positional arguments.
        sns.boxplot(x=col_x, y=col, data=data)
        plt.xlabel(col_x)  # label of the x axis
        plt.ylabel(col)  # label of the y axis
        plt.show()
        
# Every feature column is plotted against the default x column ('target').
num_cols = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
]
plot_box(data, cols=num_cols)

From the box plots, the columns 'fbs' and 'restecg' do not seem to have an effect on the target,
so I will drop these two columns.

In [None]:
# Based on the box plots, drop the columns 'fbs' and 'restecg'.
# DataFrame.drop returns a new frame instead of modifying `data`, so the result
# has to be assigned back for the columns to actually disappear.
# errors='ignore' keeps the cell safe to re-run once the columns are gone.
data = data.drop(columns=['fbs', 'restecg'], errors='ignore')
data.head()

In [None]:
# Check for missing values: number of nulls in each column.
data.isnull().sum()

# Train the model.

**Import Train Test Split**

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the label. Selecting by name rather than by
# position ensures that no feature column sitting next to 'target' is dropped
# by accident.
X = data.drop(columns=['target'])
y = data['target']
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


**Train Model**

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# The features are not scaled, so the default cap of 100 solver iterations may
# stop before convergence; a higher cap gives the solver room to finish.
model_1 = LogisticRegression(max_iter=1000)
# The tree-based models are randomised; a fixed seed makes the scores below
# (and the model choice that follows from them) reproducible across runs.
model_2 = tree.DecisionTreeClassifier(max_depth=4, criterion='entropy', random_state=0)
model_3 = GaussianNB()
model_4 = RandomForestClassifier(n_estimators=100, max_depth=4, criterion='gini', random_state=0)

# Candidate models to compare
models = [model_1, model_2, model_3, model_4]

# Estimate accuracy with 5-fold cross-validation on the training split only,
# and report the mean score over the folds for each model.
for model in models:
    cvs = cross_val_score(model, X_train, y_train, cv=5)
    print('{0} score is {1}'.format(model,cvs.mean()))



From the scores above, model 1 and model 4 have higher accuracy than the others, so I select these two models.

In [None]:
# Check test accuracy

# Fit each selected model on the full training split and score it on the
# held-out test split.
for model in [model_1, model_4]:
    model.fit(X_train, y_train)  # fit() trains the estimator in place
    y_test_pred = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_test_pred)
    matrix = confusion_matrix(y_test, y_test_pred)
    print('{0} accuracy is {1}'.format(model, test_acc))
    # Show the confusion matrix as well; otherwise it is overwritten on the
    # next iteration without ever being looked at.
    print(matrix)
    

The test accuracies show no difference, but model 1 has a higher training (cross-validation) accuracy than model 4, so model 1 (logistic regression) is the more appropriate model for this data.