In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Location of the heart-disease CSV, relative to the notebook's working directory.
heart_csv_path = '../input/heart-disease-uci/heart.csv'
# Load the whole file into the DataFrame used by every later cell.
df = pd.read_csv(heart_csv_path)

In [None]:
# Preview the first rows to check that the file loaded with the expected columns.
df.head()

# Actual meanings of the columns

age

sex 

chest pain type (4 values) 

resting blood pressure 

serum cholesterol in mg/dl

fasting blood sugar > 120 mg/dl

resting electrocardiographic results (values 0,1,2)

maximum heart rate achieved 

exercise induced angina 

oldpeak = ST depression induced by exercise relative to rest 

the slope of the peak exercise ST segment 

number of major vessels (0-3) colored by fluoroscopy

thal: 3 = normal; 6 = fixed defect; 7 = reversible defect

In [None]:
# Summary statistics for each numeric column.
df.describe()

In [None]:
# Column dtypes and non-null counts; a quick check for missing values.
df.info()

In [None]:
# Pairwise correlations between all columns of the frame (label included).
corr_matrix = df.corr()
# Render the correlation matrix as a colour-coded grid.
sns.heatmap(corr_matrix)

In [None]:
# Number of rows for each distinct value of `cp`
# (presumably the chest-pain-type column; confirm against the dataset docs).
df['cp'].value_counts()

In [None]:
# Show the first rows again before splitting the frame into features and target.
df.head()

In [None]:
# Predictors: every column except the label. The label is dropped by name rather
# than by position, so the split stays correct even if `target` is not the last
# column of the CSV.
features = df.drop(columns='target')
# Label column that the classifiers below are trained to predict.
target = df['target']

In [None]:
# Check the dtype of every predictor column before passing them to scikit-learn.
features.dtypes

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Hold out 33% of the rows as a test set; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.33,
    random_state=42,
)

### DecisionTreeClassifier

In [None]:
from sklearn import tree
from sklearn.metrics import f1_score

# Untuned baseline: an entropy-criterion decision tree fitted on the training split.
# `fit` returns the estimator itself, so construction and fitting can be chained.
clf_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    random_state=42,
).fit(X_train, y_train)

# Predict the held-out rows and report the F1 score as the cell output.
tree_y_pred = clf_tree.predict(X_test)
f1_score(y_test, tree_y_pred)

In [None]:
# Tune the decision tree with an exhaustive grid search.
#
# The search wraps a *fresh* DecisionTreeClassifier instead of the existing
# `clf_tree` object: this cell rebinds `clf_tree` to the GridSearchCV, so
# wrapping `clf_tree` itself would nest one search inside another if the cell
# were run a second time (and the grid keys below are tree parameters, not
# GridSearchCV parameters).
clf_tree = GridSearchCV(
    tree.DecisionTreeClassifier(random_state=42),
    {
        'criterion': ['entropy', 'gini'],
        'max_depth': [2, 5, 10, 15],
        'min_samples_split': [2, 3, 5, 7],
        'max_features': [2, 4, 6, 8],
    },
    return_train_score=False,
)

# Fit on the training split only, so X_test / y_test stay unseen during model
# selection and the cross-validated score is obtained the same way as for the
# random-forest search further down.
clf_tree.fit(X_train, y_train)
clf_tree.cv_results_

In [None]:
# Turn the grid-search result dict into a DataFrame for easier inspection.
data = pd.DataFrame(clf_tree.cv_results_)

In [None]:
# Preview the first rows of the grid-search results table.
data.head()

In [None]:
# Keep one column per searched hyperparameter plus the mean cross-validation score.
# `param_criterion` is included because `criterion` is part of the grid; without it,
# rows that differ only by criterion cannot be told apart in this table.
final_data = data[[
    'param_criterion',
    'param_max_depth',
    'param_max_features',
    'param_min_samples_split',
    'mean_test_score',
]]

In [None]:
# Preview the trimmed results table.
final_data.head()

In [None]:
# The five parameter combinations with the highest mean cross-validation score.
final_data.nlargest(5, 'mean_test_score')

In [None]:
# Hyperparameter combination that the decision-tree grid search ranked first
clf_tree.best_params_

In [None]:
# Mean cross-validated score of the best decision-tree combination.
# No `scoring` was passed to GridSearchCV, so this is the estimator's default
# score, not the F1 score reported for the held-out test split above.
clf_tree.best_score_

### Logistic Regression Classifier

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression model with a fixed seed; the solver's iteration cap is set to 500.
clf_lr = LogisticRegression(random_state=42, max_iter=500)

In [None]:
# Train in place: `fit` returns the same estimator, so `clf_lr` needs no rebinding.
clf_lr.fit(X_train, y_train)

# Predict the held-out rows and report the F1 score as the cell output.
lr_y_pred = clf_lr.predict(X_test)
f1_score(y_test, lr_y_pred)

# Logistic Regression achieved an F1 score of about 82% on the test split (best so far)

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default hyperparameters (only the seed is fixed),
# built and fitted on the training split in one expression.
clf_rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and report the F1 score as the cell output.
rf_y_pred = clf_rf.predict(X_test)
f1_score(y_test, rf_y_pred)

# Random Forest achieved an F1 score of about 86% on the test split without any hyperparameter tuning (best of the three models)

In [None]:
# Hyperparameter values to try for the random forest; every combination is evaluated.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 4],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 3],
}

# 3-fold grid search around a seeded random forest. The search is not fitted here.
clf_rfc = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    return_train_score=False,
)

In [None]:
# Run the random-forest grid search on the training split only.
clf_rfc.fit(X_train, y_train)

In [None]:
# Raw per-combination results of the random-forest search.
clf_rfc.cv_results_

In [None]:
# Turn the random-forest search results into a DataFrame for easier inspection.
rfc_df = pd.DataFrame(clf_rfc.cv_results_)

In [None]:
# Preview the first rows of the random-forest results table.
rfc_df.head()

In [None]:
# Hyperparameter combination that the random-forest grid search ranked first.
clf_rfc.best_params_

In [None]:
# Mean cross-validated score of the best random-forest combination.
# No `scoring` was passed to GridSearchCV, so this is the estimator's default
# score, not the F1 score reported for the held-out test split above.
clf_rfc.best_score_