# Assignment 3: Decision Tree and Random Forest on the Heart Dataset

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals
# Common imports
import numpy as np
import os
# to make this notebook's output stable across runs
np.random.seed(42)
# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Font sizes for axis labels and tick labels on every figure in this notebook.
for _rc_key, _rc_size in (
    ('axes.labelsize', 14),
    ('xtick.labelsize', 12),
    ('ytick.labelsize', 12),
):
    plt.rcParams[_rc_key] = _rc_size
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "Assignment3"
def image_path(fig_id):
    """Return the path of ``fig_id`` inside ``<PROJECT_ROOT_DIR>/images/<CHAPTER_ID>``.

    ``fig_id`` is appended as given; no file extension is added here.
    """
    chapter_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    return os.path.join(chapter_dir, fig_id)
def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as ``<fig_id>.png`` in the chapter image folder.

    Parameters
    ----------
    fig_id : str
        Base name of the output file, without the ``.png`` extension.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is called before the figure is written.
    """
    path = image_path(fig_id + ".png")
    # plt.savefig does not create missing folders, so make sure the target
    # directory exists before writing (isdir/makedirs keeps Python 2 support).
    target_dir = os.path.dirname(path)
    if target_dir and not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)
    # Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

In [2]:
import pandas as pd
# Folder (relative to the working directory) that holds the assignment CSV files.
DATA_PATH = os.path.join("datasets", "data_assignments")
def load_data(data_path=DATA_PATH):
    """Read ``Heart.csv`` from ``data_path`` and return it as a pandas DataFrame."""
    heart_csv = os.path.join(data_path, "Heart.csv")
    frame = pd.read_csv(heart_csv)
    return frame
# Load the heart dataset from the default DATA_PATH and preview its first rows.
data = load_data()
data.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [3]:
# Fill missing "Ca" values with the column median. The result is assigned back
# explicitly: calling fillna(..., inplace=True) on the data["Ca"] selection is
# a chained in-place update, which warns on recent pandas and leaves `data`
# untouched when Copy-on-Write is enabled.
median = data["Ca"].median()
data["Ca"] = data["Ca"].fillna(median)
# Rows with no "Thal" value are dropped rather than imputed.
data = data.dropna(subset=["Thal"])

In [4]:
# Reset NumPy's global random seed (the same seed is set again right before the split).
np.random.seed(42)

In [5]:
import numpy as np
def split_train_test(data, test_ratio):
    """Randomly partition ``data`` into a training part and a test part.

    Row positions are shuffled with NumPy's global random generator, so call
    ``np.random.seed`` beforehand for a reproducible split.

    Parameters
    ----------
    data : object supporting ``len()`` and ``.iloc`` (e.g. a pandas DataFrame)
        The rows to split.
    test_ratio : float
        Fraction of rows assigned to the test part; the test size is
        ``int(len(data) * test_ratio)``, i.e. truncated.

    Returns
    -------
    tuple
        ``(train_part, test_part)``, in that order.
    """
    n_rows = len(data)
    n_test = int(n_rows * test_ratio)
    order = np.random.permutation(n_rows)
    # The first n_test shuffled positions form the test part, the rest train.
    train_part = data.iloc[order[n_test:]]
    test_part = data.iloc[order[:n_test]]
    return train_part, test_part

In [6]:
# Seed immediately before splitting so the shuffle inside split_train_test is
# reproducible; a quarter of the rows is held out as the test set.
np.random.seed(42)
train_set, test_set = split_train_test(data, test_ratio=0.25)

In [7]:
# Training features: the columns at positions 1..13, with categorical columns
# one-hot encoded by get_dummies.
# NOTE(review): going by the head() output shown after loading, positions 1..13
# appear to include the "Unnamed: 0" row id and leave out "Thal" - confirm this
# selection is intended.
X = np.array(pd.get_dummies(train_set.iloc[:, 1:14]))
# Binary target: 1 where AHD == "Yes", else 0. The builtin int replaces
# np.int, which was only an alias for it and was removed in NumPy 1.24.
y = np.array((train_set.AHD == "Yes").astype(int))

In [8]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
# Fit a depth-2 decision tree on the training matrix, then predict on that
# same matrix.
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42).fit(X, y)
y_res = tree_clf.predict(X)

In [9]:
from sklearn.metrics import accuracy_score
# Accuracy of the current predictions in y_res against the training labels y
# (with the cells run in order, y_res holds the decision tree's predictions on X).
accuracy_score(y,y_res)

0.8008849557522124

In [10]:
from sklearn.metrics import accuracy_score
# Encode the test features and align them to the training design matrix.
# get_dummies run separately on each split produces different columns whenever
# a category is missing from one of them, which would misalign (or break) the
# inputs of the fitted model; reindexing to the training columns prevents that.
train_columns = pd.get_dummies(train_set.iloc[:, 1:14]).columns
X_new = np.array(
    pd.get_dummies(test_set.iloc[:, 1:14]).reindex(columns=train_columns, fill_value=0)
)
# The builtin int replaces np.int, which was removed in NumPy 1.24.
y_new = np.array((test_set.AHD == "Yes").astype(int))
# Decision-tree predictions and accuracy on the held-out rows.
y_res = tree_clf.predict(X_new)
accuracy_score(y_new, y_res)

0.6933333333333334

In [11]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 500 trees, each capped at 16 leaf nodes, fitted on the
# training matrix.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
)
rnd_clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [12]:
# Random-forest predictions on the training features (overwrites the earlier y_res).
y_res=rnd_clf.predict(X)

In [13]:
from sklearn.metrics import accuracy_score
# Accuracy of the current predictions in y_res against the training labels y
# (with the cells run in order, y_res holds the random forest's predictions on X).
accuracy_score(y,y_res)

0.9469026548672567

In [14]:
from sklearn.metrics import accuracy_score
# Random-forest predictions on the test features, scored against the test labels.
y_res = rnd_clf.predict(X_new)
accuracy_score(y_new, y_res)

0.8