In [25]:
import pandas as pd
import numpy as np
import acquire as a
import prepare as p
import model_functions as m

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings('ignore')


## Prep for modeling

In [26]:
# import data and run it through the prep helpers (innermost call runs first):
# get_heart -> rename_cols -> replace_cat_values -> df_classification_ready
# NOTE(review): 'sex_female' is presumably the redundant dummy column to drop
# (the output keeps sex_male only) - confirm in model_functions.
df = m.df_classification_ready(
    p.replace_cat_values(p.rename_cols(a.get_heart())),
    'sex_female',
)
df.head()

Unnamed: 0,age,resting_bp,cholesterol,fasting_blood_sugar>120,max_heart_rate,exercise_induced_angina,oldpeak,num_major_blood_vessels,high_risk_of_mi,sex_male,...,chest_pain_type_typical angina,rest_ecg_ST-T wave abnormal,rest_ecg_left ventricular hypertrophy,rest_ecg_normal,st_slope_downsloping,st_slope_flat,st_slope_unsloping,defect_type_fixed_defect,defect_type_normal,defect_type_reversible
0,63,145,233,1,150,0,2.3,0,1,1,...,0,0,0,1,0,0,1,1,0,0
1,37,130,250,0,187,0,3.5,0,1,1,...,0,1,0,0,0,0,1,0,1,0
2,41,130,204,0,172,0,1.4,0,1,0,...,1,0,0,1,1,0,0,0,1,0
3,56,120,236,0,178,0,0.8,0,1,1,...,1,1,0,0,1,0,0,0,1,0
4,57,120,354,0,163,1,0.6,0,1,0,...,0,1,0,0,1,0,0,0,1,0


In [27]:
# split into train, validate and test sets, passing the target column name
train, val, test = p.split_data(df, 'high_risk_of_mi')
# show the shape of each split as a sanity check
tuple(part.shape for part in (train, val, test))

((201, 23), (51, 23), (49, 23))

In [28]:
# separate the target column from the remaining columns in each split
X_Train, y_Train, X_val, y_val, X_test, y_test = m.isolate_target(
    train, val, test, 'high_risk_of_mi'
)
# show the shape of every resulting piece as a sanity check
tuple(
    part.shape
    for part in (X_Train, y_Train, X_val, y_val, X_test, y_test)
)

((201, 22), (201,), (51, 22), (51,), (49, 22), (49,))

Data is ready for modeling.

Preparation steps: created dummy variables, dropped duplicates, split the data into train, validate, and test sets, and isolated the target variable in each split.

# Perform modeling

#### Decision Tree

In [29]:
# get mode baseline: count each class in the training target.
# Always predicting the most frequent class gives the accuracy the models below
# have to beat (in the recorded output: class 1 with 109 of 201 rows, about 54%).
y_Train.value_counts()


1    109
0     92
Name: high_risk_of_mi, dtype: int64

In [30]:
#function to run multiple decision trees to compare for best accuracy
def get_decision_tree_multiple(X_Train, y_Train, X_val, y_val,
                               max_depths=range(2, 10),
                               min_samples_leafs=range(1, 10),
                               top_n=10, random_state=123):
    """Fit a grid of decision trees and rank them by validate accuracy.

    One DecisionTreeClassifier is fit on the train split for every
    (min_samples_leaf, max_depth) pair and scored on train and validate.

    Parameters
    ----------
    X_Train, y_Train
        Features and target used to fit every tree (also scored in-sample).
    X_val, y_val
        Features and target used for the out-of-sample score.
    max_depths : iterable of int, default range(2, 10)
        Values tried for ``max_depth`` (inner loop).
    min_samples_leafs : iterable of int, default range(1, 10)
        Values tried for ``min_samples_leaf`` (outer loop).
    top_n : int, default 10
        Number of rows kept after sorting.
    random_state : default 123
        Seed passed to every tree.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows sorted by ``validate_accuracy`` (descending) with
        columns ``min_samples_per_leaf``, ``max_depth``, ``train_accuracy``,
        ``validate_accuracy`` and ``difference`` (train minus validate).
        The index is each model's position in the grid, so with the default
        grid a row label identifies one specific hyperparameter pair.
    """
    # materialise the grids so a one-shot iterator can be reused by the inner loop
    leaf_grid = list(min_samples_leafs)
    depth_grid = list(max_depths)

    metrics = []
    for j in leaf_grid:
        for i in depth_grid:
            clf = DecisionTreeClassifier(max_depth=i, min_samples_leaf=j, random_state=random_state)
            clf = clf.fit(X_Train, y_Train)

            metrics.append({
                "min_samples_per_leaf": j,
                "max_depth": i,
                "train_accuracy": clf.score(X_Train, y_Train),
                "validate_accuracy": clf.score(X_val, y_val),
            })

    # explicit columns keep the frame well-formed even when the grid is empty
    df1 = pd.DataFrame(
        metrics,
        columns=["min_samples_per_leaf", "max_depth", "train_accuracy", "validate_accuracy"],
    )
    # positive difference = better on train than on validate
    df1["difference"] = df1["train_accuracy"] - df1["validate_accuracy"]

    # models that tie on validate_accuracy come back in whatever order the
    # default pandas sort leaves them, not necessarily in grid order
    return df1.sort_values(by=['validate_accuracy'], ascending=False).head(top_n)


In [31]:
# fit the decision tree grid and show the top rows by validate accuracy
dec_tree_results = get_decision_tree_multiple(
    X_Train=X_Train, y_Train=y_Train, X_val=X_val, y_val=y_val
)
dec_tree_results

Unnamed: 0,min_samples_per_leaf,max_depth,train_accuracy,validate_accuracy,difference
0,1,2,0.791045,0.862745,-0.0717
27,4,5,0.875622,0.862745,0.012877
24,4,2,0.791045,0.862745,-0.0717
40,6,2,0.791045,0.862745,-0.0717
50,7,4,0.850746,0.862745,-0.011999
51,7,5,0.850746,0.862745,-0.011999
19,3,5,0.905473,0.862745,0.042728
52,7,6,0.850746,0.862745,-0.011999
53,7,7,0.850746,0.862745,-0.011999
16,3,2,0.791045,0.862745,-0.0717


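Best-performing Decision Tree model (the ten rows above all tie at 86.3% validate accuracy; this one has a small positive train–validate gap of about 0.013):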

4 min_samples_per_leaf
5 max_depth

In [32]:
# pull the chosen model into a one-column frame; the row label 27 is hand-picked
# from the results table and must be revisited if the grid or the data changes
best_decision_tree = dec_tree_results.loc[27].to_frame()
best_decision_tree

Unnamed: 0,27
min_samples_per_leaf,4.0
max_depth,5.0
train_accuracy,0.875622
validate_accuracy,0.862745
difference,0.012877


#### Random Forest

In [33]:
#function to run multiple random forest to compare for best accuracy
def get_random_forest_multiple(X_Train, y_Train, X_val, y_val,
                               max_depths=range(2, 10),
                               min_samples_leafs=range(1, 10),
                               top_n=10, random_state=123):
    """Fit a grid of random forests and rank them by validate accuracy.

    One RandomForestClassifier is fit on the train split for every
    (min_samples_leaf, max_depth) pair and scored on train and validate.

    Parameters
    ----------
    X_Train, y_Train
        Features and target used to fit every forest (also scored in-sample).
    X_val, y_val
        Features and target used for the out-of-sample score.
    max_depths : iterable of int, default range(2, 10)
        Values tried for ``max_depth`` (inner loop).
    min_samples_leafs : iterable of int, default range(1, 10)
        Values tried for ``min_samples_leaf`` (outer loop).
    top_n : int, default 10
        Number of rows kept after sorting.
    random_state : default 123
        Seed passed to every forest.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows sorted by ``validate_accuracy`` (descending) with
        columns ``min_samples_per_leaf``, ``max_depth``, ``train_accuracy``,
        ``validate_accuracy`` and ``difference`` (train minus validate).
        The index is each model's position in the grid, so with the default
        grid a row label identifies one specific hyperparameter pair.
    """
    # materialise the grids so a one-shot iterator can be reused by the inner loop
    leaf_grid = list(min_samples_leafs)
    depth_grid = list(max_depths)

    metrics = []
    for j in leaf_grid:
        for i in depth_grid:
            rf = RandomForestClassifier(max_depth=i, min_samples_leaf=j, random_state=random_state)
            rf = rf.fit(X_Train, y_Train)

            metrics.append({
                "min_samples_per_leaf": j,
                "max_depth": i,
                "train_accuracy": rf.score(X_Train, y_Train),
                "validate_accuracy": rf.score(X_val, y_val),
            })

    # explicit columns keep the frame well-formed even when the grid is empty
    df1 = pd.DataFrame(
        metrics,
        columns=["min_samples_per_leaf", "max_depth", "train_accuracy", "validate_accuracy"],
    )
    # positive difference = better on train than on validate
    df1["difference"] = df1["train_accuracy"] - df1["validate_accuracy"]

    # models that tie on validate_accuracy come back in whatever order the
    # default pandas sort leaves them, not necessarily in grid order
    return df1.sort_values(by=['validate_accuracy'], ascending=False).head(top_n)



In [34]:
# fit the random forest grid and show the top rows by validate accuracy
rand_forest_results = get_random_forest_multiple(
    X_Train=X_Train, y_Train=y_Train, X_val=X_val, y_val=y_val
)
rand_forest_results

Unnamed: 0,min_samples_per_leaf,max_depth,train_accuracy,validate_accuracy,difference
21,3,7,0.925373,0.941176,-0.015803
20,3,6,0.930348,0.941176,-0.010828
22,3,8,0.930348,0.941176,-0.010828
23,3,9,0.940299,0.941176,-0.000878
54,7,8,0.845771,0.941176,-0.095405
25,4,3,0.865672,0.941176,-0.075505
26,4,4,0.890547,0.941176,-0.050629
53,7,7,0.845771,0.941176,-0.095405
52,7,6,0.850746,0.941176,-0.09043
51,7,5,0.870647,0.941176,-0.07053


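Best-performing Random Forest model (the ten rows above all tie at 94.1% validate accuracy; this one has the smallest train–validate gap, about -0.001):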

3 min_samples_per_leaf
9 max_depth

In [35]:
# pull the chosen model into a one-column frame; the row label 23 is hand-picked
# from the results table and must be revisited if the grid or the data changes
best_random_forest = rand_forest_results.loc[23].to_frame()
best_random_forest

Unnamed: 0,23
min_samples_per_leaf,3.0
max_depth,9.0
train_accuracy,0.940299
validate_accuracy,0.941176
difference,-0.000878


#### KNN

In [36]:
#function to run multiple KNN models to compare for best accuracy
def get_knn(X_Train, y_Train, X_val, y_val,
            neighbors=range(2, 10), weights='uniform', top_n=10):
    """Fit one KNN classifier per neighbor count and rank by validate accuracy.

    Parameters
    ----------
    X_Train, y_Train
        Features and target used to fit every model (also scored in-sample).
    X_val, y_val
        Features and target used for the out-of-sample score.
    neighbors : iterable of int, default range(2, 10)
        Values tried for ``n_neighbors``.
    weights : default 'uniform'
        Passed unchanged to ``KNeighborsClassifier(weights=...)``.
    top_n : int, default 10
        Number of rows kept after sorting.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows sorted by ``validate_accuracy`` (descending) with
        columns ``neighbors``, ``train_accuracy``, ``validate_accuracy`` and
        ``difference`` (train minus validate). The index is each model's
        position in ``neighbors``.
    """
    # NOTE(review): features are passed to KNN exactly as received; distance-based
    # models are sensitive to feature scale - confirm whether scaling is done upstream.
    metrics = []
    for i in neighbors:
        knn = KNeighborsClassifier(n_neighbors=i, weights=weights)
        knn = knn.fit(X_Train, y_Train)

        metrics.append({
            "neighbors": i,
            "train_accuracy": knn.score(X_Train, y_Train),
            "validate_accuracy": knn.score(X_val, y_val),
        })

    # explicit columns keep the frame well-formed even when no neighbor counts are given
    df1 = pd.DataFrame(metrics, columns=["neighbors", "train_accuracy", "validate_accuracy"])
    # positive difference = better on train than on validate
    df1["difference"] = df1["train_accuracy"] - df1["validate_accuracy"]

    return df1.sort_values(by=['validate_accuracy'], ascending=False).head(top_n)



In [37]:
# fit the KNN models and show them ranked by validate accuracy
knn_results = get_knn(
    X_Train=X_Train, y_Train=y_Train, X_val=X_val, y_val=y_val
)
knn_results

Unnamed: 0,neighbors,train_accuracy,validate_accuracy,difference
3,5,0.756219,0.72549,0.030729
5,7,0.721393,0.705882,0.015511
4,6,0.736318,0.686275,0.050044
2,4,0.741294,0.666667,0.074627
6,8,0.716418,0.666667,0.049751
7,9,0.731343,0.666667,0.064677
0,2,0.781095,0.647059,0.134036
1,3,0.746269,0.647059,0.09921


Best-performing KNN model (highest validate accuracy, 72.5%):

5 neighbors

In [39]:
# pull the chosen model into a one-column frame; the row label 3 is hand-picked
# from the results table and must be revisited if the grid or the data changes
best_knn = knn_results.loc[3].to_frame()
best_knn

Unnamed: 0,3
neighbors,5.0
train_accuracy,0.756219
validate_accuracy,0.72549
difference,0.030729


# Modeling Summary

The best-performing model overall was the Random Forest (min_samples_leaf = 3, max_depth = 9), with train and validate accuracy both at about 94%.