# Random Forest Model

In [1]:
# Import our dependencies

import pandas as pd
import numpy as np
import sklearn as skl
import pickle
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report,accuracy_score, recall_score, precision_score
from sklearn.ensemble import RandomForestClassifier



In [2]:
# Load the cleaned heart data CSV from the Data folder into a Pandas DataFrame
df = pd.read_csv("Data/2_clean_data_heart.csv")

# Preview the first and the last five rows
display(df.head(), df.tail())

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
912,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
913,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
914,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
915,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1
916,38,M,NAP,138,175,0,Normal,173,N,0.0,Up,0


## Create the labels set (y) from the “HeartDisease” column, and then create the features (X) DataFrame from the remaining columns.

In [3]:
# Split the DataFrame into predictors (X) and target (y)

# Predictors: every column other than HeartDisease
X = df.drop('HeartDisease', axis=1)

# Target: the HeartDisease column
y = df['HeartDisease']

In [4]:
# Peek at the first five labels
y.head()

0    0
1    1
2    0
3    1
4    0
Name: HeartDisease, dtype: int64

In [5]:
# Peek at the first five feature rows (still un-encoded at this point)
X.iloc[:5]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up


In [6]:
# One-hot encode the categorical (object) columns using get_dummies.
# The dummy dtype is pinned to uint8 (the pre-2.0 pandas default) so the columns
# stay 0/1 on every pandas version; pandas >= 2.0 would otherwise emit booleans.
X = pd.get_dummies(X, dtype=np.uint8)
X

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,48,138,214,0,108,1.5,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,54,150,195,0,122,0.0,0,1,0,0,1,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912,45,110,264,0,132,1.2,0,1,0,0,0,1,0,1,0,1,0,0,1,0
913,68,144,193,1,141,3.4,0,1,1,0,0,0,0,1,0,1,0,0,1,0
914,57,130,131,0,115,1.2,0,1,1,0,0,0,0,1,0,0,1,0,1,0
915,57,130,236,0,174,0.0,1,0,0,1,0,0,1,0,0,1,0,0,1,0


In [7]:
# get_dummies replaced each categorical column with its indicator columns;
# list the resulting feature names to confirm
X.columns.tolist()

['Age',
 'RestingBP',
 'Cholesterol',
 'FastingBS',
 'MaxHR',
 'Oldpeak',
 'Sex_F',
 'Sex_M',
 'ChestPainType_ASY',
 'ChestPainType_ATA',
 'ChestPainType_NAP',
 'ChestPainType_TA',
 'RestingECG_LVH',
 'RestingECG_Normal',
 'RestingECG_ST',
 'ExerciseAngina_N',
 'ExerciseAngina_Y',
 'ST_Slope_Down',
 'ST_Slope_Flat',
 'ST_Slope_Up']

In [8]:
# Confirm the labels are unchanged after encoding the features
y.iloc[:5]

0    0
1    1
2    0
3    1
4    0
Name: HeartDisease, dtype: int64

## Check the balance of the labels variable (y) by using the value_counts function.

In [9]:
# Count how many samples fall in each target class
label_counts = y.value_counts()
print(label_counts)

1    507
0    410
Name: HeartDisease, dtype: int64


## Split the data into training and testing datasets by using train_test_split.

In [10]:
# Hold out a test set with train_test_split (default test size);
# random_state is fixed at 1 so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

## Build and evaluate a Random Forest model

In [11]:
# Feature names, in the column order the model will be trained on
list(X.columns)

['Age',
 'RestingBP',
 'Cholesterol',
 'FastingBS',
 'MaxHR',
 'Oldpeak',
 'Sex_F',
 'Sex_M',
 'ChestPainType_ASY',
 'ChestPainType_ATA',
 'ChestPainType_NAP',
 'ChestPainType_TA',
 'RestingECG_LVH',
 'RestingECG_Normal',
 'RestingECG_ST',
 'ExerciseAngina_N',
 'ExerciseAngina_Y',
 'ST_Slope_Down',
 'ST_Slope_Flat',
 'ST_Slope_Up']

In [12]:
# Verify the labels once more before modelling
y.head(5)

0    0
1    1
2    0
3    1
4    0
Name: HeartDisease, dtype: int64

### Step 1: Use the StandardScaler to scale the features data. Remember that only the X_train and X_test DataFrames should be scaled.

In [13]:
# Build the scaler and fit it on the training split only,
# so no statistics from the test split leak into preprocessing
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [14]:
# Show the first scaled training row (reuses the array computed by the fitted scaler)
print('Scaler results:', X_train_scaled[:1])

Scaler results: [[-0.2721222  -0.09107932 -0.39838929  1.80020575 -0.29239116  0.10147671
  -0.51723078  0.51723078  0.92290412 -0.46699583 -0.55324827 -0.22103159
   2.02210012 -1.22400238 -0.50590661 -1.22400238  1.22400238 -0.28016591
   1.02207772 -0.88305874]]


### Step 2: Instantiate the random forest classifier

In [15]:
# Configure the random forest: 500 trees, seeded for reproducible results
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=78,
)

In [16]:
# Train the forest on the scaled training features and their labels
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [17]:
# Persist the fitted scaler as well: the model was trained on scaled features,
# so anything that loads the model must apply the identical transformation first.
scaler_filename = "app/Random_Forest_scaler_final"
joblib.dump(X_scaler, scaler_filename)

# Persist the trained random forest (kept as the last expression so the
# cell output remains the list containing the model path)
model_filename = "app/Random_Forest_model_final"
joblib.dump(rf_model, model_filename)

['app/Random_Forest_model_final']

In [18]:
# Predict class labels for the scaled, held-out test features
predictions_rf = rf_model.predict(X=X_test_scaled)

In [19]:
# Compute balanced accuracy on the test set and report it to three decimals
balanced_acc_rf = balanced_accuracy_score(y_test, predictions_rf)
print('Balanced Accuracy Score: %.3f' % balanced_acc_rf)

Balanced Accuracy Score: 0.855


In [20]:
# Compute the confusion matrix, then label rows (actual) and columns (predicted)
rf_cm = confusion_matrix(y_test, predictions_rf)
rf_cm_df = pd.DataFrame(
    rf_cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
rf_cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,82,15
Actual 1,18,115


In [21]:
# Build the per-class precision/recall/F1 report for the test set and print it
report_rf = classification_report(y_test, predictions_rf)
print(report_rf)

              precision    recall  f1-score   support

           0       0.82      0.85      0.83        97
           1       0.88      0.86      0.87       133

    accuracy                           0.86       230
   macro avg       0.85      0.86      0.85       230
weighted avg       0.86      0.86      0.86       230

