# Instructor Do: Random Forests

In [1]:
# Initial imports
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

## Loading and Preprocessing Data

In [2]:
# Loading data
file_path = Path("covid_19_data.csv")
df = pd.read_csv(file_path)
df.head()

# Our goal in this exercise is to predict if a person has COVID-19
# based on information we have in our provisional dataset and df DataFrame

# the 'outcome_var' column is the dependent, target, output variable that determines 
# whether a person has COVID-19 or not

# All other columns are potential independent, input variables that help to 
# determine the result of the output variable 'bad'

# NOTE: currently the 'bad' column only has the value '0' ?

Unnamed: 0,amount,term,age,bad,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,0,6,0,1,0,0,0,1
1,1000,30,50,0,7,1,0,0,0,1,0
2,1000,30,33,0,8,1,0,0,0,1,0
3,1000,15,27,0,9,0,0,0,1,0,1
4,1000,30,28,0,10,0,0,0,1,1,0


In [3]:
# Create our X dataset by dropping our target variable from the DataFrame:

# Define features set
X = df.copy()
X = X.drop("outcome_var", axis=1)
X.head()

Unnamed: 0,amount,term,age,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,6,0,1,0,0,0,1
1,1000,30,50,7,1,0,0,0,1,0
2,1000,30,33,8,1,0,0,0,1,0
3,1000,15,27,9,0,0,0,1,0,1
4,1000,30,28,10,0,0,0,1,1,0


In [10]:
# Create our y dataset


# Define target vector. We reshape our values to fit within a range of -1 & 1
# Doesn't really do anything because all of the values in 'bad' are '0' ?
y = df["outcome_var"].values.reshape(-1, 1)

# Preview data for first 5 values
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [12]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)

In [13]:
# Determine the shapes of our training and testing sets
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(375, 10)
(125, 10)
(375,)
(125,)


In [22]:
# Splitting the data into Train and Test sets manually

# Here we're going to split the data into an 80/20 split

X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)

In [23]:
# Determine the shapes of our 80/20 training and testing sets
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

(400, 10)
(100, 10)
(400, 1)
(100, 1)


In [14]:
# Now that we've split out data into Training and Testing data, let's scale it

# Creating StandardScaler instance
scaler = StandardScaler()

In [15]:
# Fitting Standard Scaler to X_train data
X_scaler = scaler.fit(X_train)

In [17]:
# Scaling (through transformation) our X data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [18]:
# Compute the mean and standard deviation of the first column of the selected data

# Import dependencies
import numpy as np
np.mean(X_train_scaled[:,0])
np.std(X_train_scaled[:,0])
np.mean(X_test_scaled[:,0])
np.std(X_test_scaled[:,0])

# Remember mean should be close to 0 and standard deviation should be close to 1

1.0846495547642092

## Fit (Train) the Random Forest Model

Now let's begin creating and fitting the random forest model:

In [19]:

# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# ^ RandomForestClassifier takes a variety or parameters/arguments
# For out purposes we only need the n_estimators and random_state parameters
    # n_estimators allows us to set the number of decision trees created in the algorithm
            # Generally, the higher the n_estimators number the stronger and more stable the predictions
            # However, more n_estimators means the slower the output because the model needs more training time!
            # The Best Practice is to use between 64 and 128 random forests trees
    # Remember, random_state is essentially the save state for our random sample distributions
            # random_state allows us to reproduce the data we're using as long as we use the same random_state value throughout our modeling/data

In [20]:

# Fit the random forest model
rf_model = rf_model.fit(X_train_scaled, y_train)

## Making Predictions Using the Random Forest Model

In [21]:
# Make predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

# View the data
predictions

# ^ The output of this code is an array of the predictions for all 125 rows of data

# The predictions array may look different if you don't use the same seeding 
# in the random_state parameter

array([1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0])

## Model Evaluation

In [22]:
# Calculating the confusion matrix & adapting it into a DataFrame
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# View the data
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,46,29
Actual 1,35,15


In [23]:
# We'll now run an accuracy score test to understand/test how well/often the 
# classifier is correct with the model

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

# View the accuracy score data
acc_score

# For mathematical reference, accuracy is calculated as follows:
    # (True Positives + True Negatives) / Total
    # =
    # (TP + TN) / Total
    # =
    # (46+15) / 125
    # = 
    # 61/125 = 0.488

0.488

In [24]:
# Culminate all of the evaluation metrics:

# Displaying results
print("Confusion Matrix:")

display(cm_df)
# ^ NOTE: I've never seen the display() function before: could be similar to print()
# TBD

print(f"Accuracy Score : {acc_score}")
print("Classification Report:")
print(classification_report(y_test, predictions))


Confusion Matrix:


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,46,29
Actual 1,35,15


Accuracy Score : 0.488
Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.61      0.59        75
           1       0.34      0.30      0.32        50

    accuracy                           0.49       125
   macro avg       0.45      0.46      0.45       125
weighted avg       0.48      0.49      0.48       125



## Summary

In [None]:
# In summary, this model is _____ at predicting whether or not someone has 
# COVID-19 based on the key input features.

## Calculate / Rank the Importance of Features

In [25]:
# To calculate the feature importance(s), we can use the feature_importances_ attribute:

# Calculate feature importance in the Random Forest model
importances = rf_model.feature_importances_

# View the data
importances

# ^ The output of this code returns an array of scores for the features in the 
# X_test set, whose sum is equal to 1.0

array([0.04255262, 0.06925703, 0.43332773, 0.34308962, 0.02003385,
       0.02885374, 0.00316552, 0.02534811, 0.01683591, 0.01753586])

In [28]:
# Now, to SORT the features by their importance baesd on the columns in the X_test 
# set, we can modify our code by applying the sorted() function and instantiate 
# a zip that pairs feature importance to/by the X (input) DataFrame's column names

# We can sort the features by their importances.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

# ^ Running this code will produce a paired zip output that pairs feature importance
# with feature name, the list is also going in descending numerical order based on
# the feature importance score determined by feature_importances_

[(0.4333277349336956, 'age'),
 (0.34308962029655193, 'month_num'),
 (0.06925703114041816, 'term'),
 (0.0425526207790608, 'amount'),
 (0.02885373867063543, 'education_High School or Below'),
 (0.02534810668332584, 'education_college'),
 (0.020033850955949666, 'education_Bachelor'),
 (0.017535860211361443, 'gender_male'),
 (0.01683591263199074, 'gender_female'),
 (0.0031655236970104046, 'education_Master or Above')]

In [None]:
# Now that we can clearly see which features, or columns
# are more relevant than others, we can improve this model by dropping some of
# the lower ranked features from out X dataset and thus from our RandomForestClassifier

# ^ This could improve (or at least impact) the model's performance, accuracy, & precision

# ^ It could also have little to no effect on the model, only one way to find out

## Visualize the Random Forest or Estimators (Individual Decision Trees)

In [None]:
# # Visualize an individual Decision Tree using matplotlib.pyplot
# 
# from sklearn.ensemble import RandomForestClassifier
#
# rf = RandomForestClassifier()
# 
# # first decision tree
# rf.estimators_[0]

In [None]:
# # (May need some tweaking)
# 
# # Visualize an individual Decision Tree using matplotlib.pyplot
# 
# import matplotlib.pyplot as plt
# from sklearn import tree
# 
# fn=X.feature_names
# cn=y.target_names
# fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=800)
# 
# # Determine which estimators you want to visualize
# tree.plot_tree(rf.estimators_[0],
#                feature_names = fn, 
#                class_names=cn,
#                filled = True);
# fig.savefig('rf_individualtree.png')

In [None]:
# # (May need some tweaking)
# 
# # Visualize all of the Decision Trees/estimators (AKA the random forest) 
# # using matplotlib.pyplot
# 
# # This may not the best way to view each estimator as it is small
# fn=data.feature_names
# cn=data.target_names
# fig, axes = plt.subplots(nrows = 1,ncols = 5,figsize = (10,2), dpi=900)
#
# # Determine the range of trees/estimators that you want to visualize:
#
# for index in range(0, num):
#     tree.plot_tree(rf.estimators_[index],
#                    feature_names = fn, 
#                    class_names=cn,
#                    filled = True,
#                    ax = axes[index]);
# 
#     axes[index].set_title('Estimator: ' + str(index), fontsize = 11)
# fig.savefig('rf_num_trees.png')


## Save the tree as a pdf

In [None]:
# # Saving the tree as PDF
# file_path = Path("../Resources/df_tree.pdf")
# graph.write_pdf(file_path)

# # Saving the tree as PNG
# file_path = Path("../Resources/df_tree.png")
# graph.write_png(file_path)