## Random Forest Mock Test

In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the cleaned stroke dataset from the Resources folder and preview
# its first rows.
file_path = Path('../Resources/ml_clean_stroke_dataset.csv')
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,Age,Gender,Ever Married,Smoking Status,Hypertension,Heart Disease,Stroke
0,3,Male,No,unknown,0,0,0
1,58,Male,Yes,never smoked,1,0,0
2,8,Female,No,unknown,0,0,0
3,70,Female,Yes,formerly smoked,0,0,0
4,14,Male,No,unknown,0,0,0


In [3]:
# Class balance of the target: number of patients per "Stroke" value.
stroke_counts = df["Stroke"].value_counts()
stroke_counts

0    42617
1      783
Name: Stroke, dtype: int64

## Preprocessing data

In [4]:
# Encode the categorical text columns as integer codes with scikit-learn.
# One LabelEncoder is kept per column so each fitted mapping stays available
# for inspection or inverse_transform; a single shared encoder would only
# remember the classes of the last column it was fitted on.
categorical_columns = ["Gender", "Ever Married", "Smoking Status"]
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Backward-compatible alias: `le` previously ended up fitted on the last
# encoded column ("Smoking Status").
le = label_encoders["Smoking Status"]
df.head()

Unnamed: 0,Age,Gender,Ever Married,Smoking Status,Hypertension,Heart Disease,Stroke
0,3,1,0,3,0,0,0
1,58,1,1,1,1,0,0
2,8,0,0,3,0,0,0
3,70,0,1,0,0,0,0
4,14,1,0,3,0,0,0


In [6]:
# 1. Feature matrix: every column except the "Stroke" label.
# `drop` returns a new DataFrame, so `df` itself is left unchanged.
X = df.drop(columns="Stroke")
X.head()

Unnamed: 0,Age,Gender,Ever Married,Smoking Status,Hypertension,Heart Disease
0,3,1,0,3,0,0
1,58,1,1,1,1,0
2,8,0,0,3,0,0
3,70,0,1,0,0,0
4,14,1,0,3,0,0


In [8]:
# 2. Define the target set as an array of the "Stroke" labels.
# `.values` is used instead of `Series.ravel()`, which is deprecated in recent
# pandas releases; this also matches the later target cell in this notebook.
y = df["Stroke"].values
y[:5]

array([0, 0, 0, 0, 0])

In [9]:
# 3. Hold out a test set; the fixed seed keeps the split reproducible.
# NOTE(review): no `stratify=` is passed even though the target is heavily
# imbalanced (783 stroke rows out of 43,400) — consider stratifying on y.
split_parts = train_test_split(X, y, random_state=78)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# 4. Standardise the features. The scaler is fitted on the training rows only
# and the same fitted scaler is then applied to both partitions.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [None]:
# Features: every column except the "Stroke" label, passed through get_dummies.
# NOTE(review): this unexecuted cell re-binds X and y after the train/test
# split earlier in the notebook was already created — confirm it is still needed.
feature_columns = df.drop(columns="Stroke")
X = pd.get_dummies(feature_columns)

# Target: the "Stroke" column, kept as a pandas Series.
y = df["Stroke"]

In [None]:
# Target labels taken from the "Stroke" column as an array, followed by a
# peek at the first five entries.
stroke_column = df["Stroke"]
y = stroke_column.values
y[:5]

#### Preview of the target set indicates five good (no stroke) outcomes.

## Fit the Random Forest model

#### The `n_estimators` parameter sets the number of trees the algorithm builds. In general, more trees make the predictions stronger and more stable, but they also increase training time. A common rule of thumb is to use between 64 and 128 trees in a random forest.

In [11]:
# Build the random forest classifier: 128 trees, seeded for repeatable results.
# NOTE(review): class weights are left at their default although stroke cases
# are rare in this dataset — consider `class_weight="balanced"`.
forest_settings = {"n_estimators": 128, "random_state": 78}
rf_model = RandomForestClassifier(**forest_settings)

In [12]:
# Train the forest on the scaled training features.
# `fit` returns the estimator itself, so rf_model needs no re-binding; the
# trailing semicolon keeps the notebook from echoing the estimator repr.
rf_model.fit(X_train_scaled, y_train);

## Make Predictions Using the Testing Data


In [17]:
# Predict a class label for every row of the scaled test set.
predictions = rf_model.predict(X=X_test_scaled)

In [18]:
# Preview the predicted class labels for the test set.
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [19]:
# NOTE(review): this cell repeats the preceding preview of `predictions`;
# it looks redundant — confirm before removing.
predictions

array([0, 0, 0, ..., 0, 0, 0])

## Evaluate the Model

In [20]:
# Axis labels for the confusion-matrix table: rows are the actual classes,
# columns are the predicted classes.
row_labels = ["Actual 0", "Actual 1"]
column_labels = ["Predicted 0", "Predicted 1"]

# Compare the test labels with the model's predictions.
cm = confusion_matrix(y_test, predictions)

# Wrap the raw matrix in a labelled DataFrame for display.
cm_df = pd.DataFrame(data=cm, index=row_labels, columns=column_labels)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,10648,16
Actual 1,185,1


## Results 
#### Treating stroke (1) as the positive class:
#### - Out of 10,664 no-stroke outcomes (Actual 0), 10,648 were predicted as no stroke (Predicted 0); these are true negatives.
#### - Out of 10,664 no-stroke outcomes (Actual 0), 16 were predicted as stroke (Predicted 1); these are false positives.
#### - Out of 186 stroke outcomes (Actual 1), 185 were predicted as no stroke (Predicted 0); these are false negatives.
#### - Out of 186 stroke outcomes (Actual 1), 1 was predicted as stroke (Predicted 1); this is a true positive.

In [21]:
# Show the labelled confusion matrix, then the per-class precision / recall /
# F1 report for the test-set predictions.
print("Confusion Matrix")
display(cm_df)

print("Classification Report")
report_text = classification_report(y_test, predictions)
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,10648,16
Actual 1,185,1


Classification Report
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     10664
           1       0.06      0.01      0.01       186

    accuracy                           0.98     10850
   macro avg       0.52      0.50      0.50     10850
weighted avg       0.97      0.98      0.97     10850

