## Decision Tree Mock Test

In [19]:
#import libraries
import pandas as pd
import numpy as np
from path import Path

In [20]:
# load data
file_path = Path('./Resources/ml_clean_stroke_dataset.csv')
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Age,Gender,Ever Married,Smoking Status,Hypertension,Heart Disease,Stroke
0,3,Male,No,unknown,0,0,0
1,58,Male,Yes,never smoked,1,0,0
2,8,Female,No,unknown,0,0,0
3,70,Female,Yes,formerly smoked,0,0,0
4,14,Male,No,unknown,0,0,0


In [21]:
# review how many patients had strokes
df["Stroke"].value_counts()

0    42617
1      783
Name: Stroke, dtype: int64

## Preprocessing data

In [22]:
# Use scikit-learn to encode data
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df['Gender'] = le.fit_transform(df['Gender'])
df['Ever Married'] = le.fit_transform(df['Ever Married'])
df['Smoking Status'] = le.fit_transform(df['Smoking Status'])
df.head()

Unnamed: 0,Age,Gender,Ever Married,Smoking Status,Hypertension,Heart Disease,Stroke
0,3,1,0,3,0,0,0
1,58,1,1,1,1,0,0
2,8,0,0,3,0,0,0
3,70,0,1,0,0,0,0
4,14,1,0,3,0,0,0


## Split Data into Training and Testing

In [23]:
# Initial imports
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [24]:
# Create our features
X = pd.get_dummies(df.drop(columns="Stroke"))

# Create our target
y= df["Stroke"]

In [25]:
# Define the target
y = df["Stroke"].values
y[:5]

array([0, 0, 0, 0, 0])

#### Preview of the target set indicates five good (no stroke) outcomes.

In [26]:
# split the data into trainin and testing sets; 
# training and testing data sets are 75% and 25%, respectively, of the original data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [27]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(32550, 6)
(10850, 6)
(32550,)
(10850,)


## Scaling and Normalization

In [28]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [29]:
# preview scaled data
X_test_scaled

array([[ 0.65585899,  1.2009291 ,  0.74440875, -1.46014563, -0.32148473,
        -0.22246838],
       [ 0.3901616 , -0.83057709,  0.74440875,  0.36643931, -0.32148473,
        -0.22246838],
       [ 1.58579988, -0.83057709, -1.343348  ,  1.27973179, -0.32148473,
         4.49502077],
       ...,
       [ 0.65585899, -0.83057709,  0.74440875, -0.54685316, -0.32148473,
        -0.22246838],
       [ 0.65585899, -0.83057709,  0.74440875,  0.36643931, -0.32148473,
        -0.22246838],
       [-1.02689119,  1.2009291 , -1.343348  , -1.46014563, -0.32148473,
        -0.22246838]])

In [30]:
# Resample the training data with the RandomOversampler
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=78)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({0: 31953, 1: 31953})

## Make Predictions Using the Testing Data


In [31]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()

# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [32]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)

In [33]:
predictions

array([0, 0, 0, ..., 0, 0, 0])

## Evaluate the Model

In [34]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,10641,23
Actual 1,185,1


## Results 
#### - Out of 10,664 no stroke outcomes (Actual 0), 10,641 were predicted to be no stroke (Predicted 0), which are true positives. 
#### - Out of 10,664 no stroke outcomes (Actual 0), 23 were predicted to have stroke (Predicted 1), which are considered false negatives.
#### - Out of 186 stroke outcomes (Actual 1), 185 were predicted to be no stroke (Predicted 0) and are considered false positives.
#### - Out of 186  stroke outcomes (Actual 1), 1 was predicted to be strokes (Predicted 1) and are considered true negatives.

In [36]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,10641,23
Actual 1,185,1


Classification Report
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     10664
           1       0.04      0.01      0.01       186

    accuracy                           0.98     10850
   macro avg       0.51      0.50      0.50     10850
weighted avg       0.97      0.98      0.97     10850

