In [2]:
import numpy as np
import pandas as pd

## 3. Fit the model/algo on our data and use it to make predictions

### 3.1. Fitting the code to the data

In [3]:
heart_disease = pd.read_csv("heart-disease.csv")
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


* X = features, features varibales, data
* y = targets, target variables

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Setup random seed
np.random.seed(42)

# Create X and y data 
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2)

# Instantiate RandomForestClf
clf = RandomForestClassifier()

# .fit tries to find all the patters resulting into 1 or 0 (training ml model)
clf.fit(X_train, y_train)

# Evaluate the data (uses the patterns the model has learned)
clf.score(X_test, y_test)

0.8524590163934426

In [5]:
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


### 3.2 Make predictions using the ML model

2 Ways to create predictions
1. `predict()`
2. `predict_proba`

In [6]:
# Use a trained model to make predictions
clf.predict(np.array([1,7,8,3,4])) # this will not work

ValueError: Expected 2D array, got 1D array instead:
array=[1. 7. 8. 3. 4.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [7]:
X_test.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
179,57,1,0,150,276,0,0,112,1,0.6,1,1,1
228,59,1,3,170,288,0,0,159,0,0.2,1,0,3
111,57,1,2,150,126,1,1,173,0,0.2,2,1,3
246,56,0,0,134,409,0,0,150,1,1.9,1,2,3
60,71,0,2,110,265,1,0,130,0,0.0,2,1,2


In [8]:
clf.predict(X_test)

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [9]:
np.array([y_test])

array([[0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
        0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [10]:
# Compare predcitions to truth(y) labels to evaluate the model
# In this case we are basing our prediction on X_test using the .predict
# We uses the y_test as the truth 

# We assigned the predictions to y_preds, which is the conventional way to name it
y_preds = clf.predict(X_test)

# We use np.mean() to evaluate our prediction
np.mean(y_preds == y_test)

0.8524590163934426

In [11]:
# The piece of code above is outputs the same result as 
clf.score(X_test, y_test)

0.8524590163934426

In [12]:
# Another way to evaluate our predictions
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_preds)

0.8524590163934426

In [13]:
y_preds

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

# Make predictions with `predict_proba()`


In [15]:
# predict_proba returns probabilities of a classification label
clf.predict_proba(X_test[:5])

array([[0.89, 0.11],
       [0.49, 0.51],
       [0.43, 0.57],
       [0.84, 0.16],
       [0.18, 0.82]])

In [16]:
# Let's predict() on the same data
clf.predict(X_test[:5])

array([0, 1, 1, 0, 1], dtype=int64)

In [None]:
# predict_proba gives the probabilities
# proba is better to analyze the labels
# predict is better for a simple output like heart disease or no heart disease

In [18]:
X_test[:5]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
179,57,1,0,150,276,0,0,112,1,0.6,1,1,1
228,59,1,3,170,288,0,0,159,0,0.2,1,0,3
111,57,1,2,150,126,1,1,173,0,0.2,2,1,3
246,56,0,0,134,409,0,0,150,1,1.9,1,2,3
60,71,0,2,110,265,1,0,130,0,0.0,2,1,2


In [17]:
heart_disease["target"].value_counts()

1    165
0    138
Name: target, dtype: int64

`predict()`can also be used for regression model

In [33]:
from sklearn.datasets import load_boston
boston = load_boston()
boston_df = pd.DataFrame(boston["data"], columns=boston["feature_names"])
boston_df["target"] = pd.Series(boston["target"])

In [34]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Create boston data
X = boston_df.drop("target", axis=1)
y = boston_df["target"]

# Split train test data
X_test, X_train, y_test, y_train = train_test_split(X,
                                                    y,
                                                    test_size=0.2)

# Instantiate the model
rf = RandomForestRegressor()

# Train the model
rf.fit(X_train,  y_train)


# Make predictions
y_preds = rf.predict(X_test)

In [35]:
y_preds[:10]

array([10.602, 21.753, 24.429, 14.095, 20.358, 25.265, 23.793, 25.109,
       10.952, 15.502])

In [37]:
np.array(y_test[:10])

array([12. , 19.9, 19.4, 13.4, 18.2, 24.6, 21.1, 24.7,  8.7, 27.5])

In [38]:
# Compare the predictions to the truth ( y_test)
from sklearn.metrics import mean_absolute_error

# Goes through every predictions compares it with the test and figure out 
# the differences for each sample
mean_absolute_error(y_test, y_preds)

2.8664950495049513

In [None]:
# The number above is saying that the average number of predictions
# has a 2.8 differnece from the target

## 4. Evaluating ML models

Three different ways to evaluate Sklearn model/estimators
1. Estimator `score` method
2. The `scoring` parameter
3. Problem-specific metrics functions


### 4.1 Evaluating the method the `score` method

In [45]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [46]:
clf.score(X_train, y_train)

1.0

In [48]:
# Returns the average accuracy between test data and labels
clf.score(X_test, y_test)

0.8524590163934426

Doing the same for regression

In [50]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Create boston data
X = boston_df.drop("target", axis=1)
y = boston_df["target"]

# Split train test data
X_test, X_train, y_test, y_train = train_test_split(X,
                                                    y,
                                                    test_size=0.2)

# Instantiate the model
rf = RandomForestRegressor().fit(X_train,  y_train)

In [51]:
# When we call score on the regression model it does not return the average
# It returns the coefficient of determination R^2 of the prediction
rf.score(X_test, y_test)

0.8238769438502933