## 4. Fit the Model/Estimator/Algorithm
Fit the model (also called an estimator or algorithm) on the training data, then use the fitted model to make predictions on data it has not seen.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the heart-disease dataset from a CSV path relative to the notebook's working directory.
heart_disease = pd.read_csv("data/heart-disease.csv")
# Display the dtype pandas inferred for each column.
heart_disease.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

### 4.1 Data splitting

Split the data into features and labels, which are then used to train and evaluate the model:
* `X` = data features, feature variables
* `y` = target data, labels

In [134]:
# Imports needed by this cell.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split as splitter

# Seed NumPy's global RNG before any splitting or fitting takes place.
np.random.seed(42)

# Features are every column except "target"; labels are the "target" column.
X = heart_disease.drop(columns="target")
y = heart_disease["target"]

# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = splitter(X, y, test_size=0.2)

# Fit a random forest classifier with default hyperparameters on the training split.
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Score the fitted classifier on the held-out test split.
clf.score(X_test, y_test)

0.8524590163934426

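### 4.1 Making Predictions Using the Model

There are two ways to make predictions with a fitted classifier:
1. `predict()` — returns the predicted label for each sample
2. `predict_proba()` — returns the predicted probability of each class for each sample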

In [135]:
# Predict a label for every row of the held-out test features.
y_pred = clf.predict(X_test)
# Display the predicted labels.
y_pred

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])

In [136]:
# Put the true test labels next to the model's predictions, one row per test sample.
# Converting y_test to a plain array discards its original row index, so the frame
# gets a fresh 0..n-1 index.
comparison_df = pd.DataFrame(
    {
        "Y_TRUE": np.array(y_test),
        "Y_PRED": clf.predict(X_test),
    }
)
# COMP is True where the prediction matches the true label.
comparison_df["COMP"] = comparison_df["Y_TRUE"].eq(comparison_df["Y_PRED"])

In [137]:
# Show the first five rows where the prediction does not match the true label.
# COMP is a boolean column, so the mask is negated directly instead of being
# compared with `== False`.
comparison_df[~comparison_df["COMP"]].head()

Unnamed: 0,Y_TRUE,Y_PRED,COMP
1,0,1,False
11,1,0,False
20,1,0,False
22,0,1,False
25,0,1,False


In [138]:
# Compute accuracy directly from the true labels and the predicted labels.
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_pred)

0.8524590163934426

In [132]:
# Estimate per-class probabilities for every test row instead of hard labels.
# NOTE(review): this cell's execution count (132) is lower than that of the cell
# that fits `clf` (134), so the saved output may come from an earlier fit.
# Re-run the notebook top to bottom to refresh it.
pred_proba = clf.predict_proba(X_test)
# Display the first five rows only.
pred_proba[:5]

array([[0.47, 0.53],
       [0.52, 0.48],
       [0.47, 0.53],
       [0.47, 0.53],
       [0.47, 0.53]])

### 4.2 Making Predictions on a Regression Model

In [139]:
from sklearn.ensemble import RandomForestRegressor

# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so the plain
# import fails on current releases. Use the built-in loader when it exists; otherwise
# rebuild the same data / target / feature_names structure from the original StatLib
# file, following the alternative given in scikit-learn's deprecation notice.
# NOTE(review): scikit-learn removed this dataset because of ethical concerns about
# the "B" feature; consider switching to another regression dataset.
try:
    from sklearn.datasets import load_boston
except ImportError:
    from sklearn.utils import Bunch

    def load_boston():
        """Return the Boston housing data as a Bunch.

        Fallback for scikit-learn >= 1.2, where the built-in loader no longer
        exists. Requires network access to download the raw file.

        Returns:
            Bunch with `data` (feature matrix), `target` (regression target)
            and `feature_names`, accessible by key or by attribute. Other
            fields of the original loader (e.g. `DESCR`) are not provided.
        """
        raw_df = pd.read_csv(
            "http://lib.stat.cmu.edu/datasets/boston",
            sep=r"\s+",
            skiprows=22,
            header=None,
        )
        # Each record spans two physical lines in the raw file: the even rows hold
        # the first 11 features, the odd rows hold the last 2 features and the target.
        features = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]
        feature_names = np.array(
            [
                "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
                "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
            ]
        )
        return Bunch(data=features, target=target, feature_names=feature_names)

boston = load_boston()

In [140]:
# Build a DataFrame from the feature matrix with named columns, then append the
# regression target as a final "TARGET" column.
boston_df = pd.DataFrame(boston["data"], columns=boston["feature_names"]).assign(
    TARGET=boston["target"]
)

In [141]:
# Preview the first five rows of the combined feature/target frame.
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,TARGET
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [142]:
from sklearn.ensemble import RandomForestRegressor

# Seed NumPy's global RNG before splitting and fitting.
np.random.seed(42)

# Features are every column except "TARGET"; the label is the "TARGET" column.
# Note: X, y and the train/test names below are rebound from the classification example.
X = boston_df.drop(columns="TARGET")
y = boston_df["TARGET"]

# Hold out 20% of the rows as the test set; `splitter` is the train_test_split
# alias imported in the classification cell.
X_train, X_test, y_train, y_test = splitter(X, y, test_size=0.2)

# Fit a random forest regressor with default hyperparameters on the training split.
clf_reg = RandomForestRegressor()
clf_reg.fit(X_train, y_train)

# Score the fitted regressor on the held-out split (R^2 for scikit-learn regressors).
clf_reg.score(X_test, y_test)


0.8654448653350507

In [143]:
# Predict target values for the held-out test features.
# Note: this rebinds `y_pred`, which previously held the classifier's predicted labels.
y_pred = clf_reg.predict(X_test)

In [144]:
# Preview the first ten predicted values.
# NOTE(review): the saved output shows only five values, so it appears stale — re-run this cell.
y_pred[:10]

array([23.081, 30.574, 16.759, 23.46 , 16.893])

In [146]:
# First ten true target values as a plain array, for side-by-side comparison
# with the first ten predictions.
y_test.head(10).to_numpy()

array([23.6, 32.4, 13.6, 22.8, 16.1, 20. , 17.8, 14. , 19.6, 16.8])

In [161]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# Report (MAE, MAPE expressed in percent), each rounded to two decimals.
tuple(
    round(metric, 2)
    for metric in (
        mean_absolute_error(y_test, y_pred),
        mean_absolute_percentage_error(y_test, y_pred) * 100,
    )
)

(2.14, 11.53)