# 3. Fit the model and use it to make predictions on our data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## 3.1 Fitting the model to the data

Different names for:
* `X` = features, feature variables, data
* `y` = labels, targets, target variables

### On a classification model

In [6]:
# Everything this cell needs from scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load the heart-disease table
heart_df = pd.read_csv('./data/heart-disease.csv')

# Features = every column except the label; labels = the 'target' column
X = heart_df.drop(columns=['target'])
y = heart_df['target']

# Seed NumPy's global RNG so the split and the forest are repeatable
np.random.seed(42)

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Build a 50-tree random forest and train it on the training split
rfc = RandomForestClassifier(n_estimators=50).fit(X_train, y_train)

# Score the trained forest on the held-out test split
rfc.score(X_test, y_test)

0.8852459016393442

In [3]:
# preview the first five rows of the feature table (no 'target' column)
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [4]:
# preview the first five labels
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

## 3.2 Make predictions using machine learning model

2 ways to make predictions:
1. `predict()`
2. `predict_proba()`

In [7]:
# (rows, columns) of the held-out test features
X_test.shape

(61, 13)

In [9]:
# predicted class label (0 or 1) for every row of the test features
rfc.predict(X_test)

array([0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [10]:
# true labels of the test split as a plain NumPy array,
# so they can be compared with the predicted array element by element
np.array(y_test)

array([0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [11]:
# accuracy by hand: the fraction of test rows whose prediction equals the true label
y_preds = rfc.predict(X_test)
(y_preds == y_test).mean()

0.8852459016393442

In [12]:
# the estimator's built-in score on the test split; for this classifier it
# gives the same value as the manual comparison of predictions and labels
rfc.score(X_test, y_test)

0.8852459016393442

In [13]:
# same accuracy again, this time via scikit-learn's metric function
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_preds)

0.8852459016393442

**Make predictions with `predict_proba()`**

In [15]:
# per-class probabilities for the first five test rows:
# one row per sample, one column per class (column order follows rfc.classes_)
rfc.predict_proba(X_test)[:5]

array([[0.9 , 0.1 ],
       [0.5 , 0.5 ],
       [0.44, 0.56],
       [0.8 , 0.2 ],
       [0.16, 0.84]])

In [16]:
# hard class predictions for the first five test rows, to compare with the probabilities
rfc.predict(X_test)[:5]

array([0, 0, 1, 0, 1], dtype=int64)

In [18]:
# true labels for the first five test rows
np.array(y_test)[:5]

array([0, 0, 1, 0, 1], dtype=int64)

### On a regression model

**Fitting the model to data**

In [33]:
# Everything this cell needs from scikit-learn
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn -- confirm the pinned version.
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Load the Boston housing data and assemble features + target into one frame
boston = load_boston()
boston_df = pd.DataFrame(
    boston['data'],
    columns=boston['feature_names'],
).assign(target=pd.Series(boston['target']))

# Seed NumPy's global RNG so the split and the forest are repeatable
np.random.seed(42)

# Features = every column except the label; labels = the 'target' column
# (this rebinds X / y and the train/test names used by the classification cells)
X = boston_df.drop(columns=['target'])
y = boston_df['target']

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Build an 81-tree random forest regressor and train it on the training split
model = RandomForestRegressor(n_estimators=81).fit(X_train, y_train)

# Score the trained regressor on the held-out test split
model.score(X_test, y_test)

0.8732460715697482

### Make predictions using the trained model

In [41]:
# predict a target value for every row of the test features
# (rebinds y_preds, which previously held the classifier's predictions)
y_preds = model.predict(X_test)
# display the full prediction array
y_preds

array([22.85679012, 31.18271605, 16.87283951, 23.41234568, 17.00493827,
       21.72345679, 19.28518519, 15.25925926, 21.12222222, 20.85555556,
       19.48148148, 19.79012346,  8.91481481, 21.93950617, 19.19753086,
       26.15061728, 19.45061728,  8.0382716 , 45.35925926, 14.56296296,
       24.58395062, 23.98148148, 14.52839506, 22.90987654, 14.92469136,
       14.40246914, 21.05925926, 14.10617284, 19.27407407, 20.68518519,
       19.19506173, 23.28765432, 30.45802469, 20.48024691, 14.35185185,
       15.92469136, 34.51604938, 19.1308642 , 20.6308642 , 24.49753086,
       18.66419753, 29.8345679 , 45.25925926, 19.34814815, 22.02098765,
       13.62962963, 15.38518519, 24.62345679, 18.8962963 , 28.30123457,
       21.40987654, 33.85432099, 17.21728395, 26.18395062, 45.15185185,
       22.06049383, 15.75061728, 32.36419753, 22.21851852, 20.17654321,
       25.39012346, 34.24197531, 28.84567901, 18.79135802, 27.00123457,
       17.35679012, 13.67283951, 23.10493827, 28.35679012, 15.95

In [42]:
# true target values of the test split, for comparison with y_preds
np.array(y_test)

array([23.6, 32.4, 13.6, 22.8, 16.1, 20. , 17.8, 14. , 19.6, 16.8, 21.5,
       18.9,  7. , 21.2, 18.5, 29.8, 18.8, 10.2, 50. , 14.1, 25.2, 29.1,
       12.7, 22.4, 14.2, 13.8, 20.3, 14.9, 21.7, 18.3, 23.1, 23.8, 15. ,
       20.8, 19.1, 19.4, 34.7, 19.5, 24.4, 23.4, 19.7, 28.2, 50. , 17.4,
       22.6, 15.1, 13.1, 24.2, 19.9, 24. , 18.9, 35.4, 15.2, 26.5, 43.5,
       21.2, 18.4, 28.5, 23.9, 18.5, 25. , 35.4, 31.5, 20.2, 24.1, 20. ,
       13.1, 24.8, 30.8, 12.7, 20. , 23.7, 10.8, 20.6, 20.8,  5. , 20.1,
       48.5, 10.9,  7. , 20.9, 17.2, 20.9,  9.7, 19.4, 29. , 16.4, 25. ,
       25. , 17.1, 23.2, 10.4, 19.6, 17.2, 27.5, 23. , 50. , 17.9,  9.6,
       17.2, 22.5, 21.4])

In [46]:
# mean absolute error manually: the average of |prediction - truth| over the test rows.
# Divide by the number of predictions (len), not by the `.shape` tuple, so the
# result is a scalar float rather than a one-element array.
np.abs(y_preds - np.array(y_test)).sum() / len(y_preds)

array([2.13882837])

In [47]:
# mean absolute error via scikit-learn's metric function
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_true=y_test, y_pred=y_preds)

2.1388283708545157