# 3) Model Training

In [1]:
import pandas as pd
import numpy as np

In [2]:
heart_disease = pd.read_csv('../00.datasets/heart-diseases.csv')
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


# 3.1) Fitting the model to the data

* `X`: features, features variables, data
* `y`: labels, target, target variables, ground truth

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Features and Lables
X = heart_disease.drop('target', axis=1)
y = heart_disease['target']

# split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train model
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

rfc.score(X_test, y_test)

0.8360655737704918

--------

# 3.2) Making Predictions

Two ways to make predictions:

* `predict()`
* `predict_proba()`

### predict()

In [7]:
y_pred = rfc.predict(X_test)

In [8]:
y_pred

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [11]:
rfc.score(X_test, y_test)

0.8360655737704918

In [12]:
# compare predicted labels to true labels to evalute the model
# it is the same as getting score
np.mean(y_pred == y_test)

0.8360655737704918

In [13]:
# accuracy score
from sklearn.metrics import accuracy_score

In [14]:
accuracy_score(y_test, y_pred)

0.8360655737704918

### predict_proba()
- returns probabilities estimates of classification labels

In [18]:
y_pred_proba = rfc.predict_proba(X_test)

y_pred_proba[:5]

array([[0.95, 0.05],
       [0.43, 0.57],
       [0.44, 0.56],
       [0.84, 0.16],
       [0.18, 0.82]])

In [19]:
# check against with predict()
rfc.predict(X_test)

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

We can see that first prediction is 0, which means no heart disease. 

- For the same prediction row using pred_proba it says 95% for label being 0 (No Heart Disease) and 5% for label being 1 (Heart Disease).
- for the second row, we can see that model is not really confident between two labels, unlike the first one.

In [23]:
# let's check against with true value
y_test[:5]

179    0
228    0
111    1
246    0
60     1
Name: target, dtype: int64

-------------

-------------

## Predictions on Regression Model

In [16]:
from sklearn.datasets import load_boston

In [15]:
boston = load_boston()
# boston

In [13]:
boston_df = pd.DataFrame(boston['data'], columns=boston['feature_names'])
boston_df['target'] = pd.DataFrame(boston['target'])

In [14]:
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [18]:
from sklearn.ensemble import RandomForestRegressor

In [19]:
X = boston_df.drop('target', axis=1)
y = boston_df['target']

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=20, random_state=42)

In [22]:
rf = RandomForestRegressor(n_estimators=100)

In [23]:
rf.fit(X_train, y_train)

RandomForestRegressor()

In [31]:
y_pred = rf.predict(X_test)

In [33]:
# Evaluation Matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [35]:
MAE = mean_absolute_error(y_test, y_pred)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))

In [36]:
MAE, RMSE

(1.4308500000000017, 1.8418485958406026)