# Model Evaluation
Bagaimana melakukan evaluasi pada model kita?

## Task: Regression

In [None]:
y_true = [3, -0.5, 2, 7] # actual (ground-truth) values
y_pred = [2.5, 0.0, 2, 8] # values predicted by the model

### Mean Absolute Error

In [None]:
# Mean Absolute Error: the average of |y_true - y_pred| over all samples.
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_true, y_pred)

0.5

### Root Mean Squared Error

In [None]:
import math

from sklearn.metrics import mean_squared_error

# Root Mean Squared Error = sqrt(MSE).
# The square root is taken explicitly instead of passing `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and later removed, so
# relying on it makes this cell fail on current releases. Taking the root
# ourselves works on every scikit-learn version.
math.sqrt(mean_squared_error(y_true, y_pred))

0.6123724356957945

### R2 Score

In [None]:
# R^2 (coefficient of determination) of the predictions against the actual values.
from sklearn.metrics import r2_score
r2_score(y_true, y_pred)

0.9486081370449679

## Task: Classification

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
y_true = [1, 0, 1, 0, 0, 1] # actual labels
y_pred = [0, 0, 0, 0, 0, 1] # predicted labels
y_pred_proba = [0.4, 0.3, 0.2, 0.1, 0.15, 0.7] # predictions expressed as probabilities

### Confusion Matrix / Classification Result

In [None]:
from sklearn.metrics import confusion_matrix
# Rows correspond to the actual labels, columns to the predicted labels.
cf = confusion_matrix(y_true, y_pred)
cf

array([[3, 0],
       [2, 1]])

- Sumbu y: Actual label
- Sumbu x: Prediksi label

### Accuracy Score

In [None]:
# Accuracy: fraction of predictions that match the actual label.
from sklearn.metrics import accuracy_score
accuracy_score(y_true, y_pred)

0.6666666666666666

### Precision & Recall

In [None]:
from sklearn.metrics import precision_score, recall_score
# average='macro': precision is computed per label and then averaged with equal weight.
precision_score(y_true, y_pred, average='macro')

0.8

In [None]:
# average='macro': recall is computed per label and then averaged with equal weight.
recall_score(y_true, y_pred, average='macro')

0.6666666666666666

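- micro average menjumlahkan TP dan FP dari semua label secara global, lalu menghitung precision dari total tersebut: Precision = ΣTP/(ΣTP+ΣFP). (Yang hanya melihat label 1 saja adalah `average='binary'`: Precision = TP/(TP+FP).)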
- macro average melihat precision dari kedua label, lalu dihitung rata-ratanya: (Precision 0 + Precision 1)/2

Recall = TP/(TP+FN)

### ROC-AUC

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc = roc_auc_score(y_true, y_pred_proba)
roc_auc

0.888888888888889

# Cross Validation

## Train & Test Split

In [2]:
!gdown --id 1EUw1Sq7KaX-x6BAMEJ3Pep_fqlncaywf # Download File

Downloading...
From: https://drive.google.com/uc?id=1EUw1Sq7KaX-x6BAMEJ3Pep_fqlncaywf
To: /content/kc_house_data.csv
100% 2.52M/2.52M [00:00<00:00, 16.8MB/s]


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the house-price CSV fetched by the download cell.
dataset = pd.read_csv("kc_house_data.csv")

# Features: living area and number of bedrooms. Target: sale price.
# Double brackets keep both X and y as DataFrames.
X = dataset[['sqft_living', 'bedrooms']]
y = dataset[['price']]

# Hold out part of the rows for testing; the fixed random_state makes the
# shuffle reproducible between runs.
# NOTE(review): the shapes recorded further down (15129 train / 6484 test out
# of 21613 rows) correspond to a 70/30 split, not train_size=0.8 -- those
# outputs look stale; re-run the notebook to refresh them.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1000000000,
)

In [20]:
# Peek at the first row to see which columns are available.
dataset.head(1)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650


In [10]:
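# 10 rows = 8 rows (train) + 2 rows (test)
#
# [a,b,c,d] 2 16 random
# (presumably: 2 of the 4 items are drawn at random; the possible pairs are listed below)
#
# 1) a,b
# 2) a,c
# 3) a,d
# 4) b,c
# 5) b,d
# 6) c,d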

In [None]:
# Signature: train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)

In [5]:
# (rows, columns) of the feature frame.
X.shape

(21613, 2)

In [6]:
# (rows, columns) of the target frame.
y.shape

(21613, 1)

In [None]:
# (rows, columns) of the full dataset.
dataset.shape

(21613, 21)

In [None]:
# Size of the training portion of the features.
Xtrain.shape

(15129, 2)

In [None]:
# Size of the test portion of the features.
Xtest.shape

(6484, 2)

In [None]:
# First training rows; the index shows which original rows were drawn.
Xtrain.head()

Unnamed: 0,sqft_living,bedrooms
167,2680,4
12412,2340,4
7691,2190,4
12460,2260,2
9099,2110,4


In [None]:
# Matching training targets.
ytrain.head()

Unnamed: 0,price
167,807100.0
12412,570000.0
7691,320000.0
12460,649000.0
9099,568000.0


In [None]:
# First test rows.
Xtest.head()

Unnamed: 0,sqft_living,bedrooms
735,2070,4
2830,2900,5
4106,3770,4
16218,4560,3
19964,2550,3


## K-Fold Cross Validation

### Check Train & Validation pada masing-masing iterasi

In [None]:
import pandas as pd
from sklearn.model_selection import KFold

# Reload the data so this section can be run on its own.
dataset = pd.read_csv("kc_house_data.csv")
X = dataset[['sqft_living', 'bedrooms']]
y = dataset[['price']]

# Define the split: 3 folds, all other KFold options left at their defaults.
kf = KFold(n_splits=3)

# Number of splitting iterations the cross-validator will perform.
# NOTE(review): the recorded output below is 2 (and only two TRAIN/TEST lines
# are printed by the next cell), which does not match n_splits=3 -- the stored
# outputs appear to come from an earlier run; re-run to refresh them.
kf.get_n_splits(X)

2

In [None]:
# Walk through every fold and show which row positions are used for training
# and which are held out in that iteration.
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

    # kf.split yields positional indices, hence .iloc.
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    # Placeholder: fit and evaluate a model manually on this fold here.

TRAIN: [10807 10808 10809 ... 21610 21611 21612] TEST: [    0     1     2 ... 10804 10805 10806]
TRAIN: [    0     1     2 ... 10804 10805 10806] TEST: [10807 10808 10809 ... 21610 21611 21612]


## Cross Validation in Validation Set Only

In [None]:
# LinearRegression is imported in this cell: previously it was only imported in
# a later cell, so this one raised NameError on a fresh kernel (Restart & Run All).
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

model = LinearRegression()

# 5-fold cross-validation; only the score on each held-out fold (R^2) is returned.
result = cross_val_score(model, X, y, cv=5, scoring='r2') # calculate score

# Average R^2 across the folds.
result.mean()

0.5043547355695521

## Cross Validation in Train Set & Validation Set

In [None]:
# cross validation
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
model = LinearRegression()
score = cross_validate(model, X, y, cv=5, scoring='r2', return_train_score=True)
print('r2 (train): '+ str(score['train_score'].mean()))
print('r2 (test): '+ str(score['test_score'].mean()))

r2 (train): 0.5068353530791848
r2 (test): 0.5043547355695521
