# Introduction to Scikit-Lean (sklearn)

This notebook demonstrates some of the most useful functions of the beautiful SciKit-Learn library.

What we're going to cover:

In [2]:
# let's listify the content
what_were_covering = [
"0. An end-to-end SckiKit-Learn workflow",
"1. Getting the data ready",
"2. Choose the right estimator/algorithm for our problems",
"3. Fit the model/algorith and use it to make predictions on our data",
"4. Evaluating a model",
"5. improve a model",
"6. save and load a trained model",
"7. putting it all together!",
]

In [3]:
# standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## 0. An end-to-end SciKit-Learn workflow

In [4]:
# 1. Get the data ready

heart_disease = pd.read_csv("./data/heart-disease.csv");
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [5]:
# create X (features matrix)
x = heart_disease.drop("target", axis=1)

# create y (labels)
y = heart_disease["target"]


In [6]:
# 2 Choose the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

# We'll keep the default hyperparameters
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [7]:
# Fit the model to the training data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [8]:
clf.fit(x_train, y_train);

In [9]:
# make a prediction
y_label = clf.predict(np.array[0,2,3,4])

TypeError: 'builtin_function_or_method' object is not subscriptable

In [None]:
y_preds = clf.predict(x_test)

In [None]:
y_preds

In [None]:
y_test

In [None]:
# 4. Evaluate the model on the training data and test data
clf.score(x_train, y_train)

In [None]:
clf.score(x_test, y_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
print(classification_report(y_test, y_preds))

In [None]:
confusion_matrix(y_test, y_preds)

In [None]:
accuracy_score(y_test, y_preds)

In [None]:
# 5. Improve a model
# try different amount of n_estimators

np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators=i).fit(x_train, y_train)
    print(f"Model accuracy on test set: {clf.score(x_test, y_test) * 100:.2f}")
    print("")

In [None]:
# 6. Save a model and load it
import pickle

pickle.dump(clf, open("random_forest_model_1.pkl", "wb"))

In [None]:
loaded_model = pickle.load(open("random_forest_model_1.pkl", "rb"))
loaded_model.score(x_test, y_test)

## 1. Getting our data ready to be used with machine learning
Three main things we have to do:
* 1. Split the data into features and labels (usually "x" & "y")
  2. Filling (also called imputing) or disregarding missing values
  3. converting non-numerical values to numerical values (also called feature encoding)

In [None]:
heart_disease.head()

In [None]:
# prepare the data
x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

In [None]:
# split the data into training and test sets
from sklearn.model_selection import train_test_split
x_train, y_train, x_test, y_test = train_test_split(x, y, test_size=0.2)

In [None]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
x.shape

## 1.1 Make sure it's all numerical

In [None]:
car_sales = pd.read_csv("./data/car-sales-extended.csv")
len(car_sales)

In [None]:
car_sales.dtypes

In [None]:
# split into x/y
x = car_sales.drop("Price", axis = 1)
y = car_sales["Price"]
y

In [None]:
# split into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [None]:
x_train

In [None]:
# Build machine learning model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(x_train, y_train)
model.score(x_test, y_test)

In [None]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough"
)
transformed_x = transformer.fit_transform(x)
transformed_x

In [None]:
pd.DataFrame(transformed_x)

In [None]:
dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]])
dummies

In [None]:
# Let's refit the model
np.random.seed(42)
x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2)
model.fit(x_train, y_train)


In [None]:
model.score(x_test, y_test)

### 1.2 What if there were missing values?

1. Fill them with some values (aka imputation).
2. Remove the samples with missing data altogether.

In [None]:
# import car sales missing data
car_sales_missing = pd.read_csv("./data/car-sales-extended-missing-data.csv")
car_sales_missing.isna().sum()

In [None]:
# Create x and y
x = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [None]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough"
)
transformed_x = transformer.fit_transform(x)
transformed_x

### Option 1: Fill missing data with Pandas

In [None]:
car_sales_missing.head(10)

In [None]:
car_sales_missing["Doors"].value_counts()

In [None]:
# Fill the "Make" column
car_sales_missing["Make"] = car_sales_missing["Make"].fillna("missing")

# Fill the "Colour" column
car_sales_missing["Colour"] = car_sales_missing["Colour"].fillna("missing")

# Fill the "Odometer (KM)" column
car_sales_missing["Odometer (KM)"] = car_sales_missing["Odometer (KM)"].fillna(round(car_sales_missing["Odometer (KM)"].mean(), 1))

# Fill the "Doors" column
car_sales_missing["Doors"] = car_sales_missing["Doors"].fillna(4)

In [None]:
# check dataframe again
car_sales_missing.isna().sum()

In [None]:
# Remove rows with missing price value
car_sales_missing = car_sales_missing.dropna()

In [None]:
car_sales_missing.isna().sum()

In [None]:
len(car_sales_missing)

In [None]:
x = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [None]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough"
)
transformed_x = transformer.fit_transform(car_sales_missing)
transformed_x

### Option 2: Fill missing values with SciKit-Learn

In [None]:
car_sales_missing = pd.read_csv("./data/car-sales-extended-missing-data.csv")
car_sales_missing.isna().sum()

In [None]:
# Drop rows with no price
car_sales_missing = car_sales_missing.dropna(subset=["Price"])

In [None]:
car_sales_missing.isna().sum()

In [None]:
# Split into x & y
x = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [None]:
# Fill missing values with SciKit-Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with "missing" & numerical values with mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# Define columns
cat_features = ["Make", "Colour"]
door_feature = ["Doors"]
num_features = ["Odometer (KM)"]

# Create an imputer (something that fills missing data)
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", num_imputer, num_features)
])

# Transform the data
filled_x = imputer.fit_transform(x)
filled_x

In [None]:
car_sales_filled = pd.DataFrame(
    filled_x, 
    columns=["Make", "Colour", "Doors", "Odometer (KM)"]
)
car_sales_filled.head()

In [None]:
car_sales_filled.isna().sum()

In [None]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough"
)
transformed_x = transformer.fit_transform(car_sales_filled)
transformed_x

In [None]:
# Now we've got our data as numbers and filled (no missing values)
# Let's fit a model

np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    transformed_x,
    y,
    test_size=0.2
)

model = RandomForestRegressor()
model.fit(x_train, y_train)
model.score(x_test, y_test)

## 2. Choosing the righr estimator/algorithm for your problem

Some things to note:

* Sklearn refers to machine learning models, algorithms as estimators.
* Classification problem - predicting a category (heart disease or not)
    * Sometimes you'll see "clf" (short for classifier) used as a classification estimator
* Regression problem - predicting a number (selling price of a car)

If you're working on a machine learning problem using Sklearn and not sure what model to use, refer to the sklearn machine learning map:
https://scikit-learn.org/stable/machine_learning_map.html

### 2.1 Picking a machine learning model for a regression problem
Let's use the California housing dataset - https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html

In [12]:
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]], shape=(20640, 8)),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894], shape=(20640,)),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': 

In [13]:
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [15]:
housing_df["target"] = housing["target"]
# housing_df = housing_df.drop("MedHouseVal", axis=1)
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [None]:
# Ridge Regression Model

# import algorithm/estimator
from sklearn.linear_model import Ridge
# setup random seed
np.random.seed(42)

# create the data
x = housing_df.drop("target", axis=1)
y = housing_df["target"]

# split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# instantiate an fit model (on the training set)
model = Ridge()
model.fit(x_train, y_train)

# score the model
model.score(x_test, y_test)

In [None]:
# Lasso Model

# import algorithm/estimator
from sklearn.linear_model import Lasso

# setup random seed
np.random.seed(42)

# create the data
x = housing_df.drop("target", axis=1)
y = housing_df["target"]

# split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# instantiate and fit a model
model = Lasso()
model.fit(x_train, y_train)

# score model
model.score(x_test, y_test)

### Ensemble model
An ensemble is a combination of smaller models to try and make better predictions than just a single model
Sklearn's ensemble models can be found here: https://scikit-learn.org/stable/modules/ensemble.html

In [None]:
# Import the RandomForestRegressor model class from the ensemble module 
from sklearn.ensemble import RandomForestRegressor

# setup random seed
np.random.seed(42)

# create the data
x = housing_df.drop("target", axis=1)
y = housing_df["target"]

# split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# create random forest model
model = RandomForestRegressor()
model.fit(x_train, y_train)

# score the model
model.score(x_test, y_test)

### 2.2 Picking a machine learning model for a classification problem

In [None]:
# import wine dataset
from sklearn.datasets import load_wine
data = load_wine()
data

In [None]:
wine_df = pd.DataFrame(data["data"], columns=data["feature_names"])
wine_df["target"] = data["target"]
wine_df

In [None]:
# import LinearSVC model
from sklearn.svm import LinearSVC

# setup random seed
np.random.seed(42)

# create the data
x = wine_df.drop("target", axis=1)
y = wine_df["target"]

# split data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# create the model
model = LinearSVC()
model.fit(x_train, y_train)

# score the model
model.score(x_test, y_test)

In [None]:
heart_disease = pd.read_csv("./data/heart-disease.csv")
heart_disease.head(5)

In [None]:
# Import the LinearSVC estimator class
from sklearn.svm import LinearSVC

# setup random seed
np.random.seed(42)

# create the data
x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# create the model
clf = LinearSVC()
clf.fit(x_train, y_train)

# Evaluate the LinearSVC
clf.score(x_test, y_test)

In [None]:
# Import the RandomForestClassifier estimator class
from sklearn.ensemble import RandomForestClassifier

# setup random seed
np.random.seed(42)

# create the data
x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# create the model
clf = RandomForestClassifier()
clf.fit(x_train, y_train)

# Evaluate the Random Forest Classifier
clf.score(x_test, y_test)

Tidbit:

    * 1. If you have structured data, use ensemble methods (tables)
    * 2. If you have unstructured data, use deep learning or transfer learning (images, audio, text)

## 3. Fit the model/algorithm on our data and use it to make predictions

### 3.1 Fitting the model to the data

Different names for:
X = features, features variables, data
y = labels, targets, target variables

In [None]:
# Import the RandomForestClassifier estimator class
from sklearn.ensemble import RandomForestClassifier

# setup random seed
np.random.seed(42)

# create the data
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit the model to the data (training the machine learning model)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Evaluate the Random Forest Classifier (use the patterns the model has learned)
clf.score(X_test, y_test)

### 3.2 Make predictions using a machine learning model

2 ways to make predictions:
1. "predict()"
2. "predict_proba()"

In [None]:
# Use a trained model to make predictions
clf.predict(X_test)

In [None]:
np.array(y_test)

In [None]:
# Compare predictions to truth labels to evaluate the model
y_preds = clf.predict(X_test)
np.mean(y_preds == y_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_preds)

Make predictions with `predict_proba()`

In [None]:
#  predict_proba() return probabilities of a classfification label

In [None]:
clf.predict_proba(X_test[:5])

In [None]:
# Let's predict on the same data...
clf.predict(X_test[:5])

`predict()` can also be used for regression models.

In [None]:
housing_df.head()

In [None]:
# Import the RandomForestRegressor model class from the ensemble module 
from sklearn.ensemble import RandomForestRegressor

# setup random seed
np.random.seed(42)

# create the data
X = housing_df.drop("target", axis=1)
y = housing_df["target"]

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# create random forest model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# score the model
model.score(X_test, y_test)

In [None]:
# Make predictions
y_preds = model.predict(X_test)
y_preds[:10]

In [None]:
np.array(y_test[:10])

In [None]:
np.mean(y_preds == y_test)

In [None]:
# Compare the predictions to the truth
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_preds)

## 4. Evaluating a machine learning model

Three ways to evaluate SciKit-Learn models/estimators:
1. Estimator's built in `score()` method
2. The `scoring` parameter
3. Problem specific metric function

You can read more about these here: https://scikit-learn.org/stable/modules/model_evaluation.html

### 4.1 Evaluating a model with the `score()` method

In [None]:
# Classifier algorithm

# import model
from sklearn.ensemble import RandomForestClassifier

# setup random seed
np.random.seed(42)

# create the data
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# fit the model
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

#score the model
clf.score(x_test, y_test)


In [None]:
# Regressor algorithm

# import the model
from sklearn.ensemble import RandomForestRegressor

# setup random seed
np.random.seed(42)

# create the data
X = housing_df.drop("target", axis=1)
y = housing_df["target"]

# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# fit the model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# score the model
model.score(X_test, y_test)

### 4.2 Evaluating a model using `scoring` parameter

In [None]:
from sklearn.model_selection import cross_val_score

# import model
from sklearn.ensemble import RandomForestClassifier

# setup random seed
np.random.seed(42)

# create the data
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# fit the model
clf = RandomForestClassifier()
clf.fit(X_train, y_train);

In [None]:
clf.score(X_test, y_test)

In [None]:
cross_val_score(clf, X, y)

In [None]:
# compare single score and cross val score
np.random.seed(42)

# Single training and test split score
clf_single_score = clf.score(X_test, y_test)

# Take the mean of 5-fold cross-validation score
clf_cross_val_score = np.mean(cross_val_score(clf, X, y))

# compare the two
clf_single_score, clf_cross_val_score

In [None]:
# default scoring parameter of classifier = mean accuracy
clf.score()

In [None]:
# Scoring parameter set to None by default
cross_val_score(clf, X, y, scoring=None)

### 4.2.1 Classification model evaluation metrics
1. Accuracy
2. Area under ROC curve
3. confusion matrix
4. classification report

**Accuracy**

In [None]:
# import classifier model 
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# create the data
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

## score the model
clf = RandomForestClassifier()
cross_val_score = cross_val_score(clf, X, y)

In [None]:
print(f"Heart Disease Classifier Cross-Validated Accuracy: {np.mean(cross_val_score) * 100:.2f}%")

**Area under the receiver operating characteristic curve (AUC/ROC)**

* Area under curve(AUC)
* ROC curve

ROC curves are a comparison of a model's true positive rate (tpr) versus a models false positive rate (fpr).

* True positive = model predicts 1 when truth is 1
* False positive = model predicts 1 when truth is 0
* True negative = model predicts 0 when truth is 0
* False negative = model predicts 0 when truth is 1

In [None]:
# create train and test sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# fit the model
clf = RandomForestClassifier()
clf.fit(X_train, y_train);

In [None]:
from sklearn.metrics import roc_curve

# Make predictions with probabilities
y_probs = clf.predict_proba(X_test)

In [None]:
y_probs[:10], len(y_probs)

In [None]:
y_probs_positive = y_probs[:, 1]

In [None]:
y_probs_positive

In [None]:
# calculate fpr, tpr, and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_probs_positive)

# check the false positive rate
fpr

In [None]:
# create a function for plotting ROC curves
import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr):
    """
    Plots a ROC curve given the false positive rate (fpr) and true positive rate (tpr) of a model.
    """
    # plot the roc curve
    plt.plot(fpr, tpr, color="orange", label="ROC")

    # plot line with no predicitve power (baseline)
    plt.plot([0, 1], [0, 1], color="darkblue", linestyle="--", label="Guessing")

    # customize the plot
    plt.xlabel("False positive rate (fpr)")
    plt.ylabel("True positive rate (tpr)")
    plt.title("Receiver Operating Characteristics (ROC) curve")
    plt.legend()
    plt.show()

In [None]:
plot_roc_curve(fpr, tpr)

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_probs_positive)

In [None]:
# plot perfect ROC curve and AUC score
fpr, tpr, thresholds = roc_curve(y_test, y_test)
plot_roc_curve(fpr, tpr)

In [None]:
# perfect auc score
roc_auc_score(y_test, y_test)

**Confusion Matrix**

A confusion matrix is a quick way to compare the labels a model predicts and the actual labels it was supposed to predict.

In essence, giving you  an idea of where the model is getting confused.

SciKit documentation: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html

In [None]:
from sklearn.metrics import confusion_matrix

y_preds = clf.predict(X_test)

confusion_matrix(y_test, y_preds)

In [None]:
# visualize confusion matrix with pd.crosstab()
pd.crosstab(
    y_test,
    y_preds,
    rownames=["Actual Labels"],
    colnames=["Predict Labels"]
)

In [None]:
# install seaborn to conda environment
import sys
!conda install --yes --prefix {sys.prefix} seaborn 

In [None]:
# OLD VERSION
# make confusion matrix more visual with seaborn's heatmap
import seaborn as sns

# set the font scale
sns.set(font_scale=1.5)

# create a confusion matrix
conf_mat = confusion_matrix(y_test, y_preds)

# plot using seaborn
sns.heatmap(conf_mat);

### Creating a confusion matrix using Scikit-Learn

To use the new methods of creating a confusion matrix with SkiKit-Learn you will need sklearn version 1.0+

In [None]:
# check sklearn version
sklearn.__version__

In [None]:
# confusion matrix from estimator (without predictions)
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(clf, X, y);

In [None]:
# confusion matrix from estimator (without predictions)
from sklearn.metrics import ConfusionMatrixDisplay

y_preds = clf.predict(X_test)

ConfusionMatrixDisplay.from_predictions(y_test, y_preds);

**Classification Report**

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_preds))

In [None]:
# Where precision and recall become valuable
disease_true = np.zeros(10000)
disease_true[0] = 1 # only one positive case

disease_preds = np.zeros(10000) # model predicts every case as 0

In [None]:
pd.DataFrame(classification_report(
    disease_true,
    disease_preds,
    output_dict=True
))

### 4.2.2 Regression model evaluation metrics

Model evaluation metrics - documentation: https://scikit-learn.org/stable/modules/model_evaluation.html

The ones we're going to cover are:
1. R^2 or coefficient of determination
2. Mean absolute error (MAE)
3. Mean squared error (MSE)

In [None]:
# import regressor model
from sklearn.ensemble import RandomForestRegressor

# setup random seed
np.random.seed(42)

# create the data
X = housing_df.drop("target", axis=1)
y = housing_df["target"]

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# create the model
model = RandomForestRegressor()
model.fit(X_train, y_train);


In [None]:
# Score the model
model.score(X_test, y_test)

In [None]:
y_test.mean()

In [None]:
# r^2 evaluation metric
# Compares your models predictions to the mean of the targets.
from sklearn.metrics import r2_score

# fill an array with y_test mean
y_test_mean = np.full(len(y_test), y_test.mean())

In [None]:
y_preds = model.predict(X_test)

In [None]:
r2_score(y_test, y_preds)

***Mean Absolute Error (MAE)***

MAE is the average of the aboslute differences between predictions and actual values.

It gives you an idea of how wrong your models predictions are.

In [None]:
# MAE mean absolute error evaluation metric
from sklearn.metrics import mean_absolute_error

# make predictions from model
y_preds = model.predict(X_test)

# calculate MAE
mae = mean_absolute_error(y_test, y_preds)
mae

In [None]:
# create dataframe with actual and predicted values
df = pd.DataFrame(data={
    "actual values": y_test,
    "predicted values": y_preds 
})
# calculate absolute difference between actual and predicted values
df["differences"] = abs(df["actual values"] - df["predicted values"])
df.head()

In [None]:
# MAE using formulas and differences
df["differences"].mean()

***Mean Squared Error (MSE)***

MSE is the mean of the square of the errors between actual and predicted values.

In [None]:
# MSE mean squared error evaluation metric
from sklearn.metrics import mean_squared_error

# make predictions with model
y_preds = model.predict(X_test)

# calculate MSE
mse = mean_squared_error(y_test, y_preds)
mse

In [None]:
# create dataframe with actual and predicted values
df = pd.DataFrame({
    "actual values": y_test,
    "predicted values" : y_preds
})

In [None]:
# calculate squared value of difference betwen actual and predicted values
df["squared difference"] = np.square(df["actual values"] - df["predicted values"])

In [None]:
# Calcualte MSE, average of squared difference    
df["squared difference"].mean()

## Machine Learning Model Evaluation

Evaluating the results of a machine learning model is as important as building one.

These are some of the most important evaluation metris you'll want to look into for classification and regression models.

### Classification Model Evaluation Metrics/Techniques

* `Accuracy` - The accuracy of the model in decimal format. Perfect accuracy is equal to 1.0
* `Precision` - Indicates the proportion of positive identifications (model predicted class 1) which were actually correct. A model which produces no false positives has a precision of 1.0
* `Recall` - Indicates the proportion of actual positives which were correctly classified. A model which produces no false negatives has a recall of 1.0
* `F1 Score` - A combination of precision and recall. A perfect model achieves an F1 score of 1.0
* `Confusion matrix` - Compares the predicted values with the true values in a tabular, if 100% correct, all values in the atrix will be top left to bottom right (diagonal line)
* `Cross-validation` - Splits yur dataset into multiple parts and train and tests your model on each part then evaluates performance as an average
* `Classification report` - Sklearn has a built-in function called `classification_report()` which returns some of the main classification metrics such as precision, recall and f1-score
* `ROC Curve` - Also known as receiver operating characteristic is a plot of true positive rate versus false-positive rate
* `Area Under Curve (AUC) Score` - The area underneath the ROC curve. A perfect model achieves an AUC score of 1.0

### Which classification metric should you use?

* `Accuracy` is a good measure to start with if all classes are balanced (e.g. same amount of samples which are labelled with 0 or 1)
* `Precision` and `Recall` become more important when classes are imbalanced
* If false-positive predictions are worse than false-negatives, aim for higher `precision`
* If false-negative predictions are worse than false-positives, aim for higher `recall`
* `F1-score` is a combination of precision and recall
* A `confusion matrix` is always a good way to visualize how a classification model is going

### Regression Model Evaluation Metrics/Techniques

* `R^2 or the coefficient of determination` - Compares your model's predictions to the mean of the targets. Values can range from negative infinity (a very poor model) to 1. For example, if all your model does is predict the mean of the targets, its R^2 value would be 0, if your model perfectly predicts a range of numbers it's R^2 value would be 1
* `Mean absolute error (MAE)` - The average of the absolute differences between actual and predicted values. It gives you an idea of how wrong your predictions were
* `Mean squared error (MSE)` - The average squared difference between predictions and actual values. Squaring the errors removes negative errors. It also amplifies outliers (samples which have larger errors)

### Which regression metric should you use? 

* `R2` is similar to accuracy. It gives you a quick inidcation of how well your model might be doing. Generally, the closer your `R2` value is to 1.0, the better the model. But it doesn't really tell exactly how wrong your model is in terms of how far off each prediction is
* `MAE` gives a better indication of how far off each of your model's predictions are on average
* As for `MAE` or `MSE`, because of the way MSE is calculated, squaring the differences between predicted values and actual values, it amplifies larger differences, Let's say we're predicting the value of houses:
    * Pay more attention to `MAE`: When being \$10,000 off is *twice* as bad as being \$5,000 off
    * Pay more attention to `MSE`: when being \$10,000 off is *more than twice* as bad as being \$5,000 off

### 4.2.3 Using the `scoring` parameter

In [None]:
# import the model
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# setup random seed
np.random.seed(42)

# create the data
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# split data into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# create the model
clf = RandomForestClassifier()

In [None]:
np.random.seed(42)

# Cross-validation accuracy
cv_acc = cross_val_score(clf, X, y, scoring="accuracy") #scoring=None uses models default: accuracy
cv_acc

In [None]:
# Cross-validation accuracy
print(f"The Cross-validated accuracy is: {np.mean(cv_acc)*100:.2f}%")

In [None]:
np.random.seed(42)

# Cross-validation precision
cv_precision = cross_val_score(clf, X, y, scoring="precision")
print(f"The Cross-validated precision is: {np.mean(cv_precision)*100:.2f}%")

In [None]:
cv_precision

In [None]:
np.random.seed(42)

# cross-validation recall
cv_recall = cross_val_score(clf, X, y, scoring="recall")
print(f"The Cross-validated recall is: {np.mean(cv_recall)*100:.2f}%")
cv_recall

In [None]:
# import the model
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

# setup random seed
np.random.seed(42)

# Craete the data
X = housing_df.drop("target", axis=1)
y = housing_df["target"]

# split the data into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestRegressor()

In [18]:
np.random.seed(42)

cv_r2 = cross_val_score(model, X, y, cv=3, scoring=None)
np.mean(cv_r2)

np.float64(0.6545660727379677)

In [26]:
cv_r2

array([0.62156808, 0.72076221, 0.62136792])

In [29]:
# Mean absolute error
cv_mae = cross_val_score(model, X, y, cv=3, scoring="neg_mean_absolute_error")
cv_mae
np.mean(cv_mae)

np.float64(-0.4811411446705427)

In [31]:
# Mean squared error 
cv_mse = cross_val_score(model, X, y, cv=3, scoring="neg_mean_squared_error")
cv_mse
np.mean(cv_mse)

np.float64(-0.463292458220971)