In [1]:
# Online Learning with River
# https://www.youtube.com/watch?v=P3M6dt7bY9U&list=PLGVZCDnMOq0q7_6SdrC2wRtdkojGBTAht&index=12&ab_channel=PyData

In [2]:
from pprint import pprint
from river import datasets

In [3]:
# Built-in Phishing dataset; iterating over it yields (x, y) pairs, as the
# loops in the cells below rely on.
dataset = datasets.Phishing()

As a quick example, we'll train a logistic regression to classify the website phishing dataset. Here's a look at the first observation in the dataset.

In [4]:
# Peek at the first (features, label) pair only, without consuming the rest
# of the stream. The None default keeps an empty dataset from raising.
first_sample = next(iter(dataset), None)
if first_sample is not None:
    x, y = first_sample
    pprint(x)
    print(y)

{'age_of_domain': 1,
 'anchor_from_other_domain': 0.0,
 'empty_server_form_handler': 0.0,
 'https': 0.0,
 'ip_in_url': 1,
 'is_popular': 0.5,
 'long_url': 1.0,
 'popup_window': 0.0,
 'request_from_other_domain': 0.0}
True


Now let's run the model on the dataset in a streaming fashion. We sequentially interleave predictions and model updates. Meanwhile, we update a performance metric to see how well the model is doing.

In [6]:
from river import compose
from river import linear_model
from river import metrics
from river import preprocessing

In [7]:
# Chain a StandardScaler and a LogisticRegression into a single pipeline so
# that one predict_one / learn_one call drives both steps.
# Written without the pasted "..." continuation prompts: those only work
# because IPython strips them, and are a syntax error in plain Python.
model = compose.Pipeline(
    preprocessing.StandardScaler(),
    linear_model.LogisticRegression(),
)


In [8]:
# Accuracy metric that the streaming loop below updates one prediction at a time.
metric = metrics.Accuracy()

In [9]:
# Show the metric before any prediction has been scored.
metric

Accuracy: 0.00%

In [10]:
# Streaming evaluation: for every sample, predict first, score that
# prediction, and only then let the model learn from the sample, so the
# accuracy is always measured on data the model has not yet trained on.
#
# `metric.update` and `model.learn_one` mutate their objects in place, so
# their return values are deliberately not assigned back. Recent River
# releases return None from these methods, and rebinding `metric` / `model`
# to that result would break the loop on the second iteration.
for x, y in dataset:
    y_pred = model.predict_one(x)  # prediction made before seeing the label
    metric.update(y, y_pred)       # score the prediction (in place)
    model.learn_one(x, y)          # then learn from the sample (in place)

In [11]:
# Show the accuracy accumulated over the full pass through the dataset.
metric

Accuracy: 89.20%