In [50]:
# General
import pandas as pd
import time

# CapyMOA 
from capymoa import datasets
from capymoa.classifier import AdaptiveRandomForestClassifier, HoeffdingAdaptiveTree, HoeffdingTree, LeveragingBagging, KNN, NaiveBayes, OnlineAdwinBagging, OnlineBagging, SGDClassifier
from capymoa.evaluation import prequential_evaluation

# River
from river import stream
from river import ensemble
from river import evaluate
from river import linear_model
from river import metrics
from river import optim
from river import preprocessing
from river import naive_bayes
from river import tree
from river import forest

## Part I - Binary Classifiers

The classifiers I found on both River and CapyMOA were : 
- ADWIN Bagging, (capymoa OnlineAdwinBagging)
- Adaptive Random Forest,
- Bagging, 
- Hoeffding Adaptive Tree, 
- Hoeffding Tree, 
- Leveraging Bagging, 
- Naive Bayes, 
- k-Nearest Neighbors, 
- sklearn SGDClassifier

We will compare both libraries on these classifiers.

The datasets we will use are from CapyMOA and are the following : 
- Electricity
- Electricity tiny

### I.1) Getting the data from CapyMOA 

In [40]:
elec_stream = datasets.Electricity()
variables = ['1', '2', '3', '4', '5', '6', '7','8' ] #XXX - get the real variables

data = []
while elec_stream.has_more_instances():
    inst = {}
    instance = elec_stream.next_instance()
    x = instance.x
    y_index = instance.y_index
    for i in range (len(x)) : 
        inst[variables[i]] = x[i]
    inst['target'] = y_index
    data.append(inst)

df = pd.DataFrame(data)
df.to_csv('elec_stream.csv', index=False, sep = ',')



In [66]:
elec_stream_tiny = datasets.ElectricityTiny()
variables = ['1', '2', '3', '4', '5', '6', '7','8' ] #XXX - get the real variables

data = []
while elec_stream_tiny.has_more_instances():
    inst = {}
    instance = elec_stream_tiny.next_instance()
    x = instance.x
    y_index = instance.y_index
    for i in range (len(x)) : 
        inst[variables[i]] = x[i]
    inst['target'] = y_index
    data.append(inst)

df = pd.DataFrame(data)
df.to_csv('elec_stream_tiny.csv', index=False, sep = ',')


electricity_tiny.arff: 24.0kB [00:01, 17.8kB/s]                                 


In [68]:
elec_stream_tiny

<capymoa.datasets._datasets.ElectricityTiny at 0x16a04d250>

## I.2) Evaluating with CapyMOA

In [42]:
Datasets_capymoa = {'Electricity' : elec_stream, 'Electricity Tiny': elec_stream_tiny}
Models_capymoa = {'ADWIN bagging': OnlineAdwinBagging,
         'Adaptive Random Forest': AdaptiveRandomForestClassifier ,
         'Bagging': OnlineBagging,
         'Hoeffding Adaptive Tree': HoeffdingAdaptiveTree ,
         'Hoeffding Tree': HoeffdingTree,
         'Leveraging Bagging': LeveragingBagging,
         'Naive Bayes': NaiveBayes,
         'k-Nearest Neighbors': KNN,
         'SGDClassifier': SGDClassifier}
#metrics_river = {'F1' : metrics.F1(), 'Accuracy' : metrics.Accuracy()}

def get_performance_capymoa(Models, Datasets):
    results = []
    for model_name in Models.keys():
        model = Models[model_name]
        for data_name in Datasets.keys():
            stream = Datasets[data_name]
            print(f"CapyMOA : Model {model_name} and dataset {data_name}")
            result_model = dict()
            beginning = time.time()
            
            classifier = model(stream.get_schema())
            res = prequential_evaluation(stream, classifier)
            end = time.time()
            
            result_model = {'Model': model_name,
                    'Dataset': data_name,
                    'Time (seconds)': round(end - beginning, 4),
                    'Accuracy': res['cumulative'].accuracy(),
                    'F1': res['cumulative'].f1_score()}
            results.append(result_model)
    return results

results_capymoa = get_performance_capymoa(Models_capymoa, Datasets_capymoa)


Model ADWIN bagging and dataset Electricity
Model Adaptive Random Forest and dataset Electricity
Model Bagging and dataset Electricity
Model Hoeffding Adaptive Tree and dataset Electricity
Model Hoeffding Tree and dataset Electricity
Model Leveraging Bagging and dataset Electricity
Model Naive Bayes and dataset Electricity
Model k-Nearest Neighbors and dataset Electricity
Model SGDClassifier and dataset Electricity


In [46]:
pd.DataFrame(results_capymoa)

Unnamed: 0,Model,Dataset,Time (seconds),Accuracy,F1
0,ADWIN bagging,Electricity,6.8971,84.939972,84.513691
1,Adaptive Random Forest,Electricity,45.5801,90.066649,89.807241
2,Bagging,Electricity,4.9514,82.825742,82.348786
3,Hoeffding Adaptive Tree,Electricity,0.2058,83.907133,83.458512
4,Hoeffding Tree,Electricity,0.1251,81.726695,81.253257
5,Leveraging Bagging,Electricity,17.8204,89.159605,88.871276
6,Naive Bayes,Electricity,0.0787,73.362465,72.83912
7,k-Nearest Neighbors,Electricity,2.1488,84.08148,83.680705
8,SGDClassifier,Electricity,7.7692,84.180791,83.801233


In [52]:
adwin = ensemble.ADWINBaggingClassifier(model=(preprocessing.StandardScaler() |linear_model.LogisticRegression()),n_models=3,  seed=42)
bagging = ensemble.BaggingClassifier( model=( preprocessing.StandardScaler() |linear_model.LogisticRegression()),n_models=3,seed=42)
lev_bagging = ensemble.LeveragingBaggingClassifier( model=( preprocessing.StandardScaler() |linear_model.LogisticRegression()),n_models=3,seed=42)
arf = forest.ARFClassifier(seed = 42)
knn = (preprocessing.StandardScaler() | ensemble.BaggingClassifier( model=( preprocessing.StandardScaler() |linear_model.LogisticRegression()),n_models=3,seed=42))
hoeffdingadaptive = preprocessing.StandardScaler() | tree.HoeffdingAdaptiveTreeClassifier(seed=42)
hoeffding = preprocessing.StandardScaler() | tree.HoeffdingTreeClassifier()
SGD = (preprocessing.StandardScaler() | linear_model.LogisticRegression(optim.SGD(0.1)))
NB = preprocessing.StandardScaler() | naive_bayes.GaussianNB()

In [70]:
Datasets_river = {'Electricity' : ('elec_stream.csv', {'converters': {'1': float, '2': float, '3': float, '4': float, '5': float, '6': float, '7': float, '8': float}}),
                 'Electricity Tiny' : ('elec_stream_tiny.csv', {'converters': {'1': float, '2': float, '3': float, '4': float, '5': float, '6': float}})
                 }

Models_river = {'ADWIN bagging': adwin,
         'Adaptive Random Forest': arf,
         'Bagging': bagging,
         'Hoeffding Adaptive Tree': hoeffdingadaptive,
         'Hoeffding Tree': hoeffding,
         'Leveraging Bagging': lev_bagging,
         'Naive Bayes': NB,
         'k-Nearest Neighbors': knn,
         'SGDClassifier': SGD}

def get_performance(Models, Datasets):
    results = []
    for model_name in Models.keys():
        model = Models[model_name]
        for data_name in Datasets.keys():
            beginning = time.time()   #XXX - put the time measurements at the same place
            print(f"River : Model {model_name} and dataset {data_name}")
            dataset_csv, params = Datasets[data_name]
            metric_acc = metrics.Accuracy()
            metric_f1 = metrics.F1()
            
            for x, y in stream.iter_csv(dataset_csv, target = 'target', **params):
                model.learn_one(x, y)
                yp = model.predict_one(x)
                metric_acc.update(y, yp)
                metric_f1.update(y, yp)
            
            end = time.time()
            result_model = {'Model': model_name,
                    'Dataset': data_name,
                    'Time (seconds)': round(end - beginning, 4),
                    'Accuracy': metric_acc,
                    'F1': metric_f1}
            results.append(result_model)
    return results

results_river = get_performance(Models_river, Datasets_river)


River : Model ADWIN bagging and dataset Electricity
River : Model ADWIN bagging and dataset Electricity Tiny
River : Model Adaptive Random Forest and dataset Electricity
River : Model Adaptive Random Forest and dataset Electricity Tiny
River : Model Bagging and dataset Electricity
River : Model Bagging and dataset Electricity Tiny
River : Model Hoeffding Adaptive Tree and dataset Electricity
River : Model Hoeffding Adaptive Tree and dataset Electricity Tiny
River : Model Hoeffding Tree and dataset Electricity
River : Model Hoeffding Tree and dataset Electricity Tiny
River : Model Leveraging Bagging and dataset Electricity
River : Model Leveraging Bagging and dataset Electricity Tiny
River : Model Naive Bayes and dataset Electricity
River : Model Naive Bayes and dataset Electricity Tiny
River : Model k-Nearest Neighbors and dataset Electricity
River : Model k-Nearest Neighbors and dataset Electricity Tiny
River : Model SGDClassifier and dataset Electricity
River : Model SGDClassifier an

In [62]:
pd.DataFrame(results_river)

Unnamed: 0,Model,Dataset,Time (seconds),Accuracy,F1
0,ADWIN bagging,Electricity,3.4255,Accuracy: 0.00%,F1: 0.00%
1,Adaptive Random Forest,Electricity,26.2944,Accuracy: 95.48%,F1: 0.00%
2,Bagging,Electricity,2.3285,Accuracy: 0.00%,F1: 0.00%
3,Hoeffding Adaptive Tree,Electricity,4.067,Accuracy: 84.21%,F1: 0.00%
4,Hoeffding Tree,Electricity,1.7792,Accuracy: 80.91%,F1: 0.00%
5,Leveraging Bagging,Electricity,11.0919,Accuracy: 0.00%,F1: 0.00%
6,Naive Bayes,Electricity,3.2377,Accuracy: 76.18%,F1: 0.00%
7,k-Nearest Neighbors,Electricity,2.8272,Accuracy: 0.00%,F1: 0.00%
8,SGDClassifier,Electricity,0.7921,Accuracy: 0.00%,F1: 0.00%
