In [1]:
# General
import pandas as pd
import time

# CapyMOA 
from capymoa import datasets
from capymoa.classifier import AdaptiveRandomForestClassifier, HoeffdingAdaptiveTree, HoeffdingTree, LeveragingBagging, KNN, NaiveBayes, OnlineAdwinBagging, OnlineBagging, SGDClassifier
from capymoa.regressor import PassiveAggressiveRegressor, SGDRegressor, TargetMean, KNNRegressor, AdaptiveRandomForestRegressor
from capymoa.evaluation import prequential_evaluation

# River
from river import stream
from river import ensemble
from river import evaluate
from river import linear_model
from river import metrics
from river import optim
from river import preprocessing
from river import naive_bayes
from river import tree
from river import forest
from river import dummy
from river import stats
from river import neighbors




## Part I - Binary Classifiers

The classifiers I found on both River and CapyMOA were : 
- ADWIN Bagging, (capymoa OnlineAdwinBagging)
- Adaptive Random Forest,
- Bagging, 
- Hoeffding Adaptive Tree, 
- Hoeffding Tree, 
- Leveraging Bagging, 
- Naive Bayes, 
- k-Nearest Neighbors, 
- sklearn SGDClassifier

We will compare both libraries on these classifiers.

The datasets we will use are from CapyMOA and are the following : 
- Electricity
- Electricity tiny

### I.1) Getting the data from CapyMOA 

In [3]:
elec_stream = datasets.Electricity()
variables = ['1', '2', '3', '4', '5', '6', '7','8' ] #XXX - get the real variables

def dataset_to_csv(stream, file_name, variables):
    data = []
    while stream.has_more_instances():
        inst = {}
        instance = stream.next_instance()
        x = instance.x
        try : 
            y = instance.y_index
        except:
            y = instance.y_value
            
        for i in range (len(x)) : 
            inst[variables[i]] = x[i]
        inst['target'] = y
        data.append(inst)
    
    df = pd.DataFrame(data)
    df.to_csv(file_name, index=False, sep = ',')

dataset_to_csv(elec_stream, 'elec_stream.csv', variables)


In [4]:
elec_stream_tiny = datasets.ElectricityTiny()
variables = ['1', '2', '3', '4', '5', '6', '7','8' ] #XXX - get the real variables

dataset_to_csv(elec_stream_tiny, 'elec_stream_tiny.csv', variables)


## I.2) Evaluating with CapyMOA

In [6]:
Datasets_capymoa = {'Electricity' : elec_stream, 'Electricity Tiny': elec_stream_tiny}
Models_capymoa = {'ADWIN bagging': OnlineAdwinBagging,
         'Adaptive Random Forest': AdaptiveRandomForestClassifier ,
         'Bagging': OnlineBagging,
         'Hoeffding Adaptive Tree': HoeffdingAdaptiveTree ,
         'Hoeffding Tree': HoeffdingTree,
         'Leveraging Bagging': LeveragingBagging,
         'Naive Bayes': NaiveBayes,
         'k-Nearest Neighbors': KNN,
         'SGDClassifier': SGDClassifier}
#metrics_river = {'F1' : metrics.F1(), 'Accuracy' : metrics.Accuracy()}

def get_performance_capymoa(Models, Datasets):
    results = []
    for model_name in Models.keys():
        model = Models[model_name]
        for data_name in Datasets.keys():
            stream = Datasets[data_name]
            print(f"CapyMOA : Model {model_name} and dataset {data_name}")
            result_model = dict()
            beginning = time.time()
            
            classifier = model(stream.get_schema())
            res = prequential_evaluation(stream, classifier)
            end = time.time()
            try :
                result_model = {'Model': model_name,
                    'Dataset': data_name,
                    'Time (seconds)': round(end - beginning, 4),
                    'Accuracy': res['cumulative'].accuracy(),
                    'F1': res['cumulative'].f1_score()}
            except:
                result_model = {'Model': model_name,
                    'Dataset': data_name,
                    'Time (seconds)': round(end - beginning, 4),
                    'RMSE': res['cumulative'].rmse()
                    #,'F1': res['cumulative'].f1_score() XXX - add more metrics
                               }
            results.append(result_model)
    return results

results_capymoa = get_performance_capymoa(Models_capymoa, Datasets_capymoa)


CapyMOA : Model ADWIN bagging and dataset Electricity
CapyMOA : Model ADWIN bagging and dataset Electricity Tiny
CapyMOA : Model Adaptive Random Forest and dataset Electricity
CapyMOA : Model Adaptive Random Forest and dataset Electricity Tiny
CapyMOA : Model Bagging and dataset Electricity
CapyMOA : Model Bagging and dataset Electricity Tiny
CapyMOA : Model Hoeffding Adaptive Tree and dataset Electricity
CapyMOA : Model Hoeffding Adaptive Tree and dataset Electricity Tiny
CapyMOA : Model Hoeffding Tree and dataset Electricity
CapyMOA : Model Hoeffding Tree and dataset Electricity Tiny
CapyMOA : Model Leveraging Bagging and dataset Electricity
CapyMOA : Model Leveraging Bagging and dataset Electricity Tiny
CapyMOA : Model Naive Bayes and dataset Electricity
CapyMOA : Model Naive Bayes and dataset Electricity Tiny
CapyMOA : Model k-Nearest Neighbors and dataset Electricity
CapyMOA : Model k-Nearest Neighbors and dataset Electricity Tiny
CapyMOA : Model SGDClassifier and dataset Electric

In [7]:
pd.DataFrame(results_capymoa)

Unnamed: 0,Model,Dataset,Time (seconds),Accuracy,F1
0,ADWIN bagging,Electricity,7.4626,84.939972,84.513691
1,ADWIN bagging,Electricity Tiny,0.4137,85.25,84.821727
2,Adaptive Random Forest,Electricity,45.3267,90.066649,89.807241
3,Adaptive Random Forest,Electricity Tiny,1.4211,89.0,88.643355
4,Bagging,Electricity,4.9993,82.825742,82.348786
5,Bagging,Electricity Tiny,0.2311,85.0,84.568468
6,Hoeffding Adaptive Tree,Electricity,0.2574,83.907133,83.458512
7,Hoeffding Adaptive Tree,Electricity Tiny,0.0101,84.15,83.320052
8,Hoeffding Tree,Electricity,0.1395,81.726695,81.253257
9,Hoeffding Tree,Electricity Tiny,0.0507,82.65,82.098119


XXX - Comment

### I.3) Evaluating with River

In [9]:
adwin = ensemble.ADWINBaggingClassifier(model=(preprocessing.StandardScaler() |linear_model.LogisticRegression()),n_models=3,  seed=42)
bagging = ensemble.BaggingClassifier( model=( preprocessing.StandardScaler() |linear_model.LogisticRegression()),n_models=3,seed=42)
lev_bagging = ensemble.LeveragingBaggingClassifier( model=( preprocessing.StandardScaler() |linear_model.LogisticRegression()),n_models=3,seed=42)
arf = forest.ARFClassifier(seed = 42)
knn = (preprocessing.StandardScaler() | ensemble.BaggingClassifier( model=( preprocessing.StandardScaler() |linear_model.LogisticRegression()),n_models=3,seed=42))
hoeffdingadaptive = preprocessing.StandardScaler() | tree.HoeffdingAdaptiveTreeClassifier(seed=42)
hoeffding = preprocessing.StandardScaler() | tree.HoeffdingTreeClassifier()
SGD = (preprocessing.StandardScaler() | linear_model.LogisticRegression(optim.SGD(0.1)))
NB = preprocessing.StandardScaler() | naive_bayes.GaussianNB()


In [10]:
Datasets_river = {'Electricity' : ('elec_stream.csv', {'converters': {'1': float, '2': float, '3': float, '4': float, '5': float, '6': float, '7': float, '8': float}}),
                 'Electricity Tiny' : ('elec_stream_tiny.csv', {'converters': {'1': float, '2': float, '3': float, '4': float, '5': float, '6': float}})
                 }

Models_river = {'ADWIN bagging': adwin,
         'Adaptive Random Forest': arf,
         'Bagging': bagging,
         'Hoeffding Adaptive Tree': hoeffdingadaptive,
         'Hoeffding Tree': hoeffding,
         'Leveraging Bagging': lev_bagging,
         'Naive Bayes': NB,
         'k-Nearest Neighbors': knn,
         'SGDClassifier': SGD}

Metrics_river = {'F1' : metrics.F1(), 'Accuracy' : metrics.Accuracy()}

def get_performance_river(Models, Datasets, Metric):
    results = []
    for model_name in Models.keys():
        model = Models[model_name]
        for data_name in Datasets.keys():
            metrics = {name: metr.clone() for name, metr in Metric.items()}
            result_model={}
            beginning = time.time()   #XXX - put the time measurements at the same place
            print(f"River : Model {model_name} and dataset {data_name}")
            dataset_csv, params = Datasets[data_name]
            
            for x, y in stream.iter_csv(dataset_csv, target = 'target', **params):
                model.learn_one(x, y)
                yp = model.predict_one(x)
                for metric_name in Metric.keys():
                    metr = metrics[metric_name]
                    metr.update(y, yp)
            end = time.time()
            
            for metric_name in metrics.keys():
                metr = metrics[metric_name]
                if 'Model' not in result_model.keys():
                    result_model = {'Model': model_name,
                                    'Dataset': data_name,
                                    'Time (seconds)': round(end - beginning, 4),
                                    metric_name: metr}
                else :
                    result_model[metric_name] = metr
            results.append(result_model)
    return results

results_river = get_performance_river(Models_river, Datasets_river, Metrics_river)


River : Model ADWIN bagging and dataset Electricity
River : Model ADWIN bagging and dataset Electricity Tiny
River : Model Adaptive Random Forest and dataset Electricity
River : Model Adaptive Random Forest and dataset Electricity Tiny
River : Model Bagging and dataset Electricity
River : Model Bagging and dataset Electricity Tiny
River : Model Hoeffding Adaptive Tree and dataset Electricity
River : Model Hoeffding Adaptive Tree and dataset Electricity Tiny
River : Model Hoeffding Tree and dataset Electricity
River : Model Hoeffding Tree and dataset Electricity Tiny
River : Model Leveraging Bagging and dataset Electricity
River : Model Leveraging Bagging and dataset Electricity Tiny
River : Model Naive Bayes and dataset Electricity
River : Model Naive Bayes and dataset Electricity Tiny
River : Model k-Nearest Neighbors and dataset Electricity
River : Model k-Nearest Neighbors and dataset Electricity Tiny
River : Model SGDClassifier and dataset Electricity
River : Model SGDClassifier an

In [11]:
pd.DataFrame(results_river)

Unnamed: 0,Model,Dataset,Time (seconds),F1,Accuracy
0,ADWIN bagging,Electricity,3.1799,F1: 0.00%,Accuracy: 0.00%
1,ADWIN bagging,Electricity Tiny,0.1316,F1: 0.00%,Accuracy: 0.00%
2,Adaptive Random Forest,Electricity,26.0999,F1: 0.00%,Accuracy: 95.48%
3,Adaptive Random Forest,Electricity Tiny,0.9361,F1: 0.00%,Accuracy: 94.95%
4,Bagging,Electricity,2.5473,F1: 0.00%,Accuracy: 0.00%
5,Bagging,Electricity Tiny,0.0944,F1: 0.00%,Accuracy: 0.00%
6,Hoeffding Adaptive Tree,Electricity,4.1983,F1: 0.00%,Accuracy: 84.21%
7,Hoeffding Adaptive Tree,Electricity Tiny,0.1644,F1: 0.00%,Accuracy: 87.40%
8,Hoeffding Tree,Electricity,1.8528,F1: 0.00%,Accuracy: 80.91%
9,Hoeffding Tree,Electricity Tiny,0.0855,F1: 0.00%,Accuracy: 75.70%


### I.4) Comparisons of the performances

XXX - Comment 

## Part II - Multi_class Classifiers

The multi-class classifiers I found on both River and CapyMOA were : 
XXX 

We will compare both libraries on these classifiers.

The datasets we will use are from CapyMOA and are the following : 
XXX

### II.1) Getting the data from CapyMOA 

## Part III - Regression

The regressors I found on both River and CapyMOA were : 
- Passive-Aggressive Regressor
- Stochastic Gradient Tree
- [baseline] Mean predictor
- k-Nearest Neighbors
- Adaptive Random Forest
  
We will compare both libraries on these regressors.

The datasets we will use are from CapyMOA and are the following : 
XXX

### III.1) Getting the data from CapyMOA 

In [15]:
fried_stream = datasets.Fried()
variables = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'] #XXX - get the real variables

dataset_to_csv(fried_stream, 'Fried.csv', variables)


In [16]:
bike_stream = datasets.Bike()
variables = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'] #XXX - get the real variables

dataset_to_csv(bike_stream, 'Bike.csv', variables)


### III.2) Evaluating with CapyMOA

In [18]:
Datasets_capymoa = {'Fried' : fried_stream, 'Bike': bike_stream}
Models_capymoa = {'Passive-Aggressive Regressor': PassiveAggressiveRegressor,
         #'Stochastic Gradient Tree': XXX,
         '[baseline] Mean predictor': TargetMean,
         'k-Nearest Neighbors': KNNRegressor,
         'Adaptative Random Forest': AdaptiveRandomForestRegressor}

results_capymoa = get_performance_capymoa(Models_capymoa, Datasets_capymoa)


CapyMOA : Model Passive-Aggressive Regressor and dataset Fried




CapyMOA : Model Passive-Aggressive Regressor and dataset Bike




CapyMOA : Model [baseline] Mean predictor and dataset Fried
CapyMOA : Model [baseline] Mean predictor and dataset Bike
CapyMOA : Model k-Nearest Neighbors and dataset Fried
CapyMOA : Model k-Nearest Neighbors and dataset Bike
CapyMOA : Model Adaptative Random Forest and dataset Fried
CapyMOA : Model Adaptative Random Forest and dataset Bike


In [None]:
pd.DataFrame(results_capymoa)

XXX - Comment

### III.3) Evaluating with River

In [20]:
PA_reg_mode1 = preprocessing.StandardScaler() | linear_model.PARegressor(mode=1, C=1.0)
PA_reg_mode2 = preprocessing.StandardScaler() | linear_model.PARegressor(mode=2, C=1.0)
SGT_reg = preprocessing.StandardScaler() | tree.SGTRegressor(max_depth=5)
Mean_reg = preprocessing.StandardScaler() | dummy.StatisticRegressor(stats.Mean())
KNN_reg = preprocessing.StandardScaler() | neighbors.KNNRegressor()
ARF_reg = ARF_reg = (preprocessing.StandardScaler() | forest.ARFRegressor(seed=42))

In [21]:
Datasets_river = {'Fried' : ('Fried.csv', {'converters': {'1': float, '2': float, '3': float, '4': float, '5': float, '6': float, '7': float, '8': float, '9': float, '10': float, 'target':float}})
                  ,'Bike' : ('Bike.csv', {'converters': {'1': float, '2': float, '3': float, '4': float, '5': float, '6': float, '7': float, '8': float, '9': float, '10': float, '11': float, '12': float, 'target':float}})
                 }

Models_river = {'Passive-Aggressive Regressor, mode 1': PA_reg_mode1,
         'Passive-Aggressive Regressor, mode 2': PA_reg_mode2,
         'Stochastic Gradient Tree': SGT_reg,
         '[baseline] Mean predictor': Mean_reg,
         'k-Nearest Neighbors': KNN_reg,
         'Adaptative Random Forest': ARF_reg}

Metrics_river = {'MAE' : metrics.MAE(), 'RMSE' : metrics.RMSE(), 'R2' : metrics.R2()}

Regressors_results_river = get_performance_river(Models_river, Datasets_river, Metrics_river)


River : Model Passive-Aggressive Regressor, mode 1 and dataset Fried
River : Model Passive-Aggressive Regressor, mode 1 and dataset Bike
River : Model Passive-Aggressive Regressor, mode 2 and dataset Fried
River : Model Passive-Aggressive Regressor, mode 2 and dataset Bike
River : Model Stochastic Gradient Tree and dataset Fried
River : Model Stochastic Gradient Tree and dataset Bike
River : Model [baseline] Mean predictor and dataset Fried
River : Model [baseline] Mean predictor and dataset Bike
River : Model k-Nearest Neighbors and dataset Fried
River : Model k-Nearest Neighbors and dataset Bike
River : Model Adaptative Random Forest and dataset Fried
River : Model Adaptative Random Forest and dataset Bike


In [22]:
pd.DataFrame(Regressors_results_river)

Unnamed: 0,Model,Dataset,Time (seconds),F1,Accuracy
0,ADWIN bagging,Electricity,3.1799,F1: 0.00%,Accuracy: 0.00%
1,ADWIN bagging,Electricity Tiny,0.1316,F1: 0.00%,Accuracy: 0.00%
2,Adaptive Random Forest,Electricity,26.0999,F1: 0.00%,Accuracy: 95.48%
3,Adaptive Random Forest,Electricity Tiny,0.9361,F1: 0.00%,Accuracy: 94.95%
4,Bagging,Electricity,2.5473,F1: 0.00%,Accuracy: 0.00%
5,Bagging,Electricity Tiny,0.0944,F1: 0.00%,Accuracy: 0.00%
6,Hoeffding Adaptive Tree,Electricity,4.1983,F1: 0.00%,Accuracy: 84.21%
7,Hoeffding Adaptive Tree,Electricity Tiny,0.1644,F1: 0.00%,Accuracy: 87.40%
8,Hoeffding Tree,Electricity,1.8528,F1: 0.00%,Accuracy: 80.91%
9,Hoeffding Tree,Electricity Tiny,0.0855,F1: 0.00%,Accuracy: 75.70%
