In [16]:
import pandas as pd
from skmultiflow.data import FileStream
from skmultiflow.trees import HoeffdingTree
from skmultiflow.meta import AdaptiveRandomForestClassifier
from skmultiflow.evaluation import EvaluatePrequential

Read Data

In [4]:
csv_file_path = 'stream_data.csv'
stream = FileStream(csv_file_path)
stream.prepare_for_use()

New instances of the Stream class are now ready to use after instantiation.


Get Size of Data

In [10]:
df = pd.read_csv(csv_file_path)
num_rows = len(df)
print(f"The CSV file contains {num_rows} rows.")

The CSV file contains 4000001 rows.


Check the Read Data Stream

In [5]:
print("First few samples from the stream:")
for i in range(5):
    X, y = stream.next_sample()
    print(f"Sample {i+1}: X={X}, y={y}")

First few samples from the stream:
Sample 1: X=[[2.40710035 2.39048173 2.03172447]], y=[0]
Sample 2: X=[[8.77528774 3.22438503 5.71692091]], y=[1]
Sample 3: X=[[3.71596923 5.47447819 9.84467718]], y=[1]
Sample 4: X=[[3.64947949 8.91395958 3.02884623]], y=[1]
Sample 5: X=[[4.20265406 0.89488276 3.48123019]], y=[0]


Define Models

In [6]:
ht = HoeffdingTree()
arf = AdaptiveRandomForestClassifier()

The old name will be removed in v0.7.0


Define Sample Sizes

In [20]:
pretrain_size = 2000000
max_samples = 4000000

Train & Evaluation Using HT

In [21]:
evaluator_ht = EvaluatePrequential(show_plot=False,
                                   pretrain_size=pretrain_size,
                                   max_samples=max_samples,
                                   metrics=['accuracy', 'kappa'])

In [22]:
evaluator_ht.evaluate(stream=stream, model=ht, model_names=['Hoeffding Tree'])

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 2000000 sample(s).
Evaluating...
 #################### [100%] [187.02s]
Processed samples: 4000000
Mean performance:
Hoeffding Tree - Accuracy     : 0.9993
Hoeffding Tree - Kappa        : 0.9984


[HoeffdingTreeClassifier(binary_split=False, grace_period=200,
                         leaf_prediction='nba', max_byte_size=33554432,
                         memory_estimate_period=1000000, nb_threshold=0,
                         no_preprune=False, nominal_attributes=None,
                         remove_poor_atts=False, split_confidence=1e-07,
                         split_criterion='info_gain', stop_mem_management=False,
                         tie_threshold=0.05)]

Restart the Stream 

In [23]:
stream.restart()

Train & Evaluation Using ARF

In [24]:
evaluator_arf = EvaluatePrequential(show_plot=False,
                                    pretrain_size=pretrain_size,
                                    max_samples=max_samples,
                                    metrics=['accuracy', 'kappa'])

In [25]:
evaluator_arf.evaluate(stream=stream, model=arf, model_names=['Adaptive Random Forest'])

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 2000000 sample(s).
Evaluating...
 #################### [100%] [11616.78s]
Processed samples: 4000000
Mean performance:
Adaptive Random Forest - Accuracy     : 0.9982
Adaptive Random Forest - Kappa        : 0.9959


[AdaptiveRandomForestClassifier(binary_split=False, disable_weighted_vote=False,
                                drift_detection_method=ADWIN(delta=0.001),
                                grace_period=50, lambda_value=6,
                                leaf_prediction='nba', max_byte_size=33554432,
                                max_features=2, memory_estimate_period=2000000,
                                n_estimators=10, nb_threshold=0,
                                no_preprune=False, nominal_attributes=None,
                                performance_metric='acc', random_state=None,
                                remove_poor_atts=False, split_confidence=0.01,
                                split_criterion='info_gain',
                                stop_mem_management=False, tie_threshold=0.05,