## Load data from the Kafka server

In [None]:
# Load the `sql` IPython extension so the %sql line magic is available.
%load_ext sql
# Point the %sql magic at the Trino endpoint (connection URL ends in /kafka).
%sql trino://streambased-server:8080/kafka
# Select all rows of kafka.streambased.recipes whose rating is 5 and keep the
# result set in `result` for the next cell.
result = %sql SELECT * FROM kafka.streambased.recipes WHERE rating = 5 

The sql extension is already loaded. To reload it, use:
  %reload_ext sql
Deploy Dash apps for free on Ploomber Cloud! Learn more: https://ploomber.io/s/signup


### Load the query result into a pandas DataFrame

In [19]:
import pandas as pd
# Convert the %sql result set into a pandas DataFrame; `df` is the input of the
# training cell below.
df = result.DataFrame()

### Train the model 

In [20]:
import time
from joblib import dump, load
from river import tree, compose, metrics
import matplotlib.pyplot as plt

In [10]:
import pandas as pd
from joblib import dump, load
from river import tree, metrics

# Incrementally train a Hoeffding tree that predicts `cuisine` from the recipe
# columns in `df`, resuming from a previously saved model when one exists.

# Columns that must not be used as model inputs: the `rating` column, the
# `_`-prefixed per-message metadata columns, and the prediction target itself.
non_feature_columns = {
    'rating', '_partition_id', '_partition_offset',
    '_message_corrupt', '_message', '_headers',
    '_message_length', '_key_corrupt', '_key',
    '_key_length', '_timestamp', 'cuisine',
}
# Keep the DataFrame's column order so feature dicts are built consistently.
features = [column for column in df.columns if column not in non_feature_columns]

# Initialize or load the model
model_path = "hoeffding_tree_model.joblib"
model = None
try:
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code. Only load model files that this notebook (or a trusted source) wrote.
    model = load(model_path)
    print("Model loaded successfully.")
except FileNotFoundError:
    # Expected on the first run: no model has been saved yet.
    pass
except Exception as load_error:
    # Stay best-effort (train from scratch), but say why: the save step below
    # will overwrite the file that failed to load.
    print("Could not load the saved model, starting from scratch:", load_error)

if model is None:
    model = tree.HoeffdingTreeClassifier(
        grace_period=25,  # Reduce the grace period to make the tree adapt more quickly
        max_depth=30,  # Limit the maximum depth to control the complexity of the tree
        split_criterion='gini',  # Use Gini impurity as the split criterion
        leaf_prediction='nba',  # Use Naive Bayes Adaptive for leaf prediction
        nb_threshold=10,  # Increase the number of instances required for Naive Bayes
        binary_split=True,  # Allow only binary splits
        min_branch_fraction=0.005,  # Lower the minimum fraction of samples required for branches
        max_share_to_split=0.95,  # Allow splits even if the majority class share is high
        max_size=200  # Increase the maximum size of the tree
    )
    print("New model initialized.")

# Prepare a metric to track performance
# NOTE(review): `metric` and `accuracy_values` are created here but never
# updated in this cell -- presumably a later cell uses them; confirm.
metric = metrics.Accuracy()

total_trained_rows = 0
accuracy_values = []

# Simulate a data stream and train the model incrementally, one row at a time
for _, row in df.iterrows():
    x = row[features].to_dict()
    y = row['cuisine']
    model.learn_one(x, y)
    total_trained_rows += 1

print("Model Trained")
print("Total number of rows trained:", total_trained_rows)

# Save the model (best-effort: a failed save is reported, not raised)
try:
    dump(model, model_path)
    print("Model saved successfully.")
except Exception as e:
    print("Error saving the model:", e)


New model initialized.
Model Trained
Total number of rows trained: 9123
Model saved successfully.
