# Kaggle Cloud Ops: Queen Bee Acoustics + Makueni Apiary Intelligence

This unified notebook stitches together:

1. **Queen Bee acoustic detection (CNN + hyperparameter tuning)**
2. **Makueni Apiary intelligence workflows (weather, NDVI, telemetry, hive stress ML)**

> **Kaggle usage:** Attach the `harshkumar1711/beehive-audio-dataset-with-queen-and-without-queen` dataset plus any `content/main-data` exports as Kaggle data sources. All intermediate files are written under `content/` so the same notebook also works locally.

In [17]:
!pip install -q earthengine-api ipyleaflet ipywidgets keras-tuner librosa tqdm

In [18]:
import os
import shutil
from pathlib import Path

PROJECT_ROOT = Path.cwd()
DEFAULT_CONTENT = PROJECT_ROOT / "content"
KAGGLE_WORKING = Path("/kaggle/working")

if DEFAULT_CONTENT.exists():
    CONTENT_ROOT = DEFAULT_CONTENT.resolve()
else:
    CONTENT_ROOT = (KAGGLE_WORKING / "content").resolve()
    CONTENT_ROOT.mkdir(parents=True, exist_ok=True)

os.environ["MERGED_CONTENT_ROOT"] = str(CONTENT_ROOT)
MAIN_DATA_DIR = (CONTENT_ROOT / "main-data")
MAIN_DATA_DIR.mkdir(parents=True, exist_ok=True)

KAGGLE_INPUT_ROOT = Path("/kaggle/input")

def _stage_dataset(keyword, target_subdir):
    if not KAGGLE_INPUT_ROOT.exists():
        return None
    matches = [p for p in KAGGLE_INPUT_ROOT.iterdir() if keyword in p.name.lower()]
    if not matches:
        print(f"[setup] Kaggle input dataset containing '{keyword}' not found.")
        return None
    source = matches[0]
    target = CONTENT_ROOT / target_subdir
    shutil.rmtree(target, ignore_errors=True)
    shutil.copytree(source, target, dirs_exist_ok=True)
    print(f"[setup] Staged {source.name} -> {target}")
    return target

def _maybe_stage(keyword, subdir):
    try:
        _stage_dataset(keyword, subdir)
    except Exception as exc:
        print(f"[setup] Skipping auto-stage for {keyword}: {exc}")

_maybe_stage("beehive", "beehive_audio")
_maybe_stage("makueni", "main-data")

print(f"CONTENT_ROOT -> {CONTENT_ROOT}")


[setup] Staged beehive-audio-dataset-with-queen-and-without-queen -> /kaggle/working/content/beehive_audio
[setup] Kaggle input dataset containing 'makueni' not found.
CONTENT_ROOT -> /kaggle/working/content


In [19]:
import calendar
import datetime as dt
import gc
import io
import json
import math
import os
import time
import warnings
from pathlib import Path

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import keras_tuner as kt

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
    RocCurveDisplay
)
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

import requests

warnings.filterwarnings("ignore")
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (8, 4)

CONTENT_ROOT = Path(os.environ["MERGED_CONTENT_ROOT"])
MAIN_DATA_DIR = CONTENT_ROOT / "main-data"


2025-12-23 09:13:30.033031: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766481210.054537    4466 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766481210.061049    4466 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766481210.078566    4466 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766481210.078601    4466 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766481210.078603    4466 computation_placer.cc:177] computation placer alr

## Queen Bee Acoustic Detection Pipeline

In [20]:
from pathlib import Path

def _discover_audio_dataset(content_root: Path) -> Path:
    search_root = Path("/kaggle/input/beehive-audio-dataset-with-queen-and-without-queen")
    if not search_root.exists():
        raise FileNotFoundError(
            "Dataset not staged. Attach Kaggle dataset "
            "'harshkumar1711/beehive-audio-dataset-with-queen-and-without-queen'."
        )

    for candidate in sorted(search_root.rglob("Dataset")):
        if (candidate / "Bee Hive Audios").exists():
            return candidate

    raise FileNotFoundError("Could not locate 'Dataset/Bee Hive Audios'.")

# Discover dataset (READ-ONLY)
AUDIO_DATASET_ROOT = _discover_audio_dataset(None)

BEEHIVE_AUDIO_DIR = next(AUDIO_DATASET_ROOT.glob("**/Bee Hive Audios"))
QUEEN_PRESENT_DIR = BEEHIVE_AUDIO_DIR / "QueenBee Present"
QUEEN_ABSENT_DIR = BEEHIVE_AUDIO_DIR / "QueenBee Absent"
EXTERNAL_DIR = AUDIO_DATASET_ROOT / "External Noise"

# WRITEABLE spectrogram directory
SPECTROGRAM_DIR = Path("/kaggle/working/spectrograms")
SPECTROGRAM_PRESENT = SPECTROGRAM_DIR / "present"
SPECTROGRAM_ABSENT = SPECTROGRAM_DIR / "absent"
SPECTROGRAM_EXTERNAL = SPECTROGRAM_DIR / "external"

for path in [SPECTROGRAM_PRESENT, SPECTROGRAM_ABSENT, SPECTROGRAM_EXTERNAL]:
    path.mkdir(parents=True, exist_ok=True)

print("Audio dataset root (read-only):", AUDIO_DATASET_ROOT)
print("Spectrogram cache (writable):", SPECTROGRAM_DIR)


Audio dataset root (read-only): /kaggle/input/beehive-audio-dataset-with-queen-and-without-queen/Dataset
Spectrogram cache (writable): /kaggle/working/spectrograms


In [21]:
try:
    tpu_resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu_resolver)
    tf.tpu.experimental.initialize_tpu_system(tpu_resolver)
    strategy = tf.distribute.TPUStrategy(tpu_resolver)
    ACCELERATOR = "TPU"
except (ValueError, tf.errors.NotFoundError):
    gpus = tf.config.list_physical_devices("GPU")
    if gpus:
        for gpu in gpus:
            try:
                tf.config.experimental.set_memory_growth(gpu, True)
            except Exception:
                pass
        # Default to single-replica strategy for Kaggle GPU stability
        strategy = tf.distribute.get_strategy()
        ACCELERATOR = "GPU"
    else:
        strategy = tf.distribute.get_strategy()
        ACCELERATOR = "CPU"

print(f"Using {ACCELERATOR} via {strategy.__class__.__name__}")


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
Using 2xGPU via MirroredStrategy


I0000 00:00:1766481238.825224    4466 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1766481238.829195    4466 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [22]:
SAMPLE_RATE = 22050
DURATION = 3
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION

librosa.cache.clear()
plt.switch_backend("Agg")

def preprocess_and_save_spectrogram(audio_path: Path, output_image_path: Path, sr=SAMPLE_RATE, duration=DURATION):
    try:
        y, _ = librosa.load(audio_path, sr=sr)
        y, _ = librosa.effects.trim(y)
        y = librosa.to_mono(y) if y.ndim > 1 else y
        y = librosa.util.normalize(y)

        expected_samples = sr * duration
        if len(y) < expected_samples:
            y = np.pad(y, (0, expected_samples - len(y)), mode="constant")
        else:
            y = y[:expected_samples]

        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512, n_mels=128)
        mel_db = librosa.power_to_db(mel_spec, ref=np.max)

        plt.figure(figsize=(2, 2), dpi=64)
        librosa.display.specshow(mel_db, sr=sr, cmap="magma")
        plt.axis("off")
        output_image_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(output_image_path, bbox_inches="tight", pad_inches=0)
        plt.close()
    except Exception as exc:
        print(f"[spectrogram] Failed on {audio_path}: {exc}")

def _compute_progress(files, output_dir: Path):
    total = len(files)
    processed = sum((output_dir / f"{Path(f).stem}.png").exists() for f in files)
    return total, processed

def process_audio_folder(input_dir: Path, output_dir: Path, desc: str):
    if not input_dir.exists():
        print(f"[spectrogram] {input_dir} missing, skipping {desc}.")
        return
    wav_files = sorted([f for f in input_dir.iterdir() if f.suffix.lower() == ".wav"])
    total, processed = _compute_progress([f.name for f in wav_files], output_dir)
    with tqdm(total=total, initial=processed, desc=desc, unit="file") as pbar:
        for wav_path in wav_files:
            out_path = output_dir / f"{wav_path.stem}.png"
            if out_path.exists():
                pbar.update(1)
                continue
            preprocess_and_save_spectrogram(wav_path, out_path)
            gc.collect()
            pbar.update(1)

def process_external_folder(input_dir: Path, output_dir: Path):
    if not input_dir.exists():
        print("[spectrogram] External noise folder missing, skipping.")
        return
    audio_paths = []
    for root, _, files in os.walk(input_dir):
        audio_paths += [Path(root) / f for f in files if f.lower().endswith(".wav")]
    with tqdm(total=len(audio_paths), desc="External noise", unit="file") as pbar:
        for wav_path in audio_paths:
            out_path = output_dir / f"{wav_path.stem}.png"
            if out_path.exists():
                pbar.update(1)
                continue
            preprocess_and_save_spectrogram(wav_path, out_path)
            pbar.update(1)


[Memory(location=None)]: Flushing completely the cache


In [23]:
process_audio_folder(QUEEN_PRESENT_DIR, SPECTROGRAM_PRESENT, "QueenBee Present")
process_audio_folder(QUEEN_ABSENT_DIR, SPECTROGRAM_ABSENT, "QueenBee Absent")
process_external_folder(EXTERNAL_DIR, SPECTROGRAM_EXTERNAL)

QueenBee Present: 8000file [00:00, 77675.89file/s]             
QueenBee Absent: 4000file [00:00, 76544.68file/s]             
External noise: 100%|██████████| 2000/2000 [00:00<00:00, 60670.07file/s]


In [24]:
def count_pngs(folder: Path):
    return len([f for f in folder.glob("*.png")])

class_labels = ["present", "absent", "external"]
counts = [
    count_pngs(SPECTROGRAM_PRESENT),
    count_pngs(SPECTROGRAM_ABSENT),
    count_pngs(SPECTROGRAM_EXTERNAL),
]

plt.figure(figsize=(6, 4))
bars = plt.bar(class_labels, counts, color=["sienna", "peru", "gray"], edgecolor="black")
plt.ylim(0, max(counts) * 1.1 if counts else 10)
plt.title("Spectrogram Count per Class")
plt.ylabel("Images")
for bar in bars:
    y = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, y + max(1, y ** 0.5), int(y), ha="center", va="bottom")
plt.show()

print(dict(zip(class_labels, counts)))


{'present': 4000, 'absent': 2000, 'external': 2000}


In [25]:
IMG_SIZE = (128, 128)
BASE_BATCH_SIZE = 32
BATCH_SIZE = BASE_BATCH_SIZE  # Keep per-device batch size stable on Kaggle
SEED = 42

spectro_records = []
for class_dir in sorted(SPECTROGRAM_DIR.iterdir()):
    if class_dir.is_dir():
        label = class_dir.name
        for img_path in class_dir.glob("*.png"):
            spectro_records.append({"filepath": str(img_path), "label": label})

if not spectro_records:
    raise RuntimeError("No spectrograms were generated; run preprocessing above first.")

spectro_df = pd.DataFrame(spectro_records)
CLASS_NAMES = sorted(spectro_df["label"].unique())

train_df, temp_df = train_test_split(
    spectro_df,
    test_size=0.4,
    stratify=spectro_df["label"],
    random_state=SEED
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["label"],
    random_state=SEED
)

train_datagen = ImageDataGenerator(
    rescale=1./255,
    horizontal_flip=True,
    width_shift_range=0.05,
    height_shift_range=0.05
)
eval_datagen = ImageDataGenerator(rescale=1./255)

train_gen = train_datagen.flow_from_dataframe(
    train_df,
    x_col="filepath",
    y_col="label",
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode="sparse",
    classes=CLASS_NAMES,
    shuffle=True,
    seed=SEED
)

val_gen = eval_datagen.flow_from_dataframe(
    val_df,
    x_col="filepath",
    y_col="label",
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode="sparse",
    classes=CLASS_NAMES,
    shuffle=False,
    seed=SEED
)

test_gen = eval_datagen.flow_from_dataframe(
    test_df,
    x_col="filepath",
    y_col="label",
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode="sparse",
    classes=CLASS_NAMES,
    shuffle=False,
    seed=SEED
)

raw_class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array(CLASS_NAMES),
    y=train_df["label"]
)
CLASS_WEIGHTS = {
    train_gen.class_indices[label]: weight for label, weight in zip(CLASS_NAMES, raw_class_weights)
}
print("Class indices:", train_gen.class_indices)
print("Class weights:", CLASS_WEIGHTS)

ABSENT_CLASS_INDEX = train_gen.class_indices["absent"]

class SparseClassRecall(tf.keras.metrics.Metric):
    def __init__(self, class_id, name="sparse_class_recall", **kwargs):
        super().__init__(name=name, **kwargs)
        self.class_id = class_id
        self.true_positives = self.add_weight(name="tp", initializer="zeros")
        self.false_negatives = self.add_weight(name="fn", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_true = tf.cast(tf.reshape(y_true, [-1]), tf.int32)
        y_pred = tf.cast(tf.argmax(y_pred, axis=-1), tf.int32)
        class_mask = tf.cast(tf.equal(y_true, self.class_id), self.dtype)
        pred_mask = tf.cast(tf.equal(y_pred, self.class_id), self.dtype)
        if sample_weight is None:
            weights = tf.ones_like(class_mask)
        else:
            weights = tf.cast(tf.reshape(sample_weight, [-1]), self.dtype)
            weights = tf.broadcast_to(weights, tf.shape(class_mask))
        weighted_mask = class_mask * weights
        tp = tf.reduce_sum(pred_mask * weighted_mask)
        fn = tf.reduce_sum((1.0 - pred_mask) * weighted_mask)
        self.true_positives.assign_add(tp)
        self.false_negatives.assign_add(fn)

    def get_config(self):
        config = super().get_config()
        config.update({"class_id": int(self.class_id)})
        return config

    def result(self):
        return tf.math.divide_no_nan(self.true_positives, self.true_positives + self.false_negatives)

    def reset_states(self):
        self.true_positives.assign(0.0)
        self.false_negatives.assign(0.0)

def make_absent_recall(name="recall_absent"):
    return SparseClassRecall(class_id=ABSENT_CLASS_INDEX, name=name)



Found 5600 images belonging to 3 classes.
Found 4000 images belonging to 3 classes.
Found 4000 images belonging to 3 classes.
Class indices: {'absent': 0, 'external': 1, 'present': 2}


In [26]:
from tensorflow.keras.callbacks import EarlyStopping

def build_baseline_model():
    model = models.Sequential([
        layers.Conv2D(32, 3, activation="relu", padding="same", input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3)),
        layers.BatchNormalization(),
        layers.MaxPooling2D(),

        layers.Conv2D(64, 3, activation="relu", padding="same"),
        layers.BatchNormalization(),
        layers.MaxPooling2D(),

        layers.Conv2D(128, 3, activation="relu", padding="same"),
        layers.BatchNormalization(),
        layers.MaxPooling2D(),

        layers.GlobalAveragePooling2D(),
        layers.Dense(64, activation="relu"),
        layers.Dropout(0.5),
        layers.Dense(3, activation="softmax"),
    ])
    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy", make_absent_recall()]
    )
    return model

with strategy.scope():
    baseline_model = build_baseline_model()

baseline_callbacks = [
    EarlyStopping(monitor="val_recall_absent", mode="max", patience=3, restore_best_weights=True)
]

baseline_history = baseline_model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=20,
    class_weight=CLASS_WEIGHTS,
    callbacks=baseline_callbacks
)


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Redu

I0000 00:00:1766481304.711346    4582 cuda_dnn.cc:529] Loaded cuDNN version 91002
I0000 00:00:1766481306.260956    4584 cuda_dnn.cc:529] Loaded cuDNN version 91002


[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.7643 - loss: 0.5874INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 138ms/step - accuracy: 0.7654 - loss: 0.5851 - val_accuracy: 0.5000 - val_loss: 1.1700
Epoch 2/10
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 132ms/step - accuracy: 0.9596 - loss: 0.1312 - val_accuracy: 0.5000 - val_loss: 3.2782
Epoch 3/10
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 132ms/step - accuracy: 0.9805 - loss: 0.0736 - val_accuracy: 0.5000 - val_loss: 5.1786
Epoch 4/10
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 137ms/step - accuracy: 0.9887 - loss: 0.0481 - val_accuracy

In [31]:
from tensorflow.keras.callbacks import EarlyStopping
from pathlib import Path

def build_tunable_model(hp):
    model = models.Sequential([
        layers.Conv2D(
            hp.Choice("conv1", [32, 64]), 3,
            activation="relu", padding="same",
            input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3)
        ),
        layers.BatchNormalization(),
        layers.MaxPooling2D(),

        layers.Conv2D(hp.Choice("conv2", [64, 128]), 3, activation="relu", padding="same"),
        layers.BatchNormalization(),
        layers.MaxPooling2D(),

        layers.Conv2D(hp.Choice("conv3", [128, 256]), 3, activation="relu", padding="same"),
        layers.BatchNormalization(),
        layers.MaxPooling2D(),

        layers.GlobalAveragePooling2D(),
        layers.Dense(hp.Int("dense_units", 64, 128, step=32), activation="relu"),
        layers.Dropout(hp.Float("dropout", 0.3, 0.6, step=0.1)),
        layers.Dense(3, activation="softmax"),
    ])

    model.compile(
        optimizer=hp.Choice("optimizer", ["adam", "nadam"]),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy", make_absent_recall()]
    )
    return model


# Strategy ONLY for tuner creation
with strategy.scope():
    tuner = kt.Hyperband(
        build_tunable_model,
        objective=kt.Objective("val_recall_absent", direction="max"),
        max_epochs=15,
        factor=3,
        directory="/kaggle/working/queenbee_tuning",
        project_name="queenbee_cnn"
    )

stopper = EarlyStopping(
    monitor="val_recall_absent",
    mode="max",
    patience=3,
    restore_best_weights=True
)

# Search OUTSIDE strategy scope
tuner.search(
    train_gen,
    validation_data=val_gen,
    epochs=15,
    class_weight=CLASS_WEIGHTS,
    callbacks=[stopper]
)

# NO strategy scope here
best_model = tuner.get_best_models(num_models=1)[0]

fine_tune_history = best_model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=15,
    class_weight=CLASS_WEIGHTS,
    callbacks=[stopper]
)

# Writable save path
best_model_path = Path("/kaggle/working/queenbee_final_tuned_model.keras")
best_model.save(best_model_path)

print("Saved tuned model to", best_model_path)


Trial 30 Complete [00h 00m 53s]
val_accuracy: 0.25

Best val_accuracy So Far: 0.9837499856948853
Total elapsed time: 00h 27m 03s
Found 6800 images belonging to 3 classes.
Found 1200 images belonging to 3 classes.
Epoch 1/10
[1m 20/107[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m7s[0m 83ms/step - accuracy: 0.9969 - loss: 0.0199

2025-12-23 10:27:54.525831: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-12-23 10:27:54.694532: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.


[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 160ms/step - accuracy: 0.9959 - loss: 0.0160 - val_accuracy: 0.9325 - val_loss: 0.2259
Epoch 2/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 93ms/step - accuracy: 0.9978 - loss: 0.0061 - val_accuracy: 0.6850 - val_loss: 3.3073
Epoch 3/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 94ms/step - accuracy: 0.9979 - loss: 0.0068 - val_accuracy: 0.5808 - val_loss: 2.2780
Epoch 4/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 93ms/step - accuracy: 0.9961 - loss: 0.0126 - val_accuracy: 0.2500 - val_loss: 14.4679
Epoch 5/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 93ms/step - accuracy: 0.9968 - loss: 0.0084 - val_accuracy: 0.7100 - val_loss: 0.7533
Epoch 6/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 93ms/step - accuracy: 0.9991 - loss: 0.0032 - val_accuracy: 0.7508 - val_loss: 1.3822
Epoch 7/10
[1m107/107[0



Saved tuned model to /kaggle/working/queenbee_final_tuned_model.h5


In [32]:
from tensorflow.keras.models import load_model

model_for_eval = load_model(
    best_model_path,
    custom_objects={"SparseClassRecall": SparseClassRecall}
)




In [33]:
def run_inference(model, generator):
    generator.reset()
    y_prob = model.predict(generator, verbose=1)
    y_true = generator.classes
    return y_prob, y_true


def derive_thresholds(y_true, y_prob, class_names):
    y_true_oh = tf.keras.utils.to_categorical(y_true, num_classes=len(class_names))
    thresholds = {}
    for idx, name in enumerate(class_names):
        precision, recall, thresh = precision_recall_curve(y_true_oh[:, idx], y_prob[:, idx])
        if thresh.size == 0:
            thresholds[name] = 0.5
            continue
        f1 = 2 * precision * recall / np.clip(precision + recall, 1e-8, None)
        best_idx = np.nanargmax(f1)
        thresholds[name] = float(thresh[min(best_idx, len(thresh) - 1)])
    return thresholds


def predict_with_thresholds(y_prob, class_names, thresholds):
    calibrated = []
    for row in y_prob:
        chosen_idx = None
        chosen_score = -1.0
        for idx, name in enumerate(class_names):
            threshold = thresholds.get(name, 0.5)
            if row[idx] >= threshold and row[idx] > chosen_score:
                chosen_idx = idx
                chosen_score = row[idx]
        if chosen_idx is None:
            chosen_idx = int(np.argmax(row))
        calibrated.append(chosen_idx)
    return np.array(calibrated)


def summarize_metrics(y_true, y_pred, label):
    return {
        "Mode": label,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Macro Precision": precision_score(y_true, y_pred, average="macro", zero_division=0),
        "Macro Recall": recall_score(y_true, y_pred, average="macro", zero_division=0),
        "Macro F1": f1_score(y_true, y_pred, average="macro", zero_division=0)
    }

val_prob, val_true = run_inference(model_for_eval, val_gen)
class_names = list(test_gen.class_indices.keys())
thresholds = derive_thresholds(val_true, val_prob, class_names)
print("Calibrated probability thresholds:")
for name in class_names:
    print(f"  {name}: {thresholds[name]:.3f}")


test_prob, test_true = run_inference(model_for_eval, test_gen)
default_pred = np.argmax(test_prob, axis=1)
calibrated_pred = predict_with_thresholds(test_prob, class_names, thresholds)

metrics_table = pd.DataFrame([
    summarize_metrics(test_true, default_pred, "Argmax"),
    summarize_metrics(test_true, calibrated_pred, "Calibrated")
])
display(metrics_table)

cm = confusion_matrix(test_true, calibrated_pred)
plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix (Calibrated)")
plt.show()

print("Calibrated classification report:
", classification_report(test_true, calibrated_pred, target_names=class_names, zero_division=0))

roc_auc = roc_auc_score(
    pd.get_dummies(test_true, drop_first=False).values,
    test_prob,
    average="macro",
    multi_class="ovr"
)
pr_auc = average_precision_score(
    pd.get_dummies(test_true, drop_first=False).values,
    test_prob,
    average="macro"
)
print(f"ROC-AUC: {roc_auc:.4f} | PR-AUC: {pr_auc:.4f}")


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 67ms/step


Unnamed: 0,Metric,Score
0,Accuracy,0.7205
1,Macro Precision,0.880479
2,Macro Recall,0.627333
3,Macro F1,0.577196


              precision    recall  f1-score   support

      absent       1.00      0.01      0.02      1000
    external       1.00      0.87      0.93      1000
     present       0.64      1.00      0.78      2000

    accuracy                           0.72      4000
   macro avg       0.88      0.63      0.58      4000
weighted avg       0.82      0.72      0.63      4000

ROC-AUC: 0.9923 | PR-AUC: 0.9815


In [None]:
SR = 22050

def audio_to_spectrogram_image(audio_path: Path):
    y, sr = librosa.load(audio_path, sr=SR)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, n_fft=2048, hop_length=512)
    S_dB = librosa.power_to_db(S, ref=np.max)

    fig = plt.figure(figsize=(2, 2), dpi=64)
    librosa.display.specshow(S_dB, sr=sr, cmap="magma")
    plt.axis("off")

    buf = io.BytesIO()
    plt.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
    plt.close(fig)
    buf.seek(0)

    img = Image.open(buf).convert("RGB").resize(IMG_SIZE)
    img_array = np.array(img, dtype=np.float32) / 255.0
    img_array = np.expand_dims(img_array, axis=0)
    return img_array

def visualize_audio_prediction(audio_path: Path, model):
    mel_input = audio_to_spectrogram_image(audio_path)
    prediction = model.predict(mel_input)
    class_names = list(test_gen.class_indices.keys())
    pred_idx = int(np.argmax(prediction))
    confidence = float(np.max(prediction))

    y, sr = librosa.load(audio_path, sr=SR)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    mel_db = librosa.power_to_db(mel, ref=np.max)

    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    times = np.linspace(0, len(y)/sr, len(y))
    axes[0,0].plot(times, y)
    axes[0,0].set_title("Waveform")

    img = axes[0,1].imshow(mel_db, aspect="auto", origin="lower", cmap="magma")
    axes[0,1].set_title("Mel Spectrogram")
    plt.colorbar(img, ax=axes[0,1], fraction=0.046, pad=0.04)

    axes[1,0].bar(class_names, prediction[0], color="teal")
    axes[1,0].set_ylim(0, 1)
    axes[1,0].set_title("Prediction Probabilities")

    axes[1,1].axis("off")
    axes[1,1].text(0.1, 0.5, f"Predicted: {class_names[pred_idx]}\nConfidence: {confidence:.2%}", fontsize=14)

    plt.tight_layout()
    plt.show()

    return {"prediction": class_names[pred_idx], "confidence": confidence}

# sample_audio = next(QUEEN_PRESENT_DIR.glob('*.wav'))
# visualize_audio_prediction(sample_audio, model_for_eval)


## Makueni Apiary Intelligence Pipeline

In [None]:
DEFAULT_CENTER = (-1.8048, 37.62)

try:
    import ipywidgets as widgets
    from ipyleaflet import Map, Marker, DrawControl, basemaps
except Exception:
    print("ipyleaflet not available; using default coordinates.")
    lat_widget = lon_widget = geometry_widget = None
else:
    lat_widget = widgets.FloatText(value=DEFAULT_CENTER[0], description="Latitude", step=0.0001)
    lon_widget = widgets.FloatText(value=DEFAULT_CENTER[1], description="Longitude", step=0.0001)
    geometry_widget = widgets.Textarea(
        value="",
        description="Geometry",
        placeholder="Draw a polygon/rectangle on the map.",
        layout=widgets.Layout(width="100%", height="140px"),
        disabled=True,
    )

    leaflet_map = Map(center=DEFAULT_CENTER, zoom=8, basemap=basemaps.OpenStreetMap.Mapnik, scroll_wheel_zoom=True)
    marker = Marker(location=DEFAULT_CENTER, draggable=True)
    leaflet_map.add_layer(marker)

    draw_control = DrawControl(
        polygon={"shapeOptions": {"color": "#2563eb", "weight": 2, "fillOpacity": 0.2}},
        rectangle={"shapeOptions": {"color": "#f97316", "weight": 2, "fillOpacity": 0.15}},
        circle={},
        circlemarker={},
        polyline={},
    )
    leaflet_map.add_control(draw_control)

    def _update_marker(change):
        marker.location = (lat_widget.value, lon_widget.value)

    lat_widget.observe(_update_marker, names="value")
    lon_widget.observe(_update_marker, names="value")

    display(widgets.HBox([lat_widget, lon_widget]))
    display(geometry_widget)
    display(leaflet_map)

lat_widget_available = 'lat_widget' in globals() and lat_widget is not None
lon_widget_available = 'lon_widget' in globals() and lon_widget is not None

if lat_widget_available and lon_widget_available:
    latitude = float(lat_widget.value)
    longitude = float(lon_widget.value)
else:
    latitude, longitude = DEFAULT_CENTER
    print("Using default coordinates:", DEFAULT_CENTER)

selected_geometry_geojson = globals().get('selected_geometry_geojson')


In [None]:
raw_start_date = "2008-01-01"
raw_end_date = "2025-12-05"
timezone = "Africa/Nairobi"

def normalize_date_string(d: str) -> dt.date:
    parts = d.split("-")
    if len(parts) != 3:
        raise ValueError("Date must be YYYY-MM-DD")
    y, m, day = [int(p) for p in parts]
    m = max(1, min(12, m))
    last_day = calendar.monthrange(y, m)[1]
    day = max(1, min(last_day, day))
    return dt.date(y, m, day)

start_date = normalize_date_string(raw_start_date)
end_date = normalize_date_string(raw_end_date)

today = dt.date.today()
api_latest = dt.date(2025, 12, 20)
max_allowed = min(today, api_latest)

if end_date > max_allowed:
    print(f"Clamping end_date {end_date} -> {max_allowed}")
    end_date = max_allowed
if start_date > end_date:
    raise ValueError("start_date must be before end_date")

print("Using date range:", start_date, "→", end_date)


In [None]:
ENABLE_REMOTE_CALLS = False  # Kaggle notebooks typically block outbound internet.

def split_date_range(start: dt.date, end: dt.date, max_days: int = 365):
    chunks = []
    current = start
    while current <= end:
        chunk_end = min(end, current + dt.timedelta(days=max_days - 1))
        chunks.append((current, chunk_end))
        current = chunk_end + dt.timedelta(days=1)
    return chunks

def fetch_chunk(lat, lon, sdate: dt.date, edate: dt.date, timezone="Africa/Nairobi", max_retries=3, backoff=2):
    base = "https://archive-api.open-meteo.com/v1/archive"
    daily_vars = ",".join([
        "temperature_2m_max",
        "temperature_2m_min",
        "temperature_2m_mean",
        "precipitation_sum",
        "relative_humidity_2m_mean",
        "wind_speed_10m_max",
        "cloudcover_mean"
    ])
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": sdate.strftime("%Y-%m-%d"),
        "end_date": edate.strftime("%Y-%m-%d"),
        "daily": daily_vars,
        "timezone": timezone
    }
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.get(base, params=params, timeout=30)
            resp.raise_for_status()
            payload = resp.json()
            if "daily" not in payload or "time" not in payload["daily"]:
                raise ValueError("API response missing expected fields.")
            return payload
        except Exception as exc:
            print(f"Attempt {attempt} failed: {exc}")
            if attempt == max_retries:
                raise
            time.sleep(backoff ** attempt)


In [None]:
weather_csv = MAIN_DATA_DIR / "makueni_weather_2008_2025.csv"
chunks = split_date_range(start_date, end_date, max_days=365)

if ENABLE_REMOTE_CALLS:
    dfs = []
    for s, e in chunks:
        payload = fetch_chunk(latitude, longitude, s, e, timezone=timezone)
        daily = payload["daily"]
        df_chunk = pd.DataFrame({
            "date": daily["time"],
            "temp_max": daily.get("temperature_2m_max"),
            "temp_min": daily.get("temperature_2m_min"),
            "temp_mean": daily.get("temperature_2m_mean"),
            "humidity_mean": daily.get("relative_humidity_2m_mean"),
            "rainfall_mm": daily.get("precipitation_sum"),
            "wind_speed_max": daily.get("wind_speed_10m_max"),
            "cloud_cover_percent": daily.get("cloudcover_mean"),
        })
        dfs.append(df_chunk)
        time.sleep(1)
    weather_df = pd.concat(dfs, ignore_index=True)
    weather_df["date"] = pd.to_datetime(weather_df["date"])
    weather_df.sort_values("date", inplace=True)
    weather_df.to_csv(weather_csv, index=False)
    print("Fetched and saved weather CSV to", weather_csv)
else:
    if weather_csv.exists():
        weather_df = pd.read_csv(weather_csv, parse_dates=["date"])
        print(f"Loaded cached weather data from {weather_csv}")
    else:
        raise FileNotFoundError(f"{weather_csv} not found; enable ENABLE_REMOTE_CALLS to regenerate.")


In [None]:
ndvi_csv = MAIN_DATA_DIR / "makueni_ndvi_2008_2025.csv"

if ENABLE_REMOTE_CALLS:
    try:
        ee.Initialize()
    except Exception:
        print("Authenticating with Earth Engine...")
        ee.Authenticate()
        ee.Initialize()

    point = ee.Geometry.Point([longitude, latitude])
    modis = ee.ImageCollection("MODIS/061/MOD13Q1").select("NDVI").filterDate(start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d")).filterBounds(point)

    def extract_ndvi(image):
        mean = image.reduceRegion(reducer=ee.Reducer.mean(), geometry=point, scale=250).get("NDVI")
        date = image.date().format("YYYY-MM-dd")
        return ee.Feature(None, {"date": date, "ndvi_mean": mean})

    ndvi_fc = modis.map(extract_ndvi).getInfo()
    records = [f["properties"] for f in ndvi_fc["features"]]
    ndvi_df = pd.DataFrame(records)
    ndvi_df["date"] = pd.to_datetime(ndvi_df["date"])
    ndvi_df["ndvi_mean"] = ndvi_df["ndvi_mean"].astype(float) / 10000
    ndvi_df.to_csv(ndvi_csv, index=False)
    print("Fetched NDVI and saved to", ndvi_csv)
else:
    if ndvi_csv.exists():
        ndvi_df = pd.read_csv(ndvi_csv, parse_dates=["date"])
        print(f"Loaded cached NDVI data from {ndvi_csv}")
    else:
        raise FileNotFoundError(f"{ndvi_csv} not found; enable ENABLE_REMOTE_CALLS to regenerate.")


In [None]:
df_weather = weather_df.copy()
df_ndvi = ndvi_df.copy()

df_weather["date"] = pd.to_datetime(df_weather["date"])
df_ndvi["date"] = pd.to_datetime(df_ndvi["date"])

df_merged = pd.merge(df_weather, df_ndvi, on="date", how="left").sort_values("date")
weather_ndvi_path = MAIN_DATA_DIR / "makueni_weather_ndvi_2008_2025.csv"
df_merged.to_csv(weather_ndvi_path, index=False)
print("Merged weather+NDVI ->", weather_ndvi_path)
df_merged.head()


In [None]:
df_month = df_merged.set_index("date").resample("ME").agg({
    "rainfall_mm": "sum",
    "temp_mean": "mean",
    "humidity_mean": "mean",
    "ndvi_mean": "mean"
}).reset_index()

fig, axes = plt.subplots(3, 1, figsize=(10, 10), sharex=True)
axes[0].plot(df_month["date"], df_month["rainfall_mm"], marker="o")
axes[0].set_title("Monthly Rainfall (mm)")

axes[1].plot(df_month["date"], df_month["temp_mean"], marker="o", color="tomato")
axes[1].set_title("Monthly Mean Temperature (°C)")

axes[2].plot(df_month["date"], df_month["ndvi_mean"], marker="o", color="green")
axes[2].set_title("Monthly NDVI Mean")

for ax in axes:
    ax.grid(True, alpha=0.3)
    ax.set_ylabel("Value")

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
hive_logs_path = MAIN_DATA_DIR / "hive_logs_2008_2025.csv"

if hive_logs_path.exists():
    hive_df = pd.read_csv(hive_logs_path, parse_dates=["date"])
    print("Loaded hive logs from", hive_logs_path)
else:
    print("Generating synthetic hive telemetry...")
    start_dt = dt.datetime(2008, 1, 1)
    end_dt = dt.datetime(2025, 9, 30)
    dates = pd.date_range(start=start_dt, end=end_dt, freq="7D")

    hive_ids = ["Hive-A", "Hive-B", "Hive-C", "Hive-D"]
    data = []
    rng = np.random.default_rng(42)
    for hive in hive_ids:
        queen_age = rng.integers(3, 20)
        for date in dates:
            honey_yield = max(0, rng.normal(12, 3))
            varroa = np.clip(rng.normal(8, 3), 0, 40)
            hive_weight = rng.normal(45, 5)
            brood_area = np.clip(rng.normal(800, 150), 100, 1200)
            stress_event = rng.choice(["none", "ants", "drought"], p=[0.85, 0.1, 0.05])
            data.append({
                "date": date,
                "hive_id": hive,
                "honey_yield_kg": honey_yield,
                "varroa_pct": varroa,
                "hive_weight_kg": hive_weight,
                "brood_area_cm2": brood_area,
                "stress_event": stress_event,
                "queen_age_months": queen_age
            })
    hive_df = pd.DataFrame(data)
    hive_df.to_csv(hive_logs_path, index=False)
    print("Synthetic hive logs saved to", hive_logs_path)

hive_df.head()


In [None]:
weather_full = pd.read_csv(weather_ndvi_path, parse_dates=["date"])
hive_df["date"] = pd.to_datetime(hive_df["date"])
merged = pd.merge(hive_df, weather_full, on="date", how="left")
merged_path = MAIN_DATA_DIR / "merged_hive_weather_ndvi.csv"
merged.to_csv(merged_path, index=False)
print("Merged hive + weather ->", merged_path)
merged.head()


In [None]:
floral_data = {
    "date": [
        "2025-01-15","2025-02-15","2025-03-15","2025-04-15",
        "2025-05-15","2025-06-15","2025-07-15","2025-08-15","2025-09-15"
    ],
    "major_flowers": [
        "Acacia tortilis, Mango, Commiphora",
        "Acacia tortilis, Acacia mellifera, Mango",
        "Croton, Acacia mellifera",
        "Croton, Melia volkensii",
        "Citrus, Croton",
        "Aloe, Citrus",
        "Aloe, Pasture weeds",
        "Eucalyptus, Pasture weeds",
        "Eucalyptus camaldulensis"
    ],
    "nectar_flow_strength": ["High","High","Medium","Medium","Medium","Medium","Low","Low-Medium","High"],
    "stress_risk": ["Low","Low","Medium","Low","Medium","Medium","High","High","Low"],
    "pest_disease_notes": [
        "Hive beetles active",
        "Wax moth pressure",
        "Varroa buildup",
        "Chalkbrood risk",
        "Nosema risk",
        "Slow brood buildup",
        "Ant invasions",
        "Weak colony pests",
        "Healthy buildup"
    ]
}
floral_df = pd.DataFrame(floral_data)
floral_df["date"] = pd.to_datetime(floral_df["date"])
floral_path = MAIN_DATA_DIR / "makueni_floral_calendar_2025.csv"
floral_df.to_csv(floral_path, index=False)
print("Floral calendar saved to", floral_path)
floral_df


In [None]:
merged_full = merged.merge(floral_df, on="date", how="left")
merged_full_path = MAIN_DATA_DIR / "merged_hive_weather_floral_2025.csv"
merged_full.to_csv(merged_full_path, index=False)
print("Merged hive/weather/floral ->", merged_full_path)
merged_full.head()


In [None]:
model_df = merged_full.copy()
model_df["stress_event"] = model_df["stress_event"].fillna("none")
model_df["stress_target"] = (model_df["stress_event"] != "none").astype(int)
model_df["date"] = pd.to_datetime(model_df["date"])
model_df["month"] = model_df["date"].dt.month
model_df["year"] = model_df["date"].dt.year
model_df["weekofyear"] = model_df["date"].dt.isocalendar().week.astype(int)

rolling_features = ["honey_yield_kg", "varroa_pct", "hive_weight_kg", "brood_area_cm2"]
for feature in rolling_features:
    if feature in model_df.columns:
        model_df[f"{feature}_rolling_mean"] = (
            model_df.groupby("hive_id")[feature]
            .transform(lambda s: s.rolling(window=4, min_periods=1).mean())
        )
        model_df[f"{feature}_rolling_std"] = (
            model_df.groupby("hive_id")[feature]
            .transform(lambda s: s.rolling(window=4, min_periods=1).std())
        )

numeric_cols = model_df.select_dtypes(include=[np.number]).columns
exclude_cols = {"stress_target"}
feature_cols = [col for col in numeric_cols if col not in exclude_cols]
X = model_df[feature_cols].copy()
X = X.dropna(axis=1, how="all")
feature_cols = list(X.columns)

imputer = SimpleImputer(strategy="median")
X = pd.DataFrame(imputer.fit_transform(X), columns=feature_cols, index=model_df.index)
y = model_df["stress_target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weight_dict = {cls: weight for cls, weight in zip(classes, class_weights)}

hb_model = HistGradientBoostingClassifier(
    max_depth=6,
    learning_rate=0.08,
    max_iter=400,
    class_weight=class_weight_dict
)

hb_model.fit(X_train, y_train)
y_pred = hb_model.predict(X_test)
y_prob = hb_model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

RocCurveDisplay.from_predictions(y_test, y_prob)
plt.show()


In [None]:
WINDOW_SIZE = 12
feature_columns = [col for col in feature_cols if col in model_df.columns]
sequence_features = model_df[feature_columns].fillna(model_df[feature_columns].median()).copy()
sequence_targets = model_df['stress_target'].values

X_sequences = []
y_sequences = []
metadata = []
for hive_id, group in model_df.groupby('hive_id'):
    group = group.sort_values('date')
    features = group[feature_columns].fillna(group[feature_columns].median()).values
    targets = group['stress_target'].values
    dates = group['date'].values
    if len(group) <= WINDOW_SIZE:
        continue
    for idx in range(WINDOW_SIZE, len(group)):
        window = features[idx-WINDOW_SIZE:idx]
        X_sequences.append(window)
        y_sequences.append(targets[idx])
        metadata.append({"hive_id": hive_id, "date": dates[idx]})

X_sequences = np.array(X_sequences, dtype=np.float32)
y_sequences = np.array(y_sequences, dtype=np.float32)
print(f'Total sequences: {X_sequences.shape[0]} | window shape: {X_sequences.shape[1:]}')

class SequenceDataset(Dataset):
    def __init__(self, sequences, labels):
        self.X = torch.tensor(sequences, dtype=torch.float32)
        self.y = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

X_train_seq, X_test_seq, y_train_seq, y_test_seq = train_test_split(
    X_sequences, y_sequences, test_size=0.2, random_state=42, stratify=y_sequences
)

train_dataset = SequenceDataset(X_train_seq, y_train_seq)
val_dataset = SequenceDataset(X_test_seq, y_test_seq)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

input_channels = X_sequences.shape[-1]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class HiveCNN(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv1d(channels, 64, kernel_size=3, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Conv1d(64, 64, kernel_size=3, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1)
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        x = x.transpose(1, 2)
        x = self.features(x)
        x = self.classifier(x)
        return x.squeeze(-1)

model = HiveCNN(input_channels).to(device)
pos_weight_value = float(max(1.0, (len(y_train_seq) - y_train_seq.sum()) / max(1.0, y_train_seq.sum())))
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight_value, device=device))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


def run_epoch(loader, train=True):
    model.train(train)
    total_loss = 0
    preds, targets = [], []
    for batch_X, batch_y in loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        with torch.set_grad_enabled(train):
            logits = model(batch_X)
            loss = criterion(logits, batch_y)
            if train:
                loss.backward()
                optimizer.step()
        total_loss += loss.item() * batch_X.size(0)
        preds.append(torch.sigmoid(logits).detach().cpu().numpy())
        targets.append(batch_y.detach().cpu().numpy())
    preds = np.concatenate(preds)
    targets = np.concatenate(targets)
    return total_loss / len(loader.dataset), roc_auc_score(targets, preds)

epochs = 50
best_auc = 0
patience = 8
patience_counter = 0
model_path = MAIN_DATA_DIR / "hive_cnn_torch.pt"

for epoch in range(epochs):
    train_loss, train_auc = run_epoch(train_loader, train=True)
    val_loss, val_auc = run_epoch(val_loader, train=False)
    print(f"Epoch {epoch+1:02d}: train_loss={train_loss:.4f} AUC={train_auc:.3f} | val_loss={val_loss:.4f} AUC={val_auc:.3f}")
    if val_auc > best_auc + 1e-3:
        best_auc = val_auc
        patience_counter = 0
        torch.save(model.state_dict(), model_path)
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

with torch.no_grad():
    logits = []
    labels = []
    for batch_X, batch_y in val_loader:
        batch_X = batch_X.to(device)
        logits.append(model(batch_X).cpu())
        labels.append(batch_y)
    logits = torch.cat(logits)
    labels = torch.cat(labels)
    probs = torch.sigmoid(logits).numpy()
    labels_np = labels.numpy()

precision, recall, thresholds = precision_recall_curve(labels_np, probs)
f_scores = (2 * precision * recall) / np.clip(precision + recall, 1e-8, None)
best_idx = np.argmax(f_scores)
best_threshold = thresholds[max(best_idx - 1, 0)] if best_idx < len(thresholds) else 0.5
preds = (probs >= best_threshold).astype(int)

print(f"Best threshold based on F1: {best_threshold:.3f}")
print(classification_report(labels_np, preds))
print('Test ROC-AUC:', roc_auc_score(labels_np, probs))

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
axes[0].plot(recall, precision)
axes[0].set_title('Precision-Recall')
axes[0].set_xlabel('Recall')
axes[0].set_ylabel('Precision')
RocCurveDisplay.from_predictions(labels_np, probs, ax=axes[1])
axes[1].set_title('ROC Curve')
plt.tight_layout()
plt.show()

print('Best model weights saved to', model_path)
