In [7]:
import numpy as np
import h5py
import matplotlib.pyplot as plt
import cfo_utils
from pathlib import Path
from scipy.signal import resample_poly

In [8]:
# --- Capture session selection -------------------------------------------------
# Exactly one SESSION_NAME assignment should be active; the others are kept as
# commented-out alternatives. The name is interpolated into the input directory
# and into both output file names below.
# SESSION_NAME = "2025-08-29_1"
# SESSION_NAME = "2025-08-29_2"
# SESSION_NAME = "2025-08-29_3"
# SESSION_NAME = "2025-08-29_4"

# SESSION_NAME = "2025-08-31_1"
# SESSION_NAME = "2025-08-31_2"
# SESSION_NAME = "2025-08-31_3"
# SESSION_NAME = "2025-08-31_4"
# SESSION_NAME = "2025-08-31_5"

SESSION_NAME = "train_1"
# SESSION_NAME = "train_2"
# SESSION_NAME = "train_3"

# PATH is the directory holding one "<device>.h5" capture per device;
# PATH_OUT_1 / PATH_OUT_2 are the two merged output files for this session.
PATH = Path(f"/home/smazokha/Desktop/probe_captures_{SESSION_NAME}/")
PATH_OUT_1 = Path(f"/home/smazokha/Downloads/home_dataset_alfa/node1-1_epoch_{SESSION_NAME}_01_raw.h5")
PATH_OUT_2 = Path(f"/home/smazokha/Downloads/home_dataset_alfa/node1-1_epoch_{SESSION_NAME}_02_raw.h5")

# Alternative split (frames [0:50) -> file 1, frames [50:100) -> file 2):
# file_1_idx = 50
# file_2_idx = 100

# Per-device frame split: frames [0:file_1_idx) go to output file 1 and frames
# [file_1_idx:file_2_idx) go to output file 2.
# NOTE: with both values equal, the second range is empty, so output file 2
# receives zero rows.
file_1_idx = 1000
file_2_idx = 1000

# (capture file stem, numeric class label) for every device to merge.
# There is no "alfa_02" entry, so label 2.0 never appears in the output.
DEVICES = [
    ("alfa_01", 1.),
    ("alfa_03", 3.),
    ("alfa_04", 4.),
    ("alfa_05", 5.),
    ("alfa_06", 6.),
    ("alfa_07", 7.),
    ("alfa_08", 8.),
    ("alfa_09", 9.),
    ("alfa_10", 10.),
    ("alfa_11", 11.),
    ("alfa_12", 12.),
    ("alfa_13", 13.)]

# Resampling ratio applied per frame when upsampling is enabled.
# Original example note: arr of shape (2001, 320), dtype=complex64
# (TODO confirm - the merged output printed further down has far longer frames).
# 20 Msps -> 25 Msps means upsample by 25, downsample by 20
UPSAMPLE_FACTOR = 25
DOWNSAMPLE_FACTOR = 20
EPS = 1e-12 # guard for zero/near-zero power (not referenced by the code visible in this notebook)

In [9]:
def plot_fft(frame, fs_hz):
    """
    Show the magnitude spectrum (in dB) of one complex IQ frame.

    frame : 1D numpy array of complex samples
    fs_hz : sampling rate in Hz, used to scale the frequency axis
    """
    n_samples = len(frame)

    # Both the frequency axis and the spectrum are shifted so DC sits in the middle.
    freq_axis_hz = np.fft.fftshift(np.fft.fftfreq(n_samples, d=1.0/fs_hz))
    shifted_fft = np.fft.fftshift(np.fft.fft(frame))

    # The small offset keeps log10 finite for bins with zero magnitude.
    magnitude_db = 20 * np.log10(np.abs(shifted_fft) + 1e-12)

    plt.figure(figsize=(10, 5))
    plt.plot(freq_axis_hz, magnitude_db)
    plt.title("FFT of frame")
    plt.xlabel("Frequency (Hz)")
    plt.ylabel("Magnitude (dB)")
    plt.grid(True)
    plt.show()

def read_file(device_name, upsample_on=False, normalize_on=False, compensate_cfo=False, demo_fft=False):
    """
    Load one device's capture and return its frames as interleaved I/Q floats.

    Reads ``PATH / "<device_name>.h5"``, which must contain the datasets
    ``"iq"`` (indexed here as [frame, sample, 0=I / 1=Q]) and ``"rssi_dbm"``.

    Parameters
    ----------
    device_name : str
        Stem of the capture file inside PATH.
    upsample_on : bool
        Resample every frame by UPSAMPLE_FACTOR / DOWNSAMPLE_FACTOR.
    normalize_on : bool
        Scale every frame to unit average power. Frames whose mean power is
        not strictly positive are left unscaled.
    compensate_cfo : bool
        Estimate and remove the carrier frequency offset through cfo_utils.
    demo_fft : bool
        Plot the spectrum of the first processed frame.

    Returns
    -------
    list
        ``[iq_out, rssi]`` where ``iq_out`` is float64 of shape
        (n_frames, 2 * n_samples) with real parts in even columns and
        imaginary parts in odd columns, and ``rssi`` is float64 of shape
        (n_frames, 1).
    """
    with h5py.File(PATH / f"{device_name}.h5", "r") as f:
        print(f" * Normalization: {normalize_on}")
        print(f" * Upsampling: {upsample_on}")
        print(f" * CFO comp.: {compensate_cfo}")

        # Pull everything into memory so the file can be closed before processing
        iq = np.array(f["iq"])
        rssi = np.array(f["rssi_dbm"])

    # Reformat the RSSI values (type and shape)
    rssi = rssi.astype(np.float64).reshape(-1, 1)

    # Extract I and Q values separately (shape N_frames x N_samples) and
    # combine them into complex samples
    iq_i = iq[:, :, 0].astype(np.float64)
    iq_q = iq[:, :, 1].astype(np.float64)
    iq_comp = iq_i + 1j * iq_q

    # Since our IQ values are integer-coded (int16), we must divide them by 32768
    iq_comp = iq_comp * (1 / 32768)

    # Sample rate of the frames after the optional resampling step
    # (20 Msps capture, 25 Msps with the default 25/20 ratio)
    capture_fs_hz = 20_000_000
    fs = capture_fs_hz * UPSAMPLE_FACTOR // DOWNSAMPLE_FACTOR if upsample_on else capture_fs_hz

    # Resample all frames at once along the sample axis. resample_poly picks
    # the output length itself (ceil(n * up / down)), so no hand-computed
    # buffer size can disagree with it for frame lengths that do not divide
    # evenly.
    if upsample_on:
        iq_comp = resample_poly(iq_comp, up=UPSAMPLE_FACTOR, down=DOWNSAMPLE_FACTOR, axis=1)

    # Normalize frames to unit average power
    if normalize_on:
        mean_power = np.mean(np.abs(iq_comp) ** 2, axis=1, keepdims=True)
        # A divisor of 1 leaves zero-power (or NaN-power) frames untouched
        scale = np.where(mean_power > 0, np.sqrt(mean_power), 1.0)
        iq_comp = iq_comp / scale

    # Compensate CFO
    if compensate_cfo:
        cfo_parts = cfo_utils.extract_data_cfo(iq_comp, fs_in=fs)
        iq_comp = cfo_utils.compensate_cfo(iq_comp, cfo_parts, fs=fs)

    # Quick FFT plot on the first frame, at the post-resampling sample rate
    if demo_fft:
        plot_fft(frame=iq_comp[0, :], fs_hz=fs)

    # Convert into interleaved float64 (real, imag)
    iq_out = np.empty((iq_comp.shape[0], iq_comp.shape[1] * 2), dtype=np.float64)
    iq_out[:, 0::2] = np.real(iq_comp)
    iq_out[:, 1::2] = np.imag(iq_comp)

    return [iq_out, rssi]
    
def write_file(path_out, iq, label, rssi):
    """
    Create (or overwrite) an HDF5 file holding one merged dataset.

    The file gets three datasets: "data" (from iq), "label" and "rssi".
    """
    datasets = (("data", iq), ("label", label), ("rssi", rssi))

    with h5py.File(path_out, "w") as h5_out:
        for dataset_name, values in datasets:
            h5_out.create_dataset(dataset_name, data=values)

# Per-device pieces destined for output file 1 and output file 2
iqs_1, rssis_1, labels_1 = [], [], []
iqs_2, rssis_2, labels_2 = [], [], []

# Frame ranges routed to each output file.
# rows_2 selects nothing when file_2_idx == file_1_idx.
rows_1 = slice(0, file_1_idx)
rows_2 = slice(file_1_idx, file_2_idx)

# Run the merging process for each device (extract IQ, RSSI, Labels)
for device in DEVICES:
    print(f"Processing {device}:")
    device_label, device_id = device

    iq, rssi = read_file(device_label, upsample_on=True, normalize_on=False, compensate_cfo=False, demo_fft=False)

    # One label per frame, as a column vector so it slices like iq and rssi
    label = np.full(iq.shape[0], device_id).reshape(-1, 1)

    for chunk, bucket_1, bucket_2 in (
        (iq, iqs_1, iqs_2),
        (rssi, rssis_1, rssis_2),
        (label, labels_1, labels_2),
    ):
        bucket_1.append(chunk[rows_1, :])
        bucket_2.append(chunk[rows_2, :])

# Stack the per-device pieces into one array per quantity and per output file
iq_merged_1, rssi_merged_1, label_merged_1 = (
    np.concatenate(pieces, axis=0) for pieces in (iqs_1, rssis_1, labels_1)
)
iq_merged_2, rssi_merged_2, label_merged_2 = (
    np.concatenate(pieces, axis=0) for pieces in (iqs_2, rssis_2, labels_2)
)

# Write to H5 file
write_file(PATH_OUT_1, iq_merged_1, label_merged_1, rssi_merged_1)
write_file(PATH_OUT_2, iq_merged_2, label_merged_2, rssi_merged_2)

Processing ('alfa_01', 1.0):
 * Normalization: False
 * Upsampling: True
 * CFO comp.: False
Processing ('alfa_03', 3.0):
 * Normalization: False
 * Upsampling: True
 * CFO comp.: False
Processing ('alfa_04', 4.0):
 * Normalization: False
 * Upsampling: True
 * CFO comp.: False
Processing ('alfa_05', 5.0):
 * Normalization: False
 * Upsampling: True
 * CFO comp.: False
Processing ('alfa_06', 6.0):
 * Normalization: False
 * Upsampling: True
 * CFO comp.: False
Processing ('alfa_07', 7.0):
 * Normalization: False
 * Upsampling: True
 * CFO comp.: False
Processing ('alfa_08', 8.0):
 * Normalization: False
 * Upsampling: True
 * CFO comp.: False
Processing ('alfa_09', 9.0):
 * Normalization: False
 * Upsampling: True
 * CFO comp.: False
Processing ('alfa_10', 10.0):
 * Normalization: False
 * Upsampling: True
 * CFO comp.: False
Processing ('alfa_11', 11.0):
 * Normalization: False
 * Upsampling: True
 * CFO comp.: False
Processing ('alfa_12', 12.0):
 * Normalization: False
 * Upsampling:

In [12]:
def keep_k_per_label(iq: np.ndarray, label: np.ndarray, rssi: np.ndarray, k: int):
    """
    Cap the number of rows per label at k, keeping the earliest rows of each label.

    The surviving rows stay in their original order. `label` may be shaped
    (M,) or (M,1); it is returned filtered in its own shape.
    Returns the filtered (iq, label, rssi).
    """
    flat_labels = np.ravel(label)  # (M,)
    n_rows = flat_labels.shape[0]
    assert iq.shape[0] == n_rows == rssi.shape[0], "Mismatched lengths"

    taken_per_label = {}
    selected = np.zeros(n_rows, dtype=bool)

    for row, key in enumerate(flat_labels):
        taken = taken_per_label.get(key, 0)
        if not (taken < k):
            # This label already has its k rows
            continue
        taken_per_label[key] = taken + 1
        selected[row] = True

    return iq[selected], label[selected], rssi[selected]

# Load the first output file of each training session and cap every label at
# 500 rows. These files are the PATH_OUT_1 outputs written by the merge cell
# above when it is run with SESSION_NAME set to "train_1" and "train_2".
train_parts = []
for train_path in (
    "/home/smazokha/Downloads/home_dataset_alfa/node1-1_epoch_train_1_01_raw.h5",
    "/home/smazokha/Downloads/home_dataset_alfa/node1-1_epoch_train_2_01_raw.h5",
):
    with h5py.File(train_path, "r") as f:
        train_parts.append(
            keep_k_per_label(np.array(f['data']), np.array(f['label']), np.array(f['rssi']), k = 500)
        )

(train_iq_1, train_label_1, train_rssi_1), (train_iq_2, train_label_2, train_rssi_2) = train_parts

# The third session is currently disabled (its source would be
# ".../home_dataset_alfa/node1-1_epoch_train_03.h5", processed the same way),
# so its placeholders stay empty and it is not part of the merge.
train_iq_3, train_label_3, train_rssi_3 = None, None, None

# Stack the sessions row-wise, one merged array per quantity
train_iq_merged, train_label_merged, train_rssi_merged = (
    np.concatenate(columns, axis=0) for columns in zip(*train_parts)
)

print(train_iq_merged.shape)

write_file(Path("/home/smazokha/Downloads/home_dataset_alfa/node1-1_epoch_train_merged_raw.h5"), train_iq_merged, train_label_merged, train_rssi_merged)

(12000, 8190)
