# HARP Register Deserialization Notebook
This notebook reads `.bin` register data from the [Harp behavior device](https://github.com/harp-tech/device.behavior), parses it using  the [`harp-python`](https://github.com/harp-tech/harp-python) library, and creates timestamp-aligned dataframes. Individual and merged data are saved to disk.

### 1. Import libraries, set data path and load data into a harp reader

In [1]:
import os
import re
import glob
import pandas as pd
import harp
import matplotlib.pyplot as plt

data_folder = "/Volumes/mrsic_flogel/public/projects/JeKr_130524_Joystick/rotary-encoder/bonsai_joystick_data/ML_166/allsessions/ses-001_date-20250918T161338/Behavior"
# "Z:/public/projects/JeKr_130524_Joystick/rotary-encoder/bonsai_joystick_data/JK_003/ses-001_date-20250917T104435/Behavior"

data_path = os.path.abspath(data_folder)
deserialised_folder = os.path.join(data_path, r"../Deserialised")
os.makedirs(deserialised_folder, exist_ok=True)

reader = harp.create_reader(data_path)
print("Loaded device from:", data_folder)

bin_files = glob.glob(os.path.join(data_path, "Behavior_*.bin"))
print(f"Found {len(bin_files)} registers:")
for f in bin_files:
    print("  ‚Ä¢", os.path.basename(f))
    

Loaded device from: /Volumes/mrsic_flogel/public/projects/JeKr_130524_Joystick/rotary-encoder/bonsai_joystick_data/ML_166/allsessions/ses-001_date-20250918T161338/Behavior
Found 5 registers:
  ‚Ä¢ Behavior_842025-09-18T17_13_41.bin
  ‚Ä¢ Behavior_82025-09-18T17_13_42.bin
  ‚Ä¢ Behavior_442025-09-18T17_13_41.bin
  ‚Ä¢ Behavior_922025-09-18T17_13_41.bin
  ‚Ä¢ Behavior_342025-09-18T17_13_48.bin


### 2. Load each registry's binary file and parse to a dictionary of dataframes

First let's build a lookup table to easily read the name of each numbered register

In [2]:
addr_to_name = {
    reg.register.address: name
    for name, reg in reader.registers.items()
}


Now we can read each binary into a dataframe, and give them their correct names.

1. Loop through list of files
2. Find the register name from the LUT above
3. Read the register bits, and convert to a dataframe
4. Save each dataframe to a `.csv` (option to save as .pkl file commented out)
5. Add each dataframe as an entry to a dictionary

In [4]:
'''
import os
import pandas as pd
from datetime import datetime

all_dataframes = {}

# --- Sort bin files by the timestamp in their name ---
def extract_timestamp(filepath):
    """Extract datetime from filename like Behavior_922025-10-15T17_34_13"""
    try:
        filename = os.path.basename(filepath)
        timestamp_str = filename.split('-')[-1].replace('.bin', '')
        return datetime.strptime(timestamp_str, "%Y-%m-%dT%H_%M_%S")
    except Exception:
        return datetime.min  # fallback if format unexpected

bin_files_sorted = sorted(bin_files, key=extract_timestamp)

# --- Process each .bin file in order ---
for filepath in bin_files_sorted:
    filename = os.path.basename(filepath)
    parts = filename.split('-')

    # Extract register address
    reg_addr = int(parts[0][9:-4])
    register = reader.registers.get(reg_addr)
    df = register.read(filepath)

    if df.empty:
        continue

    # Register name
    reg_name = addr_to_name.get(reg_addr, f"Register_{reg_addr}")
    df.columns = [f"{reg_name}_{col}" for col in df.columns]

    # Store in memory
    all_dataframes.setdefault(reg_name, []).append(df)

    print(f"‚úÖ Loaded {reg_name} from {filename}")

# --- Save each register once, in order ---
for reg_name, dfs in all_dataframes.items():
    combined_df = pd.concat(dfs, ignore_index=True)
    out_csv_path = os.path.join(deserialised_folder, f"{reg_name}.csv")
    combined_df.to_csv(out_csv_path, index=False)
    print(f"üíæ Saved combined {reg_name} ({len(combined_df)} rows) to {out_csv_path}")

'''

'\nimport os\nimport pandas as pd\nfrom datetime import datetime\n\nall_dataframes = {}\n\n# --- Sort bin files by the timestamp in their name ---\ndef extract_timestamp(filepath):\n    """Extract datetime from filename like Behavior_922025-10-15T17_34_13"""\n    try:\n        filename = os.path.basename(filepath)\n        timestamp_str = filename.split(\'-\')[-1].replace(\'.bin\', \'\')\n        return datetime.strptime(timestamp_str, "%Y-%m-%dT%H_%M_%S")\n    except Exception:\n        return datetime.min  # fallback if format unexpected\n\nbin_files_sorted = sorted(bin_files, key=extract_timestamp)\n\n# --- Process each .bin file in order ---\nfor filepath in bin_files_sorted:\n    filename = os.path.basename(filepath)\n    parts = filename.split(\'-\')\n\n    # Extract register address\n    reg_addr = int(parts[0][9:-4])\n    register = reader.registers.get(reg_addr)\n    df = register.read(filepath)\n\n    if df.empty:\n        continue\n\n    # Register name\n    reg_name = addr_

In [None]:
from collections import defaultdict
import pandas as pd
import os

# --- Group bin files by register address ---
bins_by_register = defaultdict(list)
for filepath in bin_files:
    filename = os.path.basename(filepath)
    parts = filename.split('-')
    reg_addr = int(parts[0][9:-4])
    bins_by_register[reg_addr].append(filepath)

all_dataframes = {}

# --- Loop through each register's group ---
for reg_addr, files in bins_by_register.items():
    # Sort by datetime extracted from filename (assuming second part is timestamp)
    files.sort(key=lambda f: f.split('-')[1])

    register = reader.registers.get(reg_addr)
    reg_name = addr_to_name.get(reg_addr, f"Register_{reg_addr}")

    dfs = []
    for f in files:
        df = register.read(f)
        if df.empty:
            continue

        # ‚úÖ Only reset index for the OutputSet register
        if "OutputSet" in reg_name and df.index.name is not None:
            df = df.reset_index()

        # ‚úÖ Always append dataframe (for all registers)
        dfs.append(df)
        print(f"‚úÖ Read {reg_name} from {os.path.basename(f)} ({len(df)} rows)")

    if not dfs:
        print(f"‚ö†Ô∏è No valid data found for {reg_name}")
        continue

    # ‚úÖ Concatenate all bins for this register
    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df.columns = [f"{reg_name}_{col}" for col in combined_df.columns]

    out_csv_path = os.path.join(deserialised_folder, f"{reg_name}.csv")
    combined_df.to_csv(out_csv_path, index=False)

    all_dataframes[reg_name] = combined_df
    print(f"üíæ Saved combined {reg_name} with {len(combined_df)} total rows\n")


AttributeError: 'NoneType' object has no attribute 'read'

## Troubleshooting - Checking the solenoid events are Timestamped

In [32]:
# Inspect the OutputSet dataframe structure
outputset_addr = [addr for addr, name in addr_to_name.items() if "OutputSet" in name][0]
df = reader.registers[outputset_addr].read(bin_files[0])
print(df.head(30))
print(df.columns)

print(f"Saving {reg_name}: columns = {df.columns.tolist()}")



ValueError: expected address 34 but got 84

In [None]:
'''
df = df.reset_index()  # moves 'Time' index into a column
print(df.columns)
'''

"\ndf = df.reset_index()  # moves 'Time' index into a column\nprint(df.columns)\n"

### Loading Harp dfs

In [None]:
'''
all_dataframes = {}

# Load each register's binary file in a loop
for filepath in bin_files:
    # Grab registry address
    filename = os.path.basename(filepath)
    parts = filename.split('-')
    
    reg_addr = int(parts[0][9:-4]) # after behavior and before YYYY
    register = reader.registers.get(reg_addr)
        
    # Read this register    
    df = register.read(filepath)
    if df.empty:
        continue

    # Grab this register's name
    reg_name = addr_to_name.get(reg_addr, f"Register_{reg_addr}")
    df.columns = [f"{reg_name}_{col}" for col in df.columns]

    # Define save path
    out_csv_path = os.path.join(deserialised_folder, f"{reg_name}.csv")
    df.to_csv(out_csv_path, index=False)

    # OR Save as .pkl
    # pkl_path = os.path.join(deserialised_folder, f"{reg_name}.pkl")
    # df.to_pickle(pkl_path)

    all_dataframes[reg_name] = df
    print(f"Saved {reg_name} to {out_csv_path}")
    # print(f"Saved {reg_name} to {pkl_path}")
    '''
  

'\nall_dataframes = {}\n\n# Load each register\'s binary file in a loop\nfor filepath in bin_files:\n    # Grab registry address\n    filename = os.path.basename(filepath)\n    parts = filename.split(\'-\')\n\n    reg_addr = int(parts[0][9:-4]) # after behavior and before YYYY\n    register = reader.registers.get(reg_addr)\n\n    # Read this register    \n    df = register.read(filepath)\n    if df.empty:\n        continue\n\n    # Grab this register\'s name\n    reg_name = addr_to_name.get(reg_addr, f"Register_{reg_addr}")\n    df.columns = [f"{reg_name}_{col}" for col in df.columns]\n\n    # Define save path\n    out_csv_path = os.path.join(deserialised_folder, f"{reg_name}.csv")\n    df.to_csv(out_csv_path, index=False)\n\n    # OR Save as .pkl\n    # pkl_path = os.path.join(deserialised_folder, f"{reg_name}.pkl")\n    # df.to_pickle(pkl_path)\n\n    all_dataframes[reg_name] = df\n    print(f"Saved {reg_name} to {out_csv_path}")\n    # print(f"Saved {reg_name} to {pkl_path}")\n    '

### Merge dataframes

In [None]:

merged_df = None
for df in all_dataframes.values():
    if merged_df is None:
        merged_df = df
    else:
        merged_df = pd.merge(merged_df, df, left_index=True, right_index=True, how="outer")

if merged_df is not None:
    merged_df.sort_index(inplace=True)  # sort by datetime index
    merged_csv_path = os.path.join(deserialised_folder, "AllHarpEvents.csv")
    merged_df.to_csv(merged_csv_path)

    # OR Save as .pkl
    # merged_pkl_path = os.path.join(deserialised_folder, "AllHarpEvents.pkl")
    # merged_df.to_pickle(merged_pkl_path)

    print(f"Merged dataframe saved to {merged_csv_path}")
    #print(f"Merged dataframe saved to {merged_pkl_path}")


Merged dataframe saved to Z:\public\projects\JeKr_130524_Joystick\rotary-encoder\bonsai_joystick_data\ML_165\bestsessions\ses-001_date-20251112T160916\Behavior\../Deserialised\AllHarpEvents.csv


### Merge dataframes more efficiently?

In [None]:
'''
import os
import pandas as pd 

# Path to your deserialised folder
deserialised_folder = r"Z:\public\projects\JeKr_130524_Joystick\rotary-encoder\bonsai_joystick_data\JK_008\bestsessions\ses-001_date-20251015T163410\Deserialised"

# Define expected files
files = {
    "OutputSet": os.path.join(deserialised_folder, "OutputSet.csv"),
    "TimestampSeconds": os.path.join(deserialised_folder, "TimestampSeconds.csv"),
    "EncoderData": os.path.join(deserialised_folder, "EncoderData.csv"),
}

dfs = {}

# --- 1Ô∏è‚É£ Load OutputSet (keep OutputSet_DO2 only) ---
if os.path.exists(files["OutputSet"]):
    df_out = pd.read_csv(files["OutputSet"], usecols=lambda c: "OutputSet_DO2" in c)
    dfs["OutputSet"] = df_out
    print(f"‚úÖ Loaded OutputSet ({df_out.shape[0]} rows)")
else:
    print("‚ö†Ô∏è OutputSet.csv not found")

# --- 2Ô∏è‚É£ Load TimestampSeconds ---
if os.path.exists(files["TimestampSeconds"]):
    df_ts = pd.read_csv(files["TimestampSeconds"])
    dfs["TimestampSeconds"] = df_ts
    print(f"‚úÖ Loaded TimestampSeconds ({df_ts.shape[0]} rows)")
else:
    print("‚ö†Ô∏è TimestampSeconds.csv not found")

# --- 3Ô∏è‚É£ Load EncoderData (only first two columns) ---
if os.path.exists(files["EncoderData"]):
    df_enc = pd.read_csv(files["EncoderData"], usecols=[0, 1])
    dfs["EncoderData"] = df_enc
    print(f"‚úÖ Loaded EncoderData ({df_enc.shape[0]} rows, first two columns only)")
else:
    print("‚ö†Ô∏è EncoderData.csv not found")

# --- 4Ô∏è‚É£ Merge all loaded DataFrames on their index (row-wise alignment) ---
if dfs:
    merged_df = pd.concat(dfs.values(), axis=1)
    merged_df.reset_index(drop=True, inplace=True)

    merged_csv_path = os.path.join(deserialised_folder, "AllHarpEvents.csv")
    merged_df.to_csv(merged_csv_path, index=False)
    print(f"üíæ Saved AllHarpEvents.csv with {merged_df.shape[0]} rows and {merged_df.shape[1]} columns")
else:
    print("‚ùå No dataframes loaded, check file paths or column names")
'''


'\nimport os\nimport pandas as pd \n\n# Path to your deserialised folder\ndeserialised_folder = r"Z:\\public\\projects\\JeKr_130524_Joystick\rotary-encoder\x08onsai_joystick_data\\JK_008\x08estsessions\\ses-001_date-20251015T163410\\Deserialised"\n\n# Define expected files\nfiles = {\n    "OutputSet": os.path.join(deserialised_folder, "OutputSet.csv"),\n    "TimestampSeconds": os.path.join(deserialised_folder, "TimestampSeconds.csv"),\n    "EncoderData": os.path.join(deserialised_folder, "EncoderData.csv"),\n}\n\ndfs = {}\n\n# --- 1Ô∏è‚É£ Load OutputSet (keep OutputSet_DO2 only) ---\nif os.path.exists(files["OutputSet"]):\n    df_out = pd.read_csv(files["OutputSet"], usecols=lambda c: "OutputSet_DO2" in c)\n    dfs["OutputSet"] = df_out\n    print(f"‚úÖ Loaded OutputSet ({df_out.shape[0]} rows)")\nelse:\n    print("‚ö†Ô∏è OutputSet.csv not found")\n\n# --- 2Ô∏è‚É£ Load TimestampSeconds ---\nif os.path.exists(files["TimestampSeconds"]):\n    df_ts = pd.read_csv(files["TimestampSeconds"]