In [None]:
# -----------------------------------------------------------------------------
#
# Author: Rambod Taheran
# Date: 23 August 2025
#
# Description:
# This Jupyter notebook generates jaw servo movement data (jawTrack[]) from
# speech audio input or phoneme timing. It is designed to provide realistic,
# synchronized jaw motion for the Expressive Robotic Mask during narration mode.
#
# Key Features:
#   - Processes speech or phoneme sequences into frame-by-frame jaw positions
#   - Outputs a jawTrack[] array (values in microseconds) for playback on the mask
#   - Timing calibrated for 50 Hz updates (to align with TaskNarrate on mask)
#   - Allows experimenting with different mapping strategies (energy-based,
#     phoneme-driven, or simple amplitude envelope)
#   - Exports generated data for direct inclusion in ESP32 firmware (jaw_track.h)
#
# This notebook complements mask_narration.ino and controller_narration.ino
# by providing the offline preprocessing tool that powers Narration Mode.
# -----------------------------------------------------------------------------


In [None]:
!pip -q install librosa soundfile numpy

import numpy as np
import librosa, csv, textwrap, os
from google.colab import files


In [None]:
# Ask the user for the narration audio through Colab's upload widget.
print("Upload your narration MP3…")
uploaded = files.upload()  # pick your file, e.g., narration.mp3

# The result is keyed by filename. If nothing was uploaded (e.g. the dialog
# was cancelled) stop with a clear message rather than an IndexError.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose an MP3.")

# When several files are selected, only the first one is used.
mp3_path = next(iter(uploaded))
print("Using:", mp3_path)


Upload your narration MP3…


Saving unknown_2025.08.22-23.15.mp3 to unknown_2025.08.22-23.15.mp3
Using: unknown_2025.08.22-23.15.mp3


In [None]:
# ====== CONFIG ======
FRAME_MS = 20            # frame length in ms (20 ms -> 50 Hz, matching the servo loop)
MIN_US, MAX_US = 1216, 1750   # jaw limits in microseconds: closed -> open
DEAD = 0.02              # normalized levels below this count as silence
ALPHA = 0.6              # weight kept from the previous smoothed value (0 = no smoothing)
TARGET_SR = None         # None keeps the file's own sample rate; or set e.g. 16000
# ====================

# Decode the uploaded audio as a single-channel signal
y, sr = librosa.load(mp3_path, sr=TARGET_SR, mono=True)
frame_len = int(sr * FRAME_MS / 1000)   # samples per frame

# Loudness envelope: one RMS value per frame (the last frame may be shorter)
frame_rms = [
    float(np.sqrt(np.mean(y[start:start + frame_len] ** 2)))
    for start in range(0, len(y), frame_len)
]

# Scale so the loudest frame becomes 1.0; an empty or all-zero envelope
# falls back to a divisor of 1.0 so the division below stays safe
peak = max(frame_rms, default=1.0) or 1.0
env = [level / peak for level in frame_rms]

# Per frame: deadzone -> clamp -> exponential smoothing -> microseconds
span = MAX_US - MIN_US
live_range = 1.0 - DEAD      # what is left of 0..1 above the deadzone
fresh_weight = 1.0 - ALPHA   # share of each new sample in the smoothed value
state = 0.0
jaw_us = []
for level in env:
    # Silence below the deadzone; stretch the remainder back onto 0..1
    if level < DEAD:
        opening = 0.0
    else:
        opening = (level - DEAD) / live_range

    # Keep the value inside 0..1
    if opening < 0.0:
        opening = 0.0
    elif opening > 1.0:
        opening = 1.0

    # Blend with the running value, then map onto the jaw range
    state = ALPHA * state + fresh_weight * opening
    jaw_us.append(int(round(MIN_US + state * span)))

n_frames = len(jaw_us)
print(f"Frames: {n_frames}  (~{n_frames*FRAME_MS/1000:.2f}s)")


Frames: 683  (~13.66s)


In [None]:
# CSV: a "us" header row followed by one value per frame
csv_name = "jaw_track.csv"
with open(csv_name, "w", newline="") as f:
    cw = csv.writer(f)
    cw.writerow(["us"])
    cw.writerows([u] for u in jaw_us)

# C array header
h_name = "jaw_track.h"
VALUES_PER_LINE = 16   # numbers per row in the generated initializer

# Build fixed-size rows and place the separating comma *between* rows.
# Each row already carries its own internal ", " separators, so appending a
# comma to a line that ends in one would emit ",," which is not valid C.
rows = [
    ", ".join(str(u) for u in jaw_us[i:i + VALUES_PER_LINE])
    for i in range(0, len(jaw_us), VALUES_PER_LINE)
]

with open(h_name, "w") as f:
    # Descriptive comments are derived from the config so they cannot go stale.
    f.write(f"// Auto-generated from narration MP3 @ {1000 / FRAME_MS:g} Hz\n")
    f.write(f"// Limits mapped to {MIN_US}..{MAX_US} us (jaw)\n")
    # Include guard and the header that declares uint16_t.
    f.write("#pragma once\n#include <stdint.h>\n\n")
    f.write("const uint16_t jawTrack[] = {\n")
    f.write(",\n".join("  " + row for row in rows))
    f.write("\n};\n")
    f.write("const int jawFrames = sizeof(jawTrack)/sizeof(jawTrack[0]);\n")

print("Wrote:", csv_name, "and", h_name)
# Trigger browser downloads of both generated files
files.download(csv_name)
files.download(h_name)


Wrote: jaw_track.csv and jaw_track.h


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import csv
from google.colab import files

# ====== SETTINGS ======
GAIN     = 2.5          # >1.0 = more movement (try 1.5–2.5)
NEUTRAL  = 1216         # closed mouth baseline
MIN_US   = 1216         # jaw limits
MAX_US   = 1750
PER_LINE = 16           # formatting in header
FRAME_HZ = 50           # playback rate; only used for the duration estimate below
# ======================

print("Upload jaw_track.csv (one number per row OR comma separated).")
uploaded = files.upload()
# Nothing uploaded (e.g. dialog cancelled): stop with a clear message
# instead of failing on the filename lookup.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose jaw_track.csv.")
csv_name = next(iter(uploaded))  # only the first uploaded file is used

# Read numbers (robust: handles commas or one per line).
# Cells that are not numbers (such as a "us" header) are skipped, but only
# for the conversion errors we expect; anything else is allowed to surface.
values = []
skipped = []
with open(csv_name, newline='') as f:
    for row in csv.reader(f):
        for cell in row:
            cell = cell.strip()
            if not cell:
                continue
            try:
                values.append(int(float(cell)))
            except (ValueError, OverflowError):
                # ValueError: non-numeric text or NaN; OverflowError: infinity
                skipped.append(cell)

print(f"Loaded {len(values)} frames.")
if skipped:
    print(f"Skipped {len(skipped)} non-numeric cell(s), e.g. {skipped[:5]}")

# Scale each value's distance from NEUTRAL by GAIN, then clamp to the jaw limits
boosted = []
for v in values:
    dev = v - NEUTRAL
    new_v = int(round(NEUTRAL + dev * GAIN))
    new_v = max(MIN_US, min(MAX_US, new_v))
    boosted.append(new_v)

# Write header
h_name = "jaw_track.h"
with open(h_name, "w") as f:
    f.write("// Auto-generated jaw track (amplified)\n")
    f.write("#pragma once\n#include <stdint.h>\n\n")
    f.write("const uint16_t jawTrack[] = {\n")
    last = len(boosted) - 1
    for i, v in enumerate(boosted):
        # Comma after every value except the final one
        f.write(f"  {v}" + ("," if i < last else ""))
        # Line break after every PER_LINE values
        if (i+1) % PER_LINE == 0:
            f.write("\n")
    f.write("\n};\n")
    f.write("const int jawFrames = sizeof(jawTrack)/sizeof(jawTrack[0]);\n")

print(f"Wrote {h_name} with {len(boosted)} frames (~{len(boosted)/FRAME_HZ:.1f}s @{FRAME_HZ}Hz).")
files.download(h_name)


Upload jaw_track.csv (one number per row OR comma separated).


Saving jaw_track (5).csv to jaw_track (5).csv
Loaded 683 frames.
Wrote jaw_track.h with 683 frames (~13.7s @50Hz).


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>