# Split audio by silence

This script processes a folder of audio files,
detects silence-based segments in each audio file,
matches each segment to corresponding transcript text
from a JSON file (e.g. AWS Transcribe),
and exports the results as segmented MP3s + a CSV.

Useful for turning long recordings into timestamped snippets.

# Connect to Google Drive

In [1]:
from google.colab import drive
import pandas as pd

# Mount Google Drive at /content/drive; the data folder used by the main
# script later in this notebook lives under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


# Install Required Packages

In [2]:
!apt-get install -y ffmpeg
!apt-get install tree

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  tree
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 47.9 kB of archives.
After this operation, 116 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tree amd64 2.0.2-1 [47.9 kB]
Fetched 47.9 kB in 1s (73.2 kB/s)
Selecting previously unselected package tree.
(Reading database ... 126281 files and directories currently installed.)
Preparing to unpack .../tree_2.0.2-1_amd64.deb ...
Unpacking tree (2.0.2-1) ...
Setting up tree (2.0.2-1) ...
Processing triggers for man-db (2.10.2-1) ...


# Import Libraries

In [3]:
import os
import glob
import json
import pandas as pd
from tqdm import tqdm  # For progress bars
from os.path import join
from pydub import AudioSegment
from pydub.silence import split_on_silence  # Detects silent gaps in audio

# Explore Data structure

In [5]:
# Print the directory layout of the tutorial data folder.
!tree '/content/drive/MyDrive/a_tutorial_notebooks/data'

/content/drive/MyDrive/a_tutorial_notebooks/data
└── 1
    ├── processed
    │   ├── 2c9f9799-b379-4c6e-91dd-38a1897d4ff6.mp3
    │   ├── 2c9f9799-b379-4c6e-91dd-38a1897d4ff6.mp4
    │   └── thumbnail.png
    └── transcription
        └── transcribe_output.json

3 directories, 4 files


# Split the input audio file into segments based on silence

In [6]:
# -------------------------------------------
# Function: split_audio_by_silence_segments
# -------------------------------------------
def split_audio_by_silence_segments(input_file, silence_threshold=-50, min_silence_duration=200):
    """
    Load an audio file and cut it into chunks wherever silence is detected.

    Args:
        input_file (str): Path to the audio file (e.g., .mp3 or .wav)
        silence_threshold (int): Level in dBFS below which audio counts as silence
        min_silence_duration (int): Shortest silence (in ms) that triggers a split

    Returns:
        list of AudioSegment: The chunks, in playback order
    """
    recording = AudioSegment.from_file(input_file)
    print(f"Loaded audio with {recording.frame_count()} frames.")

    # keep_silence=True: per the pydub documentation all of the silence is
    # kept in the chunks rather than trimmed, so the chunk durations add up
    # to the full recording (which cumulative timestamping relies on).
    return split_on_silence(
        recording,
        silence_thresh=silence_threshold,
        min_silence_len=min_silence_duration,
        keep_silence=True,
    )

# Calculates the start and end times (in seconds) of each segment.


In [7]:
# -------------------------------------------
# Function: segment_times
# -------------------------------------------
def segment_times(segments):
    """
    Build cumulative (start, end) times, in seconds, for consecutive segments.

    The first segment starts at 0 and every following segment starts exactly
    where the previous one ended.

    Args:
        segments (list of AudioSegment): Segments from silence splitting

    Returns:
        list of tuples: One (start_time, end_time) pair per segment, in order
    """
    boundaries = []
    cursor = 0  # running offset from the beginning of the recording

    for clip in segments:
        # frames / frames-per-second gives the clip duration in seconds
        clip_end = cursor + clip.frame_count() / clip.frame_rate
        boundaries.append((cursor, clip_end))
        cursor = clip_end

    return boundaries


# Save each audio segment as a separate .mp3 file.

In [8]:
# -------------------------------------------
# Function: export_chunks
# -------------------------------------------
def export_chunks(chunk_path, segments):
    """
    Write every audio segment to its own .mp3 file inside chunk_path.

    The directory is created if it does not exist yet. Files are named
    chunk_<index>.mp3, where <index> is the segment's position in the list.

    Args:
        chunk_path (str): Output directory for chunks
        segments (list of AudioSegment): List of split audio segments
    """
    os.makedirs(chunk_path, exist_ok=True)

    for index, clip in enumerate(segments):
        clip.export(os.path.join(chunk_path, f"chunk_{index}.mp3"), format="mp3")

# Converts a transcription JSON file to a DataFrame.

In [9]:
# -------------------------------------------
# Function: json_to_df
# -------------------------------------------
def json_to_df(input_file):
    """
    Converts a transcription JSON file to a DataFrame.

    Expects JSON in AWS Transcribe format:
    - Each word has a start_time and end_time.
    - Punctuation marks have no time, so we append them to the last word.
    - A punctuation mark that appears before any word has nothing to attach
      to and is skipped.

    Args:
        input_file (str): Path to the transcription JSON file

    Returns:
        pd.DataFrame: Columns are 'stime', 'etime', and 'word'

    Raises:
        KeyError: If the JSON lacks the expected 'results' / 'items' /
            'alternatives' / 'content' structure.
    """
    # JSON is UTF-8 by specification; don't depend on the platform default
    # encoding, which would corrupt non-ASCII words on some systems.
    with open(input_file, encoding="utf-8") as f:
        data = json.load(f)

    start_time, end_time, word = [], [], []

    for item in data['results']['items']:
        content = item['alternatives'][0]['content']
        if "start_time" in item:
            start_time.append(float(item['start_time']))
            end_time.append(float(item['end_time']))
            word.append(content)
        elif word:
            # Punctuation (no timing): append it to the previous word
            word[-1] += content
        # else: punctuation before the first word; there is no previous word
        # to attach it to, so it is dropped instead of raising IndexError.

    return pd.DataFrame({'stime': start_time, 'etime': end_time, 'word': word})

# -------------------------------------------

# Match word timestamps to audio segments, generating aligned transcript snippets.


In [10]:
# -------------------------------------------
# Function: final_df
# -------------------------------------------
def final_df(timestamps, word_df):
    """
    Matches word timestamps to audio segments, generating aligned transcript snippets.

    Each word is assigned to exactly one segment: the one that contains the
    midpoint of the word's [stime, etime] interval. Requiring the whole word
    to lie inside a segment would silently drop any word that straddles a
    segment boundary. Segments are treated as half-open [start, end), except
    the last one, which also includes its end point.

    Segments that receive no words are omitted from the result.

    Args:
        timestamps (list): List of (start_time, end_time) tuples for each audio chunk
        word_df (pd.DataFrame): Word-level transcript with 'stime', 'etime'
            and 'word' columns

    Returns:
        pd.DataFrame: One row per non-empty audio chunk, with columns
            'start_time', 'end_time', 'sentence' and 'chunk' (the index of
            the segment in `timestamps`)
    """
    start_time, end_time, sentence, chunk = [], [], [], []

    # Computed once: the midpoints do not depend on the segment being tested.
    midpoint = (word_df['stime'] + word_df['etime']) / 2
    last_idx = len(timestamps) - 1

    for idx, (start, end) in enumerate(timestamps):
        if idx == last_idx:
            # Close the final interval so a word ending exactly at the end
            # of the recording is not lost.
            in_segment = (midpoint >= start) & (midpoint <= end)
        else:
            in_segment = (midpoint >= start) & (midpoint < end)

        text = ' '.join(word_df.loc[in_segment, 'word'].tolist())

        if text.strip():
            start_time.append(start)
            end_time.append(end)
            sentence.append(text)
            chunk.append(idx)

    return pd.DataFrame({
        'start_time': start_time,
        'end_time': end_time,
        'sentence': sentence,
        'chunk': chunk
    })

# Main Script: Process All Audio Files in Folder

In [11]:

# -------------------------------------------
# Main Script: Process All Audio Files in Folder
# -------------------------------------------

# 🛣️ Define root path
data_root = "/content/drive/MyDrive/a_tutorial_notebooks/data"


# Path to folder containing subfolders (named numerically) with audio and transcript data.
# csv_files and csv_root are both aliases of data_root; both names are kept
# in case other cells refer to them.
csv_files = data_root
csv_root = data_root
# Subfolder (inside each numbered folder) containing .mp3 files.
# NOTE(review): the `tree` listing earlier in this notebook shows the .mp3
# under "processed/", not "media/" — confirm this matches the data layout.
csv_path = "media/"

# Loop through all folders in the root
for folder in tqdm(os.listdir(csv_files), desc="Processing folders"):
    if not folder.isdigit():  # Process only numeric folders
        continue

    root_folder = os.path.join(csv_root, folder)
    mp3_folder = os.path.join(root_folder, csv_path)
    chunk_path = os.path.join(root_folder, "audio_clips")
    json_file = os.path.join(root_folder, "transcription", "transcribe_output.json")
    output_csv = os.path.join(root_folder, f"{folder}_splits.csv")

    mp3_files = [name for name in os.listdir(mp3_folder) if name.endswith(".mp3")]
    if not mp3_files:
        continue

    # There is one transcript per folder, so parse it once instead of once
    # per audio file.
    transcript_df = json_to_df(json_file)

    # NOTE(review): chunk_path and output_csv are per folder, not per file,
    # so a folder holding several .mp3 files overwrites its own results;
    # presumably each folder contains a single recording — confirm.
    for file in mp3_files:
        input_audio = os.path.join(mp3_folder, file)

        # Split audio, export the clips, and align the transcript to them
        segments = split_audio_by_silence_segments(input_audio)
        export_chunks(chunk_path, segments)
        timestamps = segment_times(segments)
        final_transcript_df = final_df(timestamps, transcript_df)
        # final_df names its index column "chunk"; the previous key "chunks"
        # matched no column, which made this rename a silent no-op.
        final_transcript_df.rename(columns={"chunk": "segments"}, inplace=True)

        # Save timestamped sentence segments to CSV
        final_transcript_df.to_csv(output_csv, index=False)


Processing folders:   0%|          | 0/1 [00:00<?, ?it/s]

Loaded audio with 89839200.0 frames.


Processing folders: 100%|██████████| 1/1 [04:24<00:00, 264.35s/it]
