In [1]:
# Base paths for the audio segments, the transcripts and the generated index
# (absolute, machine-specific Windows paths).
# NOTE(review): in the cells visible in this notebook, each of these three names is
# assigned again in a later cell before it is read (output_file_path with a
# different file name), so this cell does not influence the generated files.
audio_base_path = r"D:\Mici_Princ\Mici_Princ_html\MP.mp3"
transcript_dir_path = r"D:\Mici_Princ\Mici_Princ_html\MP.json"  # Directory containing MP_00.json to MP_28.json
output_file_path = r"D:\Mici_Princ\Mici_Princ_html\all_mp3s.json"

In [2]:
import os
import re
import json

# Paths used by this cell's main() (absolute, machine-specific Windows paths).
# audio_base_path is handed to os.listdir() through extract_mp3_files(), so it
# names a directory even though it ends in ".mp3".
audio_base_path = r"D:\Mici_Princ\Mici_Princ_html\MP.mp3"
# JSON index written by main(); a later cell assigns this same path to mp3_files_path.
output_file_path = r"D:\Mici_Princ\Mici_Princ_html\all_mp3_files.json"

def extract_mp3_files(base_directory):
    """Index the segment MP3s found directly in ``base_directory``.

    Only file names of the exact form ``MP_NN_<start>-<end>.mp3`` are indexed,
    where ``NN`` is two digits and ``<start>``/``<end>`` are decimal numbers
    such as ``12.34``. Every other directory entry is ignored.

    Args:
        base_directory: Path of the directory to scan (sub-directories are
            not descended into).

    Returns:
        A dict mapping each ``MP_NN`` prefix to a list of
        ``{"filename", "start_time", "end_time"}`` dicts sorted by
        ``start_time``. The times are the floats parsed from the file name.

    Raises:
        OSError: Propagated from ``os.listdir`` when the directory cannot be
            read.
    """
    segment_pattern = re.compile(r'(MP_\d{2})_(\d+\.\d+)-(\d+\.\d+)\.mp3')
    mp3_files_dict = {}

    # os.listdir() returns names in arbitrary order. Walking them sorted keeps
    # the result deterministic even when two segments share a start time
    # (the sort below is stable).
    for filename in sorted(os.listdir(base_directory)):
        # fullmatch anchors both ends of the name. A plain match() would also
        # accept names with trailing junk such as "MP_00_1.0-2.0.mp3.bak.mp3".
        match = segment_pattern.fullmatch(filename)
        if match is None:
            continue

        base_name, start_text, end_text = match.groups()
        mp3_files_dict.setdefault(base_name, []).append({
            "filename": filename,
            "start_time": float(start_text),
            "end_time": float(end_text)
        })

    # Order each recording's segments chronologically.
    for segments in mp3_files_dict.values():
        segments.sort(key=lambda x: x["start_time"])

    return mp3_files_dict

def main():
    """Index the segment MP3s under ``audio_base_path`` and save the index as JSON.

    The index comes from ``extract_mp3_files`` and is written to
    ``output_file_path`` as UTF-8, indented by two spaces, with non-ASCII
    characters kept as-is.
    """
    segments_by_recording = extract_mp3_files(audio_base_path)

    with open(output_file_path, 'w', encoding='utf-8') as index_file:
        index_file.write(
            json.dumps(segments_by_recording, indent=2, ensure_ascii=False)
        )

    print('MP3 file data extracted and saved successfully!')


if __name__ == '__main__':
    main()


MP3 file data extracted and saved successfully!


In [6]:
import os
import json
import re

# Input and output locations for the consolidation pass (absolute, machine-specific Windows paths).
# transcript_dir_path is joined with "<base_name>.json" in main(), i.e. it is a directory.
transcript_dir_path = r"D:\Mici_Princ\Mici_Princ_html\MP.json"  # Directory containing MP_00.json to MP_28.json
# MP3 index read by load_mp3_files() in main().
mp3_files_path = r"D:\Mici_Princ\Mici_Princ_html\all_mp3_files.json"
# Destination of the consolidated JSON written by main().
output_file_path = r"D:\Mici_Princ\Mici_Princ_html\consolidated_transcript.json"

# Slack applied on both sides of an MP3's [start_time, end_time] window when
# sync_words_to_mp3() looks up the MP3 for a word's start time.
# Same unit as the times in the file names and timing data (presumably seconds - confirm).
FLOAT_TOLERANCE = 0.01  # Adjust this tolerance as needed

def load_mp3_files(mp3_files_path):
    """Read the UTF-8 JSON file at ``mp3_files_path`` and return the decoded object."""
    with open(mp3_files_path, mode='r', encoding='utf-8') as index_file:
        raw_json = index_file.read()
    return json.loads(raw_json)

def load_transcript(file_path):
    """Read the UTF-8 JSON transcript at ``file_path`` and return the decoded object."""
    with open(file_path, mode='r', encoding='utf-8') as transcript_file:
        raw_json = transcript_file.read()
    return json.loads(raw_json)

import re

def preprocess_text(text):
    """Normalize transcript text so that splitting it on spaces yields word tokens.

    The caller splits the returned text on single spaces and pairs the tokens
    positionally with the entry's timing data, so the rewrites below glue or
    split punctuation to shape those tokens (presumably to mirror how the
    timing data was tokenized - confirm against the data).

    Args:
        text: Raw transcript text; may contain newline characters.

    Returns:
        A tuple ``(processed_text, newline_positions)``. ``newline_positions``
        lists the index of every newline character in the *original* ``text``.
    """
    # Record positions of newlines before they are replaced
    newline_positions = [m.start() for m in re.finditer(r'\n', text)]

    # Newlines become plain spaces so they act as ordinary token separators
    text = text.replace('\n', ' ')

    # A bullet followed by whitespace is joined to the next token: "• word" -> "•_word"
    text = re.sub(r'\•\s+', '•_', text)
    # A hyphen followed by whitespace is joined to the next token: "- word" -> "-_word"
    text = re.sub(r'\-\s+', '-_', text)
    # A dash (U+2013) with one space on each side is attached to the preceding token: "a – b" -> "a– b"
    text = re.sub(r'(\S) – (\S)', r'\1– \2', text)
    # A hyphen between two non-space characters gets a space after it: "a-b" -> "a- b"
    text = re.sub(r'(\S)-(\S)', r'\1- \2', text)
    # Whitespace in front of "..." is dropped so the ellipsis stays on the preceding token
    text = re.sub(r'(\S)\s*\.\.\.', r'\1...', text)
    # Whitespace after an opening low quote is dropped: "„ word" -> "„word".
    # One pass is enough: the result can never contain "„", whitespace and a
    # word character in sequence again, so repeating the substitution is a no-op.
    text = re.sub(r'„\s+(\w)', r'„\1', text)

    return text, newline_positions

    
def sync_words_to_mp3(mp3_files, transcript):
    """Attach every transcript word, with timing and speaker, to the MP3 it starts in.

    Each entry's text is run through ``preprocess_text`` and split on spaces;
    the i-th non-empty token is paired with ``entry['words'][i]``. Tokens
    without timing data, and words whose start time matches no MP3, are
    reported with a warning and dropped.

    Args:
        mp3_files: List of MP3 dicts providing ``start_time`` and ``end_time``.
            The dicts are modified in place: ``words`` is reset to a list of
            ``{"word", "start_time", "end_time", "speaker"}`` dicts.
        transcript: Iterable of entries providing ``speaker``, ``text`` and
            ``words`` (a list of dicts with ``time_s`` and ``time_e``).

    Returns:
        The same ``mp3_files`` list that was passed in.
    """

    def find_mp3(word_start_time):
        # First choice: the MP3 whose half-open window [start, end) contains
        # the word start. This sends a word that starts exactly on a boundary
        # shared by two consecutive MP3s to the MP3 that begins there instead
        # of the one that has just ended.
        for candidate in mp3_files:
            if candidate['start_time'] <= word_start_time < candidate['end_time']:
                return candidate
        # Fallback: first MP3 whose window, widened by FLOAT_TOLERANCE on both
        # sides, contains the word start (covers gaps and the last MP3's end).
        for candidate in mp3_files:
            if (candidate['start_time'] - FLOAT_TOLERANCE
                    <= word_start_time
                    <= candidate['end_time'] + FLOAT_TOLERANCE):
                return candidate
        return None

    for mp3 in mp3_files:
        mp3['words'] = []

    for entry in transcript:
        speaker = entry['speaker']
        text, _newline_positions = preprocess_text(entry['text'])
        # Splitting on ' ' yields empty strings for consecutive, leading or
        # trailing spaces; those are not words and would otherwise consume a
        # timing slot and shift every following word.
        words = [token for token in text.split(' ') if token]
        timing_words = entry['words']

        # Debugging output
        if len(words) != len(timing_words):
            print(f"Warning: Word count mismatch. Transcript: {len(words)}, Timing Data: {len(timing_words)}")
            print(f"Processed transcript text: {text}")
            print(f"words: \n {words}")
            print(f"Number of words: {len(words)}")
            print(f"Number of entry words data: {len(timing_words)}")

        for i, word in enumerate(words):
            if i >= len(timing_words):
                print(f"Warning: Index {i} is out of range for entry['words']")
                # If the timing data is shorter, skip to the next word
                continue

            word_start_time = timing_words[i]['time_s']
            word_end_time = timing_words[i]['time_e']

            target = find_mp3(word_start_time)
            if target is None:
                print(f"Warning: No MP3 file found for word starting at {word_start_time}")
                continue

            target['words'].append({
                "word": word,
                "start_time": word_start_time,
                "end_time": word_end_time,
                "speaker": speaker
            })

    return mp3_files

def transform_data(mp3_files):
    """Reshape synced MP3 dicts into a per-file mapping with words grouped by speaker.

    Args:
        mp3_files: Iterable of dicts with ``filename``, ``start_time``,
            ``end_time`` and ``words`` (each word a dict with ``word``,
            ``start_time``, ``end_time`` and ``speaker``).

    Returns:
        A dict keyed by file name. Each value holds ``"Speaker"`` (speaker ->
        list of ``{"word", "start", "end"}`` in encounter order) followed by
        the file's ``"start"`` and ``"end"`` times.
    """
    result = {}

    for clip in mp3_files:
        clip_name = clip['filename']
        clip_start = clip['start_time']
        clip_end = clip['end_time']

        words_by_speaker = {}
        result[clip_name] = {
            "Speaker": words_by_speaker,
            "start": clip_start,
            "end": clip_end
        }

        for item in clip['words']:
            # setdefault creates the speaker's list on first sight
            words_by_speaker.setdefault(item['speaker'], []).append({
                "word": item['word'],
                "start": item['start_time'],
                "end": item['end_time']
            })

    return result

def main():
    """Merge every recording's transcript with its MP3 index and save the result as JSON.

    For each base name in the MP3 index, the matching ``<base_name>.json``
    under ``transcript_dir_path`` is loaded, synced to the MP3s and
    transformed. Recordings without a transcript file are reported and left
    out. The consolidated dict is written to ``output_file_path``.
    """
    segments_by_recording = load_mp3_files(mp3_files_path)
    consolidated_data = {}

    for base_name in segments_by_recording:
        # Ensure the MP3s are ordered by start_time
        ordered_segments = sorted(
            segments_by_recording[base_name],
            key=lambda segment: segment["start_time"],
        )
        transcript_file_path = os.path.join(transcript_dir_path, f"{base_name}.json")

        if not os.path.exists(transcript_file_path):
            print(f"Transcript file for {base_name} not found.")
            continue

        transcript_data = load_transcript(transcript_file_path)
        consolidated_data[base_name] = transform_data(
            sync_words_to_mp3(ordered_segments, transcript_data)
        )

    # Persist the consolidated mapping
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        outfile.write(json.dumps(consolidated_data, indent=2, ensure_ascii=False))

    print('Consolidated transcript data saved successfully!')


if __name__ == '__main__':
    main()


Consolidated transcript data saved successfully!


In [8]:
import os
import json
import re

# Input and output locations for the second consolidation pass (absolute, machine-specific Windows paths).
# transcript_dir_path is joined with "<base_name>.json" in main(), i.e. it is a directory.
transcript_dir_path = r"D:\Mici_Princ\Mici_Princ_html\MP.json"  # Directory containing MP_00.json to MP_28.json
# MP3 index read by load_mp3_files() in main().
mp3_files_path = r"D:\Mici_Princ\Mici_Princ_html\all_mp3_files.json"
# Destination of this pass's JSON; the file name differs from the earlier
# consolidation cell's output, so that file is not overwritten.
output_file_path = r"D:\Mici_Princ\Mici_Princ_html\consolidated_transcript_0.json"

# Slack applied on both sides of an MP3's [start_time, end_time] window when
# sync_words_to_mp3() looks up the MP3 for a word's start time.
# Same unit as the times in the file names and timing data (presumably seconds - confirm).
FLOAT_TOLERANCE = 0.01  # Adjust this tolerance as needed

def load_mp3_files(mp3_files_path):
    """Return the object decoded from the UTF-8 JSON file at ``mp3_files_path``."""
    with open(mp3_files_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return json.loads(contents)

def load_transcript(file_path):
    """Return the object decoded from the UTF-8 JSON transcript at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return json.loads(contents)

def preprocess_text(text):
    """Flatten newlines and regroup punctuation so the text can be split on spaces.

    Args:
        text: Raw transcript text; may contain newline characters.

    Returns:
        A tuple ``(processed_text, newline_positions)`` where
        ``newline_positions`` holds the index of each newline character in the
        original ``text``.
    """
    # Where the newlines were, measured on the untouched input
    newline_positions = [index for index, char in enumerate(text) if char == '\n']

    # Every newline turns into a single space
    flattened = ' '.join(text.split('\n'))

    # Ordered rewrite rules as (pattern, replacement); later rules see the
    # output of earlier ones, so the order matters.
    rewrite_rules = (
        (r'\•\s+', '•_'),                # "• word"  -> "•_word"
        (r'\-\s+', '-_'),                # "- word"  -> "-_word"
        (r'(\S) – (\S)', r'\1– \2'),     # "a – b"   -> "a– b"
        (r'(\S)-(\S)', r'\1- \2'),       # "a-b"     -> "a- b"
        (r'(\S)\s*\.\.\.', r'\1...'),    # "word ..." -> "word..."
        (r'„\s+(\w)', r'„\1'),           # "„ word"  -> "„word"
    )
    for pattern, replacement in rewrite_rules:
        flattened = re.sub(pattern, replacement, flattened)

    return flattened, newline_positions

In [9]:
def sync_words_to_mp3(mp3_files, transcript):
    """Attach every transcript word to the MP3 it starts in, with times relative to that MP3.

    Each entry's text is run through ``preprocess_text`` and split on spaces;
    the i-th non-empty token is paired with ``entry['words'][i]``. Tokens
    without timing data, and words whose start time matches no MP3, are
    reported with a warning and dropped.

    Args:
        mp3_files: List of MP3 dicts providing ``start_time`` and ``end_time``.
            The dicts are modified in place: ``words`` is reset to a list of
            ``{"word", "start_time", "end_time", "speaker"}`` dicts whose times
            are offsets from the MP3's ``start_time``, rounded to 2 decimals.
        transcript: Iterable of entries providing ``speaker``, ``text`` and
            ``words`` (a list of dicts with ``time_s`` and ``time_e``).

    Returns:
        The same ``mp3_files`` list that was passed in.
    """

    def find_mp3(word_start_time):
        # First choice: the MP3 whose half-open window [start, end) contains
        # the word start. Without this, a word starting exactly on a boundary
        # shared by two consecutive MP3s lands in the earlier one, with a
        # relative start equal to that MP3's full duration.
        for candidate in mp3_files:
            if candidate['start_time'] <= word_start_time < candidate['end_time']:
                return candidate
        # Fallback: first MP3 whose window, widened by FLOAT_TOLERANCE on both
        # sides, contains the word start (covers gaps and the last MP3's end).
        for candidate in mp3_files:
            if (candidate['start_time'] - FLOAT_TOLERANCE
                    <= word_start_time
                    <= candidate['end_time'] + FLOAT_TOLERANCE):
                return candidate
        return None

    for mp3 in mp3_files:
        mp3['words'] = []

    for entry in transcript:
        speaker = entry['speaker']
        text, _newline_positions = preprocess_text(entry['text'])
        # Splitting on ' ' yields empty strings for consecutive, leading or
        # trailing spaces; those are not words and would otherwise consume a
        # timing slot and shift every following word.
        words = [token for token in text.split(' ') if token]
        timing_words = entry['words']

        if len(words) != len(timing_words):
            print(f"Warning: Word count mismatch. Transcript: {len(words)}, Timing Data: {len(timing_words)}")
            print(f"Processed transcript text: {text}")
            print(f"words: \n {words}")
            print(f"Number of words: {len(words)}")
            print(f"Number of entry words data: {len(timing_words)}")

        for i, word in enumerate(words):
            if i >= len(timing_words):
                print(f"Warning: Index {i} is out of range for entry['words']")
                continue

            word_start_time = timing_words[i]['time_s']
            word_end_time = timing_words[i]['time_e']

            target = find_mp3(word_start_time)
            if target is None:
                print(f"Warning: No MP3 file found for word starting at {word_start_time}")
                continue

            # Re-base the word onto the MP3's own timeline
            target['words'].append({
                "word": word,
                "start_time": round(word_start_time - target['start_time'], 2),
                "end_time": round(word_end_time - target['start_time'], 2),
                "speaker": speaker
            })

    return mp3_files


In [12]:
def transform_data(mp3_files):
    """Reshape synced MP3 dicts into a per-file mapping with a flat word list.

    Args:
        mp3_files: Iterable of dicts with ``filename``, ``start_time``,
            ``end_time`` and ``words`` (each word a dict with ``word``,
            ``start_time``, ``end_time`` and ``speaker``).

    Returns:
        A dict keyed by file name. Each value holds ``"words"`` (list of
        ``{"word", "start", "end", "speaker"}`` with times rounded to 2
        decimals), ``"start"`` fixed at 0 and ``"end"`` set to the file's
        ``end_time - start_time`` rounded to 2 decimals.
    """
    result = {}

    for clip in mp3_files:
        clip_name = clip['filename']
        clip_start = clip['start_time']
        clip_end = clip['end_time']

        # The file's own timeline starts at 0 and runs for its duration
        entry = {
            "words": [],
            "start": 0,
            "end": round(clip_end - clip_start, 2)
        }
        result[clip_name] = entry

        entry["words"].extend(
            {
                "word": item['word'],
                "start": round(item['start_time'], 2),
                "end": round(item['end_time'], 2),
                "speaker": item['speaker']
            }
            for item in clip['words']
        )

    return result


In [13]:

def main():
    """Merge every recording's transcript with its MP3 index and save the result as JSON.

    For each base name in the MP3 index, the matching ``<base_name>.json``
    under ``transcript_dir_path`` is loaded, synced to the MP3s and
    transformed. Recordings without a transcript file are reported and left
    out. The consolidated dict is written to ``output_file_path``.
    """
    segments_by_recording = load_mp3_files(mp3_files_path)
    consolidated_data = {}

    for base_name in segments_by_recording:
        # Ensure the MP3s are ordered by start_time
        ordered_segments = sorted(
            segments_by_recording[base_name],
            key=lambda segment: segment["start_time"],
        )
        transcript_file_path = os.path.join(transcript_dir_path, f"{base_name}.json")

        if not os.path.exists(transcript_file_path):
            print(f"Transcript file for {base_name} not found.")
            continue

        transcript_data = load_transcript(transcript_file_path)
        consolidated_data[base_name] = transform_data(
            sync_words_to_mp3(ordered_segments, transcript_data)
        )

    # Persist the consolidated mapping
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        outfile.write(json.dumps(consolidated_data, indent=2, ensure_ascii=False))

    print('Consolidated transcript_0 data saved successfully!')


if __name__ == '__main__':
    main()


Consolidated transcript_0 data saved successfully!
