In [4]:
import speech_recognition as sr
from pydub import AudioSegment

In [12]:
def transcribe_audio(file_path):
    """Transcribe an audio file with the recognizer's Google backend.

    The whole file at ``file_path`` is read into memory and passed to
    ``Recognizer.recognize_google`` with ``show_all=True``.

    Args:
        file_path: Path of the audio file, opened through ``sr.AudioFile``.

    Returns:
        Whatever ``recognize_google(..., show_all=True)`` returns for the
        recorded audio; this function does not inspect or reshape it.
    """
    engine = sr.Recognizer()
    with sr.AudioFile(file_path) as stream:
        # record() without a duration captures the stream up to its end.
        captured = engine.record(stream)
    raw_response = engine.recognize_google(captured, show_all=True)
    return raw_response

In [13]:
def _normalize_word(token):
    """Lower-case ``token`` and strip surrounding punctuation for comparison."""
    return token.lower().strip(".,;:!?\"'()[]")


def _duration_to_seconds(duration):
    """Convert a duration string such as ``"1.300s"`` or ``"3s"`` to float seconds."""
    return float(duration.rstrip("s"))


def find_word_timestamps(transcript, target_phrase):
    """Collect padded time spans of transcript words that occur in a phrase.

    A transcript word matches when, after lower-casing and stripping
    surrounding punctuation, it equals one of the whitespace-separated words
    of ``target_phrase``. Matching is on whole words, so "on" does not match
    merely because "London" contains it.

    Args:
        transcript: Mapping with a ``"results"`` list; each result may carry
            ``"alternatives"``, each of which may carry ``"words"`` entries
            with ``"word"``, ``"startTime"`` and ``"endTime"`` keys.
            NOTE(review): the time fields are parsed as seconds with a
            trailing ``s`` (e.g. ``"1.300s"``) -- confirm against the
            service that produces the transcript.
        target_phrase: Phrase whose individual words are looked up.

    Returns:
        List of ``(start, end)`` tuples in seconds, in transcript order.
        Each start is moved 0.07 s earlier (clamped at 0) and each end
        0.02 s later.

    Raises:
        KeyError: If ``transcript`` has no ``"results"`` key or a word entry
            lacks ``"word"``, ``"startTime"`` or ``"endTime"``.
        ValueError: If a time field is not a number with an optional ``s``
            suffix.
    """
    wanted = {_normalize_word(token) for token in target_phrase.split()}
    # A token made only of punctuation normalizes to "", which must not match.
    wanted.discard("")

    word_timestamps = []
    for result in transcript["results"]:
        for alternative in result.get("alternatives", []):
            for word_info in alternative.get("words", []):
                if _normalize_word(word_info["word"]) not in wanted:
                    continue
                start_seconds = _duration_to_seconds(word_info["startTime"])
                end_seconds = _duration_to_seconds(word_info["endTime"])
                # Pad the span slightly so the cut does not clip the word.
                start_offset = max(0, start_seconds - 0.07)
                end_offset = end_seconds + 0.02
                word_timestamps.append((start_offset, end_offset))
    return word_timestamps

In [14]:
def split_audio(file_path, timestamps, output_paths):
    """Cut spans out of a WAV file and write each one to its own WAV file.

    Args:
        file_path: Path of the source WAV file.
        timestamps: Iterable of ``(start, end)`` pairs; each value is
            multiplied by 1000 before being used as a slice bound.
        output_paths: Destination paths, paired positionally with
            ``timestamps``; pairing stops at the shorter of the two.
    """
    recording = AudioSegment.from_wav(file_path)
    for span, destination in zip(timestamps, output_paths):
        begin, finish = span
        # Slice bounds are the timestamps scaled by 1000 (seconds -> ms).
        clip = recording[begin * 1000:finish * 1000]
        clip.export(destination, format="wav")


In [17]:
def main():
    """Transcribe the input recording and export one clip per target phrase.

    The recording is transcribed, each phrase in ``phrases_to_find`` is looked
    up with ``find_word_timestamps``, and the first span found per phrase is
    exported to the matching entry of ``output_files``. Nothing is exported
    unless every phrase produced a span.

    NOTE(review): the recorded run of this notebook printed the
    "expected results format" message, i.e. the transcription response had no
    ``"results"`` key -- confirm that ``transcribe_audio`` uses a backend that
    returns word-level results in that shape.
    """
    # Raw string keeps the Windows path free of invalid escape sequences.
    input_audio_path = r"E:\voiceData\Data-2.wav"  # path to your input wav file
    phrases_to_find = [
        "London, the capital city of the United",
        "the City of London is the historic",
        "of London. The city's boundaries have remained",
        "daytime workforce, the City is always alive"
    ]
    output_files = [
        "Suara-1.wav",
        "Suara-2.wav",
        "Suara-3.wav",
        "Suara-4.wav"
    ]

    transcript = transcribe_audio(input_audio_path)
    if "results" not in transcript:
        print("Transcription did not return expected results format.")
        return

    timestamps = []
    for phrase in phrases_to_find:
        phrase_timestamps = find_word_timestamps(transcript, phrase)
        if phrase_timestamps:
            timestamps.append(phrase_timestamps[0])  # Assume we only need the first occurrence
        else:
            print(f"Phrase '{phrase}' not found in the audio.")

    # Export only when every phrase was located, so that clips and output
    # names stay paired one-to-one.
    if len(timestamps) == len(output_files):
        split_audio(input_audio_path, timestamps, output_files)
    else:
        print("Mismatch between number of found phrases and output files.")

In [18]:
# Entry-point guard: call main() only when this code runs as the top-level module.
if __name__ == "__main__":
    main()

Transcription did not return expected results format.
