In [5]:
import os
from shutil import copy2

def unique_filename(file_path):
    """
    Generate a unique filename by appending a counter to the base name if the file already exists.

    Parameters:
    - file_path (str): The initial file path for which to check uniqueness.

    Returns:
    - str: A unique file path. If the initial file path does not exist, it is returned as is;
           otherwise, a counter is appended to the base name until a unique name is generated.
    """
    base, extension = os.path.splitext(file_path)
    counter = 1
    while os.path.exists(file_path):
        file_path = f"{base}_{counter}{extension}"
        counter += 1
    return file_path

# Base directory where the folders with FLAC files are located
base_dir = 'speech_data'
# Target directory where all FLAC files will be copied
target_data_dir = 'all_Data'
# Target subdirectory for all transcript files
target_transcripts_dir = os.path.join(target_data_dir, 'all_Transcripts')

# Create the target directories if they don't exist
os.makedirs(target_data_dir, exist_ok=True)
os.makedirs(target_transcripts_dir, exist_ok=True)

# Iterate over each subfolder and file in the base directory
for subdir, dirs, files in os.walk(base_dir):
    for file in files:
        # Full path to the source file
        file_path = os.path.join(subdir, file)
        # Determine the target directory based on file type
        if file.endswith('.flac'):
            target_dir = target_data_dir
        elif file.endswith('.txt'):
            target_dir = target_transcripts_dir
        else:
            # Skip any non-flac and non-txt files
            continue

        # Full path for the target file
        target_file_path = os.path.join(target_dir, file)
        
        # Ensure a unique filename in case of duplicates
        target_file_path = unique_filename(target_file_path)

        # Copy the file to the target directory
        copy2(file_path, target_file_path)
        print(f"Copied {file} to {target_dir}")

print("Finished copying all files.")


Copied 1462-170138-0019.flac to all_Data
Copied 1462-170138-0022.flac to all_Data
Copied 1462-170138-0027.flac to all_Data
Copied 1462-170138-0024.flac to all_Data
Copied 1462-170138-0015.flac to all_Data
Copied 1462-170138-0000.flac to all_Data
Copied 1462-170138-0017.flac to all_Data
Copied 1462-170138-0014.flac to all_Data
Copied 1462-170138-0013.flac to all_Data
Copied 1462-170138-0020.flac to all_Data
Copied 1462-170138-0012.flac to all_Data
Copied 1462-170138-0018.flac to all_Data
Copied 1462-170138-0001.flac to all_Data
Copied 1462-170138-0009.flac to all_Data
Copied 1462-170138-0023.flac to all_Data
Copied 1462-170138-0006.flac to all_Data
Copied 1462-170138-0005.flac to all_Data
Copied 1462-170138-0004.flac to all_Data
Copied 1462-170138-0026.flac to all_Data
Copied 1462-170138.trans.txt to all_Data/all_Transcripts
Copied 1462-170138-0008.flac to all_Data
Copied 1462-170138-0011.flac to all_Data
Copied 1462-170138-0010.flac to all_Data
Copied 1462-170138-0016.flac to all_Data


In [6]:
import os
from pydub import AudioSegment

def convert_flac_to_wav(flac_file_path, wav_file_path):
    """
    Convert an audio file from FLAC format to WAV format.

    Parameters:
    - flac_file_path (str): The file path of the source FLAC file.
    - wav_file_path (str): The file path where the WAV file will be saved.
    """
    # Load the FLAC file
    audio = AudioSegment.from_file(flac_file_path, 'flac')
    # Export the audio in WAV format
    audio.export(wav_file_path, format='wav')

# Directory containing the original FLAC files
source_dir = 'all_Data'
# Directory where the WAV files will be saved
target_dir = 'all_Data_wav'

# Create the target directory if it doesn't exist
os.makedirs(target_dir, exist_ok=True)

# Iterate over all files in the source directory
for file in os.listdir(source_dir):
    if file.endswith('.flac'):
        # Full path to the source FLAC file
        flac_file_path = os.path.join(source_dir, file)
        # Full path to the target WAV file (change extension to .wav)
        wav_file_path = os.path.join(target_dir, file.replace('.flac', '.wav'))
        # Convert the FLAC file to WAV format and save it
        convert_flac_to_wav(flac_file_path, wav_file_path)
        print(f"Converted {file} to WAV format.")

print("Finished converting all FLAC files to WAV format.")


Converted 6295-244435-0023.flac to WAV format.
Converted 2035-147961-0005.flac to WAV format.
Converted 3576-138058-0011.flac to WAV format.
Converted 7850-111771-0006.flac to WAV format.
Converted 2086-149214-0001.flac to WAV format.
Converted 777-126732-0038.flac to WAV format.
Converted 8297-275155-0028.flac to WAV format.
Converted 6295-244435-0027.flac to WAV format.
Converted 2086-149220-0011.flac to WAV format.
Converted 1993-147965-0001.flac to WAV format.
Converted 84-121550-0014.flac to WAV format.
Converted 2803-154328-0016.flac to WAV format.
Converted 6241-61946-0023.flac to WAV format.
Converted 7850-281318-0021.flac to WAV format.
Converted 2078-142845-0045.flac to WAV format.
Converted 2035-152373-0008.flac to WAV format.
Converted 6241-61943-0015.flac to WAV format.
Converted 2086-149220-0048.flac to WAV format.
Converted 2803-161169-0003.flac to WAV format.
Converted 2412-153948-0013.flac to WAV format.
Converted 777-126732-0044.flac to WAV format.
Converted 5338-2844

In [7]:
import json
import os

def create_dataset_json(transcripts_dir, audio_dir, output_json_path):
    """
    Create a JSON dataset file from transcripts and corresponding audio files.

    Parameters:
    - transcripts_dir (str): Path to the directory containing transcript files.
    - audio_dir (str): Path to the directory containing audio files.
    - output_json_path (str): Path where the output JSON file will be saved.
    """
    # This list will store all data points (audio file paths and their transcriptions)
    data_points = []

    # List all transcript files in the given directory with a specific extension
    transcript_files = [f for f in os.listdir(transcripts_dir) if f.endswith('.trans.txt')]

    # Process each transcript file
    for transcript_file in transcript_files:
        with open(os.path.join(transcripts_dir, transcript_file), 'r') as f:
            lines = f.readlines()
            for line in lines:
                # Each line contains an audio file name and the transcription, separated by a space
                parts = line.strip().split(' ', 1)
                if len(parts) == 2:
                    audio_filename = parts[0] + '.wav'
                    transcription = parts[1]
                    audio_filepath = os.path.join(audio_dir, audio_filename)
                    
                    # Check if the corresponding audio file exists
                    if os.path.isfile(audio_filepath):
                        # Add the audio file path and transcription to the dataset
                        data_points.append({
                            'audio_filepath': audio_filepath,
                            'transcription': transcription
                        })

    # Write the data points to a JSON file
    with open(output_json_path, 'w') as json_file:
        json.dump(data_points, json_file, indent=4)

    print(f"Created dataset with {len(data_points)} entries.")

# Paths to the required directories and output JSON file
transcripts_dir = '/home/posiden/Documents/GitHub/IDC409/speech_text/all_Data_wav/all_Transcripts'
audio_dir = '/home/posiden/Documents/GitHub/IDC409/speech_text/all_Data_wav'
output_json_path = '/home/posiden/Documents/GitHub/IDC409/speech_text/all_Data_wav/speech_dataset.json'

# Create the JSON dataset
create_dataset_json(transcripts_dir, audio_dir, output_json_path)


Created dataset with 2703 entries.


In [3]:
import torch
import json
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

def load_pretrained_model_and_processor():
    """
    Load a pre-trained Wav2Vec 2.0 model and its processor.

    Returns:
    - processor: The Wav2Vec2Processor associated with the model.
    - model: The Wav2Vec2ForCTC model ready for transcription prediction.
    """
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
    return processor, model

def speech_file_to_array_fn(audio_filepath):
    """
    Convert an audio file to an array format expected by the model.

    Parameters:
    - audio_filepath (str): Path to the audio file to convert.

    Returns:
    - speech_array: The audio data as an array.
    - sampling_rate: The sampling rate of the audio file.
    """
    speech_array, sampling_rate = sf.read(audio_filepath)
    return speech_array, sampling_rate

def predict_transcription(audio_filepath, processor, model):
    """
    Predict the transcription for an audio file using a pre-trained model.

    Parameters:
    - audio_filepath (str): Path to the audio file for transcription.
    - processor: The processor associated with the Wav2Vec 2.0 model.
    - model: The pre-trained Wav2Vec 2.0 model.

    Returns:
    - transcription (str): The predicted transcription of the audio file.
    """
    # Read and process the audio file
    speech, sampling_rate = speech_file_to_array_fn(audio_filepath)
    inputs = processor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

    # Ensure 'attention_mask' is present
    if "attention_mask" not in inputs:
        inputs["attention_mask"] = torch.ones(inputs.input_values.shape, dtype=torch.long)

    # Predict and decode the transcription
    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]

# Main script to load the model and process the dataset
if __name__ == "__main__":
    # Load the pre-trained model and processor
    processor, model = load_pretrained_model_and_processor()

    # Load the dataset from the JSON file
    with open('/home/posiden/Documents/GitHub/IDC409/speech_text/all_Data_wav/speech_dataset.json', 'r') as f:
        data = json.load(f)

    # Process each record in the dataset
    for record in data:
        audio_filepath = record['audio_filepath']
        true_transcription = record['transcription']
        predicted_transcription = predict_transcription(audio_filepath, processor, model)

        print(f"True: {true_transcription}")
        print(f"Pred: {predicted_transcription}")
        print("-----")


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


True: SOME ARE WONDERFULLY WROUGHT PRETTY LITTLE HOMES FOR BIRDIKINS
Pred: SOME ARE WONDERFULLY WROUGHT PRETTY LITTLE HOMES FOR BURDIKINS
-----
True: INDEED IT IS NOT A NEST AT ALL ONLY THE BEGINNING OF ONE
Pred: INDEED IT IS NOT A NEST AT ALL ONLY THE BEGINNING OF ONE
-----
True: AND THERE IS AN OLD STORY ABOUT THIS WHICH I SHALL TELL YOU
Pred: AND THERE IS AN OLD STORY ABOUT THIS WHICH I SHALL TELL YOU
-----
True: OH WHAT SHALL WE DO FOR A HOME
Pred: OH WHAT SHALL WE DO FOR A HOME
-----
True: AND THE POOR SILLY THINGS RUFFLED UP THEIR FEATHERS AND LOOKED MISERABLE AS ONLY A LITTLE BIRD CAN LOOK WHEN IT IS UNHAPPY
Pred: AND THE POOR SILLY THINGS RUFFLED UP THEIR FEATHERS AND LOOKED MISERABLE AS ONLY A LITTLE BIRD CAN LOOK WHEN IT IS UNHAPPY
-----
True: SHE WAS INDEED A CLEVER BIRD
Pred: SHE WAS INDEED A CLEVER BIRD
-----
True: SHE POPPED INTO HER NEW HOUSE AND SAT THERE COMFORTABLY PEERING OUT THROUGH THE WINDOW SLITS WITH HER SHARP LITTLE EYES
Pred: SHE POPPED INTO HER NEW HOUSE AND 