In [3]:
import os
from pydub import AudioSegment
from shutil import copyfile

# Define the base directory where the folders with FLAC files are located
base_dir = 'speech_text/speech_data'
# Define the base directory where the new WAV files and transcripts will be stored
target_base_dir = 'speech_data_wav'

# Create the target directory (including missing parents) if needed.
# exist_ok=True keeps the cell safe to re-run and avoids the race between a
# separate os.path.exists() check and the subsequent os.makedirs() call.
os.makedirs(target_base_dir, exist_ok=True)

# Best-effort FLAC -> WAV conversion helper
def convert_flac_to_wav(flac_file_path, wav_file_path):
    """Convert one FLAC file to WAV and report the outcome on stdout.

    Args:
        flac_file_path: Path of the FLAC file to read.
        wav_file_path: Path the WAV output is written to.

    Returns:
        None. Any ``Exception`` raised while loading or exporting is caught
        and printed instead of being propagated, so a failing file does not
        stop the caller.
    """
    try:
        # Load and export in one chained call.
        AudioSegment.from_file(flac_file_path, 'flac').export(wav_file_path, format='wav')
        print(f"Converted {flac_file_path} to {wav_file_path}")
    except Exception as err:
        print(f"An error occurred while converting {flac_file_path}: {err}")

# Walk through all the subdirectories in the base directory, mirroring the
# tree into target_base_dir: FLAC files are converted to WAV, transcript files
# are copied unchanged.
for subdir, dirs, files in os.walk(base_dir):
    # Map the source folder onto the target tree through its path relative to
    # base_dir. str.replace() would rewrite every occurrence of base_dir in
    # the path, not only the leading prefix.
    new_subdir = os.path.normpath(
        os.path.join(target_base_dir, os.path.relpath(subdir, base_dir))
    )
    for file in files:
        if file.endswith('.flac'):
            # Create the mirrored folder lazily, only when something lands in it.
            os.makedirs(new_subdir, exist_ok=True)
            flac_file_path = os.path.join(subdir, file)
            # Swap only the trailing extension; a '.flac' earlier in the
            # file name is left untouched.
            wav_file_path = os.path.join(new_subdir, os.path.splitext(file)[0] + '.wav')
            convert_flac_to_wav(flac_file_path, wav_file_path)
        elif 'trans.txt' in file:
            os.makedirs(new_subdir, exist_ok=True)
            trans_file_path = os.path.join(subdir, file)
            new_trans_file_path = os.path.join(new_subdir, file)
            copyfile(trans_file_path, new_trans_file_path)
            print(f"Copied transcript {trans_file_path} to {new_trans_file_path}")


In [5]:
import filecmp
import os
from shutil import copy2

# Source tree containing the folders with FLAC files
base_dir = 'speech_data'
# Flat folder that receives every FLAC file
target_data_dir = 'all_Data'
# Transcript files are collected in a subfolder of the flat data folder
target_transcripts_dir = os.path.join(target_data_dir, 'all_Transcripts')

# Make sure both destination folders exist (no-op when already present)
for required_dir in (target_data_dir, target_transcripts_dir):
    os.makedirs(required_dir, exist_ok=True)

# Pick a path that does not collide with an existing file
def unique_filename(file_path):
    """Return ``file_path`` or, if it is taken, a numbered variant of it.

    When ``file_path`` already exists, ``_1``, ``_2``, ... is inserted
    between the base name and the extension until an unused path is found.

    Args:
        file_path: Desired destination path.

    Returns:
        The first candidate path for which ``os.path.exists`` is false.
    """
    if not os.path.exists(file_path):
        return file_path

    stem, ext = os.path.splitext(file_path)
    suffix = 1
    candidate = f"{stem}_{suffix}{ext}"
    while os.path.exists(candidate):
        suffix += 1
        candidate = f"{stem}_{suffix}{ext}"
    return candidate

# Iterate over each subfolder and file in the base directory and flatten the
# tree: FLAC files go to target_data_dir, .txt files to target_transcripts_dir.
for subdir, dirs, files in os.walk(base_dir):
    for file in files:
        # Full path to the source file
        file_path = os.path.join(subdir, file)
        # Determine the target directory based on file type
        if file.endswith('.flac'):
            target_dir = target_data_dir
        elif file.endswith('.txt'):
            target_dir = target_transcripts_dir
        else:
            continue  # Skip any non-flac and non-txt files

        # Full path for the target file
        target_file_path = os.path.join(target_dir, file)

        # Keep the cell idempotent: when a byte-identical copy is already in
        # place (e.g. the cell was run before), do not create another "_N"
        # duplicate. shallow=False forces a content comparison.
        if os.path.isfile(target_file_path) and filecmp.cmp(
            file_path, target_file_path, shallow=False
        ):
            print(f"Skipped {file}: identical copy already in {target_dir}")
            continue

        # A same-named file with different content still gets a numbered name
        target_file_path = unique_filename(target_file_path)

        # Copy the file to the target directory
        copy2(file_path, target_file_path)
        print(f"Copied {file} to {target_dir}")

print("Finished copying all files.")


Copied 1462-170138-0019.flac to all_Data
Copied 1462-170138-0022.flac to all_Data
Copied 1462-170138-0027.flac to all_Data
Copied 1462-170138-0024.flac to all_Data
Copied 1462-170138-0015.flac to all_Data
Copied 1462-170138-0000.flac to all_Data
Copied 1462-170138-0017.flac to all_Data
Copied 1462-170138-0014.flac to all_Data
Copied 1462-170138-0013.flac to all_Data
Copied 1462-170138-0020.flac to all_Data
Copied 1462-170138-0012.flac to all_Data
Copied 1462-170138-0018.flac to all_Data
Copied 1462-170138-0001.flac to all_Data
Copied 1462-170138-0009.flac to all_Data
Copied 1462-170138-0023.flac to all_Data
Copied 1462-170138-0006.flac to all_Data
Copied 1462-170138-0005.flac to all_Data
Copied 1462-170138-0004.flac to all_Data
Copied 1462-170138-0026.flac to all_Data
Copied 1462-170138.trans.txt to all_Data/all_Transcripts
Copied 1462-170138-0008.flac to all_Data
Copied 1462-170138-0011.flac to all_Data
Copied 1462-170138-0010.flac to all_Data
Copied 1462-170138-0016.flac to all_Data


In [6]:
import os
from pydub import AudioSegment

# Folder holding the original FLAC files
source_dir = 'all_Data'
# Folder that receives the converted WAV files
target_dir = 'all_Data_wav'

# Ensure the output folder is present; nothing happens if it already exists
os.makedirs(name=target_dir, exist_ok=True)

# FLAC -> WAV conversion helper
def convert_flac_to_wav(flac_file_path, wav_file_path):
    """Load a FLAC file and export it as WAV.

    NOTE(review): this redefines the ``convert_flac_to_wav`` from an earlier
    cell. Unlike that version, it prints nothing and does not catch errors,
    so any exception raised while loading or exporting reaches the caller.

    Args:
        flac_file_path: Path of the FLAC file to read.
        wav_file_path: Path the WAV output is written to.
    """
    # Load and export in a single chained call.
    AudioSegment.from_file(flac_file_path, 'flac').export(wav_file_path, format='wav')

# Iterate over all FLAC files located directly in the source directory
# (os.listdir does not descend into subfolders).
for file in os.listdir(source_dir):
    if not file.endswith('.flac'):
        continue
    # Full path to the source FLAC file
    flac_file_path = os.path.join(source_dir, file)
    # Full path to the target WAV file. Only the trailing extension is
    # swapped; str.replace() would also rewrite a '.flac' occurring earlier
    # in the file name.
    wav_file_path = os.path.join(target_dir, os.path.splitext(file)[0] + '.wav')
    # Convert and save the WAV file
    convert_flac_to_wav(flac_file_path, wav_file_path)
    print(f"Converted {file} to WAV format.")

print("Finished converting all FLAC files to WAV format.")


Converted 6295-244435-0023.flac to WAV format.
Converted 2035-147961-0005.flac to WAV format.
Converted 3576-138058-0011.flac to WAV format.
Converted 7850-111771-0006.flac to WAV format.
Converted 2086-149214-0001.flac to WAV format.
Converted 777-126732-0038.flac to WAV format.
Converted 8297-275155-0028.flac to WAV format.
Converted 6295-244435-0027.flac to WAV format.
Converted 2086-149220-0011.flac to WAV format.
Converted 1993-147965-0001.flac to WAV format.
Converted 84-121550-0014.flac to WAV format.
Converted 2803-154328-0016.flac to WAV format.
Converted 6241-61946-0023.flac to WAV format.
Converted 7850-281318-0021.flac to WAV format.
Converted 2078-142845-0045.flac to WAV format.
Converted 2035-152373-0008.flac to WAV format.
Converted 6241-61943-0015.flac to WAV format.
Converted 2086-149220-0048.flac to WAV format.
Converted 2803-161169-0003.flac to WAV format.
Converted 2412-153948-0013.flac to WAV format.
Converted 777-126732-0044.flac to WAV format.
Converted 5338-2844

In [7]:
import json
import os

# Path to the directory where all your transcript files are stored
# NOTE(review): absolute, machine-specific path. Also, the visible earlier
# cells write transcripts to 'all_Data/all_Transcripts', not to this folder —
# confirm how this directory gets populated.
transcripts_dir = '/home/posiden/Documents/GitHub/IDC409/speech_text/all_Data_wav/all_Transcripts'
# Path to the directory where all your audio files are stored
audio_dir = '/home/posiden/Documents/GitHub/IDC409/speech_text/all_Data_wav'

# This will be the list of all your data points
data_points = []
# Transcript entries whose WAV file was not found (reported below)
missing_audio = 0

# List all transcript files. sorted() makes the record order independent of
# the arbitrary order in which os.listdir() returns entries.
transcript_files = sorted(
    f for f in os.listdir(transcripts_dir) if f.endswith('.trans.txt')
)

# Process each transcript file
for transcript_file in transcript_files:
    transcript_path = os.path.join(transcripts_dir, transcript_file)
    # Explicit encoding so the result does not depend on the locale default
    with open(transcript_path, 'r', encoding='utf-8') as f:
        # Iterate the file directly instead of loading every line up front
        for line in f:
            # Split the line into audio file name and transcription
            parts = line.strip().split(' ', 1)
            if len(parts) != 2:
                continue  # blank line or an ID with no text

            utterance_id, transcription = parts
            audio_filepath = os.path.join(audio_dir, utterance_id + '.wav')

            # Only keep entries whose audio file actually exists
            if os.path.isfile(audio_filepath):
                data_points.append({
                    'audio_filepath': audio_filepath,
                    'transcription': transcription
                })
            else:
                missing_audio += 1

# Make silently dropped entries visible
if missing_audio:
    print(f"Skipped {missing_audio} transcript entries with no matching WAV file.")

# Destination of the serialized dataset
output_json_path = '/home/posiden/Documents/GitHub/IDC409/speech_text/all_Data_wav/speech_dataset.json'

# Dump the collected records as indented JSON text
with open(output_json_path, 'w') as json_file:
    json_file.write(json.dumps(data_points, indent=4))

entry_count = len(data_points)
print(f"Created dataset with {entry_count} entries.")


Created dataset with 2703 entries.


In [3]:
import torch  # Add this line at the beginning of your imports
import json
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Checkpoint shared by the processor and the CTC model
MODEL_NAME = "facebook/wav2vec2-large-960h"

# Load the pre-trained processor and model from that checkpoint
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

# Function to read in sound file and convert to expected format
def speech_file_to_array_fn(audio_filepath, expected_sampling_rate=16_000):
    """Read an audio file and return its samples as a 1-D array.

    Args:
        audio_filepath: Path of the audio file to read with ``soundfile``.
        expected_sampling_rate: Sampling rate the samples must have, in Hz.
            Defaults to 16 kHz, the rate the caller passes to the processor.
            Pass ``None`` to skip the check.

    Returns:
        The sample array. Multi-channel audio is averaged down to mono.

    Raises:
        ValueError: If the file's sampling rate differs from
            ``expected_sampling_rate``. Feeding such audio to the model while
            declaring another rate would silently degrade the transcription.
    """
    speech_array, sampling_rate = sf.read(audio_filepath)

    if expected_sampling_rate is not None and sampling_rate != expected_sampling_rate:
        raise ValueError(
            f"{audio_filepath} has sampling rate {sampling_rate} Hz, "
            f"expected {expected_sampling_rate} Hz"
        )

    # soundfile returns (frames, channels) for multi-channel files; collapse
    # the channel axis so downstream code always gets a 1-D signal.
    if speech_array.ndim > 1:
        speech_array = speech_array.mean(axis=1)

    return speech_array

# Run the model on one audio file and decode the predicted token ids
def predict_transcription(audio_filepath):
    """Transcribe a single audio file with the module-level model.

    Args:
        audio_filepath: Path of the audio file to transcribe.

    Returns:
        The result of ``processor.batch_decode`` on the arg-max token ids;
        callers in this notebook take element ``[0]`` of it.
    """
    waveform = speech_file_to_array_fn(audio_filepath)
    inputs = processor(waveform, sampling_rate=16_000, return_tensors="pt", padding=True)

    # The processor may not return a mask; fall back to an all-ones mask
    # with the same shape as the input values.
    if "attention_mask" not in inputs:
        inputs["attention_mask"] = torch.ones(inputs.input_values.shape, dtype=torch.long)

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(inputs.input_values, attention_mask=inputs.attention_mask)
        logits = outputs.logits

    # Greedy decoding: most likely token id at every position
    predicted_ids = logits.argmax(dim=-1)
    return processor.batch_decode(predicted_ids)


# Read the dataset description (list of audio path / transcription records)
dataset_path = '/home/posiden/Documents/GitHub/IDC409/speech_text/all_Data_wav/speech_dataset.json'
with open(dataset_path, 'r') as f:
    data = json.load(f)

# Transcribe every record and show reference and prediction side by side
for record in data:
    audio_filepath = record['audio_filepath']
    true_transcription = record['transcription']
    # predict_transcription returns a batch; take its single element
    predicted_transcription = predict_transcription(audio_filepath)[0]

    print(
        f"True: {true_transcription}",
        f"Pred: {predicted_transcription}",
        "-----",
        sep="\n",
    )


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


True: SOME ARE WONDERFULLY WROUGHT PRETTY LITTLE HOMES FOR BIRDIKINS
Pred: SOME ARE WONDERFULLY WROUGHT PRETTY LITTLE HOMES FOR BURDIKINS
-----
True: INDEED IT IS NOT A NEST AT ALL ONLY THE BEGINNING OF ONE
Pred: INDEED IT IS NOT A NEST AT ALL ONLY THE BEGINNING OF ONE
-----
True: AND THERE IS AN OLD STORY ABOUT THIS WHICH I SHALL TELL YOU
Pred: AND THERE IS AN OLD STORY ABOUT THIS WHICH I SHALL TELL YOU
-----
True: OH WHAT SHALL WE DO FOR A HOME
Pred: OH WHAT SHALL WE DO FOR A HOME
-----
True: AND THE POOR SILLY THINGS RUFFLED UP THEIR FEATHERS AND LOOKED MISERABLE AS ONLY A LITTLE BIRD CAN LOOK WHEN IT IS UNHAPPY
Pred: AND THE POOR SILLY THINGS RUFFLED UP THEIR FEATHERS AND LOOKED MISERABLE AS ONLY A LITTLE BIRD CAN LOOK WHEN IT IS UNHAPPY
-----
True: SHE WAS INDEED A CLEVER BIRD
Pred: SHE WAS INDEED A CLEVER BIRD
-----
True: SHE POPPED INTO HER NEW HOUSE AND SAT THERE COMFORTABLY PEERING OUT THROUGH THE WINDOW SLITS WITH HER SHARP LITTLE EYES
Pred: SHE POPPED INTO HER NEW HOUSE AND 