# MonsterAPI Audio Data Processing Notebook

An IPYNB notebook that converts a zip file containing audio files and a metadata file with their transcriptions into a Parquet dataset and uploads it to the HuggingFace Hub. The resulting dataset is compatible with the MonsterAPI Whisper Finetuner.


## Prerequisites

- A Google Colab environment or a Jupyter notebook setup with internet access.
- A ZIP file containing the audio data you wish to process (the format is specified below).

## Zip format

- Manually create a zip file containing the audio files and a CSV file named "metadata.csv" with 2 columns: "file_name" and "sentence".
- The column "file_name" contains the file names of the audio files (relative to the location of the CSV file).
- The column "sentence" contains the respective transcriptions of the audio files.


## Usage Instructions

1. **Zip File Creation**: Create the zip file manually in the format specified above.
2. **Run the cell below and configure inputs**: Upload your zip file and configure the inputs. Make sure to use a HuggingFace token with write access.
3. **Click the "Process Inputs" button**: This loads the zip, converts it into a Parquet file and uploads it to your HuggingFace account.


In [None]:
# @title RUN ME!
!pip install -q soundfile datasets
import datasets
import tempfile
import zipfile
import os

import io
from google.colab import files
from ipywidgets import widgets, Button, VBox
from IPython.display import display, clear_output


def extract_zip_to_temp(zip_path):
    """Extract a zip archive into a fresh temporary directory and validate it.

    Args:
        zip_path: Path of the zip archive to extract.

    Returns:
        The path of the temporary directory holding the extracted files.
        The caller owns this directory and is responsible for removing it.

    Raises:
        zipfile.BadZipFile: If ``zip_path`` is not a valid zip archive.
        ValueError: If ``validate_data_folder`` rejects the extracted content.
    """
    import shutil

    temp_dir = tempfile.mkdtemp()
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        validate_data_folder(temp_dir)
    except BaseException:
        # The directory is never handed to the caller on failure, so remove
        # it here instead of leaking it, then let the original error surface.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise

    return temp_dir


def validate_data_folder(folder_path):
    """Check that an extracted dataset folder has the expected layout.

    The folder must contain a ``metadata.csv`` with the columns
    ``file_name`` and ``sentence``, and every entry of ``file_name`` must
    exist as a path relative to ``folder_path``.

    Args:
        folder_path: Directory that holds ``metadata.csv`` and the audio files.

    Returns:
        True when every check passes.

    Raises:
        ValueError: If ``metadata.csv`` is missing, a required column is
            absent, or a listed file does not exist.
    """
    csv_path = f'{folder_path}/metadata.csv'
    if not os.path.exists(csv_path):
        raise ValueError('metadata.csv does not exist. the zip/folder should contain a file called "metadata.csv" with the metadata of the audio files.')

    table = datasets.load_dataset('csv', data_files=csv_path, split='train')
    print(table)

    # Required columns, checked in this order, each with its own error text.
    required_columns = (
        ('file_name', 'metadata.csv needs to have a column called "file_name", which lists the file names of the audio files in the folder.'),
        ('sentence', 'metadata.csv needs to have a column called "sentence", which lists the transcription of the audio files in the folder.'),
    )
    for column, complaint in required_columns:
        if column not in table.column_names:
            raise ValueError(complaint)

    # Stop at the first listed file that is not present in the folder.
    for audio_name in table['file_name']:
        if os.path.exists(f'{folder_path}/{audio_name}'):
            continue
        raise ValueError(f'{audio_name} does not exist in {folder_path}. Make sure all the files listed in "file_name" column in metadata.csv exist in the folder.')

    return True

# Module-level defaults for the three user inputs.
zip_path = None
hf_token = ''
upload_as_private = True

# File picker: a single .zip archive only.
upload_button = widgets.FileUpload(multiple=False, accept='.zip')

# Editable text field for the Hugging Face token, pre-filled with a placeholder value.
hf_token_textbox = widgets.Text(value='your_hf_write_key', description='HF Token:')

# Checkbox controlling whether the dataset is uploaded as private (checked by default).
upload_as_private_checkbox = widgets.Checkbox(
    description='Upload as private',
    style={'description_width': 'initial'},
    value=True,
)

# Button that confirms the inputs and starts processing.
process_button = widgets.Button(description="Process Inputs")

def _first_uploaded_file(upload_value):
    """Return ``(file_name, content)`` for the first file of a FileUpload value.

    ipywidgets 7 exposes ``FileUpload.value`` as a dict keyed by file name,
    whereas ipywidgets 8 exposes a tuple of dicts carrying a ``'name'`` key.
    Both layouts are accepted.
    """
    if isinstance(upload_value, dict):
        file_name, entry = next(iter(upload_value.items()))
    else:
        entry = upload_value[0]
        file_name = entry['name']
    return file_name, entry['content']


def main(b):
    """Button callback: save the uploaded zip, build the dataset and push it to the Hub.

    Args:
        b: The button instance passed by ``on_click``; unused.

    Returns:
        1 when no file has been uploaded, otherwise None.

    Raises:
        ValueError: Propagated from ``extract_zip_to_temp`` when the archive
            content does not match the expected layout.
    """
    import shutil

    # process inputs
    if not upload_button.value:
        print("No file uploaded.")
        return 1

    uploaded_filename, content = _first_uploaded_file(upload_button.value)
    # Keep only the final path component so the browser-supplied name cannot
    # point outside of /content.
    uploaded_filename = os.path.basename(uploaded_filename)
    zip_path = f"/content/{uploaded_filename}"
    with open(zip_path, "wb") as f:
        f.write(content)
    print(f"File {uploaded_filename} uploaded and saved as {zip_path}.")

    # Pasted tokens often carry stray surrounding whitespace.
    hf_token = hf_token_textbox.value.strip()
    upload_as_private = upload_as_private_checkbox.value

    # Drop only the final extension, so "my.data.zip" becomes "my.data"
    # instead of being cut at the first dot.
    dataset_name = os.path.splitext(uploaded_filename)[0]

    # extract and upload ds to hub
    data_dir = extract_zip_to_temp(zip_path)
    try:
        ds = datasets.load_dataset('audiofolder',
                                   data_dir=data_dir)

        ds.push_to_hub(dataset_name,
                       private=upload_as_private,
                       token=hf_token)
    finally:
        # The extracted copy is not used after this point; remove it whether
        # or not the upload succeeded so temp directories do not pile up.
        shutil.rmtree(data_dir, ignore_errors=True)

    print('Your dataset has been processed and uploaded to your huggingface account')

# Register the processing callback on the button, then render the input form.
process_button.on_click(main)

display(
    VBox(
        children=[
            upload_button,
            hf_token_textbox,
            upload_as_private_checkbox,
            process_button,
        ]
    )
)

VBox(children=(FileUpload(value={}, accept='.zip', description='Upload'), Text(value='your_hf_write_key', desc…