Skip to content

Commit

Permalink
Merge pull request #10 from sensein/dev
Browse files Browse the repository at this point in the history
speech to text works fine
  • Loading branch information
fabiocat93 committed May 9, 2024
2 parents 26b3b56 + 4633075 commit abfcd19
Show file tree
Hide file tree
Showing 6 changed files with 235 additions and 79 deletions.
20 changes: 12 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,21 +57,25 @@ hello_world()
## Contributing
Please see [CONTRIBUTING.md](CONTRIBUTING.md) before contributing.

## To do:
## To do (majors):
- [ ] Integrating more multi-modal tasks
- [x] input_output
- [ ] Integrating more audio tasks and moving functions from b2aiprep package:
- [ ] data_augmentation
- [ ] data_representation
- [x] example_task
- [x] input_output
- [ ] data_representation (embeddings and feats extraction)
- [ ] raw_signal_processing
- [ ] speaker_diarization
- [ ] speech emotion recognition
- [ ] speaker_diarization [@Isaac]
- [ ] speech emotion recognition [@Jordan]
- [ ] speech enhancement
- [ ] speech_to_text
- [x] speech_to_text
- [ ] text_to_speech
- [ ] voice conversion
- [ ] Integrating more video tasks:
- [x] input_output
- [ ] input_output

- [ ] Preparing some pipelines with pydra
- [ ] Populating the CLI

## To do (minors):
- [ ] Check login with HF
- [ ] Setup project cache directory
154 changes: 133 additions & 21 deletions scripts/experiment1.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@

import pydra

from senselab.audio.tasks.speech_to_text import transcribe_dataset
from senselab.audio.tasks.speech_to_text_pydra import transcribe_dataset_pt
from senselab.audio.tasks.preprocessing import resample_hf_dataset
from senselab.audio.tasks.preprocessing_pydra import resample_hf_dataset_pt
from senselab.audio.tasks.speech_to_text import transcribe_dataset_with_hf
from senselab.audio.tasks.speech_to_text_pydra import transcribe_dataset_with_hf_pt
from senselab.utils.decorators import get_response_time
from senselab.utils.tasks.input_output import push_dataset_to_hub, read_files_from_disk
from senselab.utils.tasks.input_output_pydra import push_dataset_to_hub_pt, read_files_from_disk_pt
Expand All @@ -18,12 +20,16 @@ def workflow(data: Dict[str, Any]) -> None:
dataset = read_files_from_disk(data["files"])
print(f"Dataset loaded with {len(dataset)} records.")

print("Resampling dataset...")
dataset = resample_hf_dataset(dataset, 16000)
print("Resampled dataset.")

print("Pushing dataset to the hub...")
push_dataset_to_hub(dataset, remote_repository="fabiocat/test", split="train")
print("Dataset pushed to the hub successfully.")

print("Transcribing dataset...")
transcript_dataset = transcribe_dataset(dataset, "openai/whisper-tiny")
transcript_dataset = transcribe_dataset_with_hf(dataset=dataset, model_id="openai/whisper-tiny", language="en") # facebook/wav2vec2-base-960h
print("Transcribed dataset.")

print("Pushing dataset to the hub...")
Expand All @@ -35,32 +41,138 @@ def pydra_workflow(data: Dict[str, Any]) -> None:
"""This function reads audio files from disk, and transcribes them using Whisper."""
wf0 = pydra.Workflow(name='wf0', input_spec=['x'], x=data['files'])
wf0.add(read_files_from_disk_pt(name='read_files_from_disk_name', files=wf0.lzin.x))
wf0.add(push_dataset_to_hub_pt(name='push_audio_dataset_to_hub_name', dataset=wf0.read_files_from_disk_name.lzout.out, remote_repository="fabiocat/test", split="train"))
wf0.add(resample_hf_dataset_pt(name='resample_hf_dataset_name', dataset=wf0.read_files_from_disk_name.lzout.out, resample_rate=16000))
wf0.add(push_dataset_to_hub_pt(name='push_audio_dataset_to_hub_name', dataset=wf0.resample_hf_dataset_name.lzout.out, remote_repository="fabiocat/test", split="train"))

wf0.add(transcribe_dataset_pt(name='transcribe_dataset_name', dataset=wf0.read_files_from_disk_name.lzout.out, model_id="openai/whisper-tiny"))
wf0.add(transcribe_dataset_with_hf_pt(name='transcribe_dataset_name', dataset=wf0.resample_hf_dataset_name.lzout.out, model_id="openai/whisper-tiny", language="en"))
wf0.add(push_dataset_to_hub_pt(name='push_transcript_dataset_to_hub_name', dataset=wf0.transcribe_dataset_name.lzout.out, remote_repository="fabiocat/transcript"))

wf0.set_output([('out', wf0.read_files_from_disk_name.lzout.out)])
wf0.set_output([('out', wf0.transcribe_dataset_name.lzout.out)])

# PYDRA RUN
with pydra.Submitter(plugin='serial') as sub:
sub(wf0)

_ = wf0.result()


data = {"files": ["/Users/fabiocat/Documents/git/pp/senselab/src/tests/data_for_testing/audio_48khz_mono_16bits.wav"], "type": "audio"}
_ = wf0.result()

# Input for both workflows: the same 48 kHz mono test recording repeated 114
# times, so the pipelines are exercised on a multi-record dataset.
data = {
    "files": [
        "/Users/fabiocat/Documents/git/pp/senselab/src/tests/data_for_testing/audio_48khz_mono_16bits.wav",
    ] * 114
}

# Run the plain-Python workflow first, then the pydra workflow, on the same input.
workflow(data)
print("\n\n")  # blank lines to separate the console output of the two runs
pydra_workflow(data)


# TODO:
# CHANGE THE NAME OF THE PACKAGE
# PUBLISH THE NEW PACKAGE

# CHECK INPUTS AND OUTPUTS!!
# TODO: SPEECH TO TEXT ON MULTIPLE FILES

# CHECK BETTER LOGIN WITH HF
# SETUP CACHE WITH HF
31 changes: 31 additions & 0 deletions src/senselab/audio/tasks/preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""This module implements some utilities for the preprocessing task."""
from typing import Any, Dict

import torch
import torchaudio.functional as F
from datasets import Dataset

from senselab.utils.tasks.input_output import _from_dict_to_hf_dataset, _from_hf_dataset_to_dict


def resample_hf_dataset(dataset: Dict[str, Any], resample_rate: int, rolloff: float = 0.99) -> Dict[str, Any]:
    """Resample the audio of every record in a dataset to ``resample_rate``.

    The input dictionary is converted to a Hugging Face ``Dataset`` with
    ``_from_dict_to_hf_dataset``, each row is resampled with
    ``torchaudio.functional.resample``, and the result is converted back to a
    dictionary with ``_from_hf_dataset_to_dict``.

    Args:
        dataset: Dictionary form of the dataset. Each row must expose an
            ``'audio'`` entry with ``'array'`` and ``'sampling_rate'`` keys.
            (The exact dictionary layout is defined by the input_output
            helpers -- confirm there.)
        resample_rate: Target sampling rate applied to every row.
        rolloff: Forwarded to ``torchaudio.functional.resample``. A lower
            rolloff reduces the amount of aliasing, but it also reduces some
            of the higher frequencies.

    Returns:
        The resampled dataset in dictionary form; every row's ``'audio'``
        entry holds the resampled array and ``resample_rate``.
    """

    def _resample_hf_row(row: Dict[str, Any]) -> Dict[str, Any]:
        """Resample the audio of a single row, reading rate/rolloff from the enclosing call."""
        audio = row['audio']
        waveform = audio['array']
        # torchaudio operates on tensors; rows may carry plain sequences/arrays.
        if not isinstance(waveform, torch.Tensor):
            waveform = torch.tensor(waveform)

        resampled_waveform = F.resample(waveform, audio['sampling_rate'], resample_rate, rolloff=rolloff)

        # Only 'array' and 'sampling_rate' are written back for the 'audio' column.
        return {'audio': {'array': resampled_waveform, 'sampling_rate': resample_rate}}

    hf_dataset = _from_dict_to_hf_dataset(dataset)
    resampled_hf_dataset = hf_dataset.map(_resample_hf_row)
    return _from_hf_dataset_to_dict(resampled_hf_dataset)
6 changes: 6 additions & 0 deletions src/senselab/audio/tasks/preprocessing_pydra.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""This module defines a pydra API for the preprocessing task."""
import pydra

from senselab.audio.tasks.preprocessing import resample_hf_dataset

# Pydra task wrapper around `resample_hf_dataset`, so it can be added as a node of a pydra workflow.
resample_hf_dataset_pt = pydra.mark.task(resample_hf_dataset)
Loading

0 comments on commit abfcd19

Please sign in to comment.