In [1]:
import torch
import torchaudio
from sonar.inference_pipelines.speech import SpeechToEmbeddingModelPipeline

In [2]:
# Prefer the GPU when CUDA is available; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Speech-to-embedding pipeline built from the "sonar_speech_encoder_deu" encoder,
# placed on the device selected above.
mapper = SpeechToEmbeddingModelPipeline(
    encoder="sonar_speech_encoder_deu",
    device=device,
)

In [3]:
# Load one recording, keep only its first channel, and resample it to 16 kHz.
audio, sr = torchaudio.load("/data/tagesschau/audio/1/audio.wav")
audio = audio[:1]
audio_low = torchaudio.functional.resample(
    audio,
    orig_freq=sr,
    new_freq=16_000,
)

# Display: duration in seconds at the original rate, the resampled shape,
# and the duration in seconds at 16 kHz.
(
    audio.size(-1) / sr,
    audio_low.shape,
    audio_low.size(-1) / 16_000,
)

(991.2250416666667, torch.Size([1, 15859601]), 991.2250625)

In [10]:
# Pad the end of the last dimension so that both signals reach a fixed length:
# 48,000,000 samples for the original signal and 16,000,000 for the 16 kHz one.
padded_audio = torch.nn.functional.pad(
    audio, (0, 48_000_000 - audio.size(-1))
)
padded_audio_low = torch.nn.functional.pad(
    audio_low, (0, 16_000_000 - audio_low.size(-1))
)

# Display both padded shapes.
(padded_audio.shape, padded_audio_low.shape)

(torch.Size([2, 48000000]), torch.Size([1, 16000000]))

In [11]:
# Cut each padded signal into consecutive chunks along the last dimension:
# 480,000 samples per chunk for the original, 160,000 for the 16 kHz version.
concat = torch.split(padded_audio, 480_000, dim=-1)
concat_low = torch.split(padded_audio_low, 160_000, dim=-1)

In [12]:
# Number of chunks produced for the original and for the resampled signal.
tuple(len(chunks) for chunks in (concat, concat_low))

(100, 100)

In [13]:
# Embed every 16 kHz chunk with the pipeline; gradient tracking is disabled
# because this is inference only.
with torch.no_grad():
    out = mapper.predict(
        concat_low,
        batch_size=16,
    )

In [17]:
# Shape of the first input chunk next to the shape of the embedding output.
(concat_low[0].size(), out.size())

(torch.Size([1, 160000]), torch.Size([100, 1024]))

In [3]:
from pathlib import Path
import json
from tqdm import tqdm
import os

In [4]:
# Pre-compute speech embeddings for every recording keyed in labels.json.
#
# For each key the audio at <data_dir>/<key>/audio.wav is resampled to 16 kHz,
# reduced to its first channel, truncated or padded to exactly MAX_SAMPLES
# samples, cut into 10-second and 30-second chunks, embedded with `mapper`, and
# saved as <out>/10sec/<key>.pt and <out>/30sec/<key>.pt.
# Keys whose two output files already exist are skipped, so the cell can be
# re-run to resume an interrupted pass.
SAMPLE_RATE = 16_000
MAX_SAMPLES = 16_000_000

data_dir = Path("/data/tagesschau/more_data")
out_dir_10 = Path("/data/tagesschau/more_data_preprocessed/10sec")
out_dir_30 = Path("/data/tagesschau/more_data_preprocessed/30sec")

# Make sure the target directories exist before the first torch.save call.
out_dir_10.mkdir(parents=True, exist_ok=True)
out_dir_30.mkdir(parents=True, exist_ok=True)

# Use a context manager so the labels file handle is closed deterministically.
with open(data_dir / "labels.json", "r") as labels_file:
    files = list(json.load(labels_file).keys())

prog = tqdm(files)
for file in prog:
    target_10 = out_dir_10 / f"{file}.pt"
    target_30 = out_dir_30 / f"{file}.pt"

    # Evaluate the skip condition from scratch for every file instead of
    # carrying flags across iterations and resetting them by hand.
    exist_10 = target_10.exists()
    exist_30 = target_30.exists()
    if exist_10 and exist_30:
        continue

    audio, sr = torchaudio.load(data_dir / file / "audio.wav")
    audio_low = torchaudio.functional.resample(audio, orig_freq=sr, new_freq=SAMPLE_RATE)

    # Keep the first channel only and force a fixed length of MAX_SAMPLES.
    audio_low = audio_low[:1, :MAX_SAMPLES]
    missing = MAX_SAMPLES - audio_low.shape[-1]
    if missing > 0:
        audio_low = torch.nn.functional.pad(audio_low, (0, missing), mode='constant')

    prog.set_postfix({"audio_len": audio_low.shape})

    # MAX_SAMPLES is not a multiple of the 30-second chunk size, so the last
    # 30-second chunk is shorter than the others.
    audio_10_splits = list(torch.split(audio_low, split_size_or_sections=(SAMPLE_RATE * 10), dim=-1))
    audio_30_splits = list(torch.split(audio_low, split_size_or_sections=(SAMPLE_RATE * 30), dim=-1))

    with torch.no_grad():
        output_10 = mapper.predict(audio_10_splits, batch_size=16)
        output_30 = mapper.predict(audio_30_splits, batch_size=16)

    # Replace NaN entries in the embeddings with 0 before saving.
    output_10 = torch.nan_to_num(output_10, nan=0.0)
    output_30 = torch.nan_to_num(output_30, nan=0.0)

    prog.set_postfix({"10_sec_shape": output_10.shape, "30_sec_shape": output_30.shape})

    torch.save(output_30, target_30)
    torch.save(output_10, target_10)

  0%|          | 0/927 [00:00<?, ?it/s]

100%|██████████| 927/927 [35:33<00:00,  2.30s/it, 10_sec_shape=torch.Size([100, 1024]), 30_sec_shape=torch.Size([34, 1024])]


In [26]:
# Preview the first row of the most recently computed 10-second embeddings.
output_10[0, :]

tensor([-0.0026,  0.0030,  0.0062,  ...,  0.0084,  0.0103, -0.0066],
       device='cuda:0')