In [1]:
from pydub import AudioSegment

In [2]:
# Inspect one clip from ./clips/dev (the section that follows looks at the
# resampled version).
audio = AudioSegment.from_file("./clips/dev/10001.wav")

# frame_rate is in frames per second (Hz). pydub documents frame_width as the
# number of bytes in one frame (channels * sample width), so the uncompressed
# bitrate in bits per second is bytes/frame * 8 * frames/second.
sampling_rate = audio.frame_rate
bitrate = 8 * audio.frame_width * sampling_rate

# One print call, one report line per argument.
print(
    f"Frame Width: {audio.frame_width} (I think it's in Bytes)",
    f"Sampling Rate: {sampling_rate/1000} kHz",
    f"Bitrate: {bitrate/1000} kbps",
    sep="\n",
)

Frame Width: 2 (I think it's in Bytes)
Sampling Rate: 44.1 kHz
Bitrate: 705.6 kbps


### After resampling

In [18]:
# Inspect the file of the same name under ./wav_16k to check its format.
audio = AudioSegment.from_file("./wav_16k/10001.wav")

# frame_rate is in frames per second (Hz). pydub documents frame_width as the
# number of bytes in one frame (channels * sample width), so the uncompressed
# bitrate in bits per second is bytes/frame * 8 * frames/second.
sampling_rate = audio.frame_rate
bitrate = 8 * audio.frame_width * sampling_rate

# One print call, one report line per argument.
print(
    f"Frame Width: {audio.frame_width} (I think it's in Bytes)",
    f"Sampling Rate: {sampling_rate/1000} kHz",
    f"Bitrate: {bitrate/1000} kbps",
    sep="\n",
)

Frame Width: 2 (I think it's in Bytes)
Sampling Rate: 16.0 kHz
Bitrate: 256.0 kbps


## LTU-OpenASQA

In [8]:
import json
from tqdm import tqdm

In [4]:
# Load the OpenASQA JSON dump into memory.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale-dependent default encoding.
with open("../ltu-openasqa/openasqa_10.3M_v2.json", encoding="utf-8") as f:
    data = json.load(f)

In [25]:
# Load the Thai version of the OpenASQA JSON dump.
# The file may contain raw (non-escaped) Thai characters, so decode it as
# UTF-8 explicitly; the locale default would fail or produce mojibake on
# platforms whose default encoding is not UTF-8.
with open("../ltu-openasqa/openasqa_10.3M_v2.thai.json", encoding="utf-8") as f:
    data_thai = json.load(f)

In [5]:
# Number of top-level records loaded into `data`.
len(data)

10324230

In [10]:
# Collect the audio file name (last path component of 'audio_id') of every
# record whose 'dataset' tag contains the substring 'fsd'. One entry is
# appended per matching record, so a file name can appear more than once.
fsd50k_data = []
for x in tqdm(data):
    if 'fsd' not in x['dataset']:
        continue
    fsd50k_data.append(x['audio_id'].rsplit("/", 1)[-1])

100%|██████████████████████████████████████████████████████████████████████████████| 10324230/10324230 [00:04<00:00, 2304168.22it/s]


In [11]:
# Number of matching records (file names may repeat at this point).
len(fsd50k_data)

485385

In [12]:
# Deduplicate the file names. NOTE: this rebinds `fsd50k_data` from a list to
# a set, so its length changes from "records" to "distinct files".
fsd50k_data = set(fsd50k_data)

In [15]:
# Number of distinct audio file names after deduplication.
len(fsd50k_data)
# Observation: this matches the number of examples in clips/dev.

40966

# Prepare Data

In [20]:
import os

In [21]:
# Sanity-check the file-name extraction on the last record. The record is
# addressed explicitly (data[-1]) rather than through a loop variable left
# behind by another cell, so this cell does not depend on hidden kernel state.
data[-1]['audio_id'].split("/")[-1]

'Ses03F_script01_3_F033.wav'

In [19]:
# Show one full record to see the schema. The last record is addressed
# explicitly instead of relying on a loop variable leaked from another cell.
data[-1]

{'audio_id': '/data/sls/scratch/yuangong/dataset/IEMOCAP/Session3/sentences/wav/Ses03F_script01_3/Ses03F_script01_3_F033.wav',
 'instruction': 'What is the volume of the speech?',
 'output': 'The volume of the speech is very low.',
 'input': "Because you can't feel that way never again, Chris. You hear me? Never. I mean, the money and me too.",
 'dataset': 'IEMOCAP',
 'task': 'open-ended question'}

In [31]:
# Build the English QA examples: one entry per record whose 'dataset' tag
# contains 'fsd', pointing at the file of the same name in the wav_16k
# directory.
WAV_16K_DIR = "/dataset/audio-data/FSD50k/wav_16k"

fsd50k_examples = []
for x in tqdm(data):
    if 'fsd' not in x['dataset']:
        continue
    # Only the file name is kept; the directory part of 'audio_id' is replaced.
    wav_name = x['audio_id'].split("/")[-1]
    path = f"{WAV_16K_DIR}/{wav_name}"
    # Fail fast and name the offending file if an audio file is missing.
    assert os.path.isfile(path), f"missing audio file: {path}"
    fsd50k_examples.append({
        'path': path,
        'task': "QA",
        'Q': x['instruction'],
        'text': x['output'],
    })

100%|██████████████████████████████████████████████████████████████████████████████| 10324230/10324230 [00:06<00:00, 1525735.00it/s]


In [32]:
# Number of English QA examples built (one per matching record).
len(fsd50k_examples)

485385

In [33]:
# Spot-check a single example to eyeball the output schema.
fsd50k_examples[9999]

{'path': '/dataset/audio-data/FSD50k/wav_16k/97515.wav',
 'task': 'QA',
 'Q': 'What is the difference between the percussive and metallic sound and the comfortable and warm sound in terms of their acoustic features?',
 'text': 'The percussive and metallic sound has sharp attacks and sustained resonances, while the comfortable and warm sound is characterized by more gentle sustain and lower pitches'}

In [34]:
# Write the English examples as pretty-printed JSON.
# With the default ensure_ascii=True the output is pure ASCII; the encoding is
# still stated explicitly so the file never depends on the platform locale.
with open("./train.en.json", "w", encoding="utf-8") as f:
    json.dump(fsd50k_examples, f, indent=4)

In [35]:
# Build the Thai QA examples: one entry per record in `data_thai` whose
# 'dataset' tag contains 'fsd', pointing at the file of the same name in the
# wav_16k directory.
WAV_16K_DIR = "/dataset/audio-data/FSD50k/wav_16k"

fsd50k_examples_thai = []
for x in tqdm(data_thai):
    if 'fsd' not in x['dataset']:
        continue
    # Only the file name is kept; the directory part of 'audio_id' is replaced.
    wav_name = x['audio_id'].split("/")[-1]
    path = f"{WAV_16K_DIR}/{wav_name}"
    # Fail fast and name the offending file if an audio file is missing.
    assert os.path.isfile(path), f"missing audio file: {path}"
    fsd50k_examples_thai.append({
        'path': path,
        'task': "QA",
        'Q': x['instruction'],
        'text': x['output'],
    })

100%|██████████████████████████████████████████████████████████████████████████████| 10324230/10324230 [00:06<00:00, 1557448.58it/s]


In [36]:
# Write the Thai examples as pretty-printed JSON.
# ensure_ascii=False emits the Thai characters verbatim instead of \uXXXX
# escapes, so the file must be opened as UTF-8 explicitly: with the
# locale-dependent default encoding the write can raise UnicodeEncodeError on
# platforms whose default is not UTF-8.
with open("./train.th.json", "w", encoding="utf-8") as f:
    json.dump(fsd50k_examples_thai, f, indent=4, ensure_ascii=False)

In [37]:
# Spot-check one Thai example (same index, 9999, as the English spot-check).
fsd50k_examples_thai[9999]

{'path': '/dataset/audio-data/FSD50k/wav_16k/97515.wav',
 'task': 'QA',
 'Q': 'อะไรคือความแตกต่างระหว่างเสียงเพอร์คัสชั่นและโลหะกับเสียงที่สบายและอบอุ่นในแง่ของคุณสมบัติทางเสียง?',
 'text': 'เสียงเพอร์คัสชั่นและโลหะมีการโจมตีที่เฉียบคมและมีเสียงสะท้อนที่ยั่งยืน ในขณะที่เสียงที่สบายและอบอุ่นมีเอกลักษณ์เฉพาะด้วยเสียงที่ยาวนานกว่าและความถี่ที่ต่ำกว่า'}

In [38]:
# Number of Thai QA examples built (one per matching record in `data_thai`).
len(fsd50k_examples_thai)

485385