In [8]:
from tqdm import tqdm
import json
import os

In [2]:
# Load the "d0rj/audiocaps" dataset through Hugging Face `datasets`.
from datasets import load_dataset
dataset = load_dataset("d0rj/audiocaps")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Display the DatasetDict repr to check the available splits, features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['audiocap_id', 'youtube_id', 'start_time', 'caption'],
        num_rows: 49838
    })
    validation: Dataset({
        features: ['audiocap_id', 'youtube_id', 'start_time', 'caption'],
        num_rows: 2475
    })
    test: Dataset({
        features: ['audiocap_id', 'youtube_id', 'start_time', 'caption'],
        num_rows: 4875
    })
})

In [4]:
# Bind each split of the DatasetDict to its own name for the cells below.
train, validation, test = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [5]:
# Row count of each split, printed in train / validation / test order.
for split_rows in (train, validation, test):
    print(len(split_rows))

49838
2475
4875


In [6]:
# Peek at the first training record to see the field layout.
train[0]

{'audiocap_id': 91139,
 'youtube_id': 'r1nicOVtvkQ',
 'start_time': 130,
 'caption': 'A woman talks nearby as water pours'}

In [7]:
# Peek at the first validation record to see the field layout.
validation[0]

{'audiocap_id': 97151,
 'youtube_id': 'vfY_TJq7n_U',
 'start_time': 130,
 'caption': 'Rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown'}

In [9]:
# Build the English training manifest: one {'path', 'text'} record per caption
# whose clip exists on disk; captions whose .wav file is missing are skipped.
# The records get their own name so this cell does not overwrite the `dataset`
# DatasetDict that the split-selection cell indexes.
train_records = []
for row in tqdm(train):
    wav_path = f"/dataset/audio-data/audiocaps/train/{row['youtube_id']}.wav"
    if not os.path.isfile(wav_path):
        continue
    train_records.append({
        'path': wav_path,
        'text': row['caption'],
    })
print(len(train_records))
with open("./train.json", "w", encoding="utf-8") as f:
    json.dump(train_records, f, indent=4)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49838/49838 [00:01<00:00, 29340.12it/s]


48267


In [11]:
# Build the English validation manifest: one {'path', 'text'} record per caption
# whose clip exists on disk; captions whose .wav file is missing are skipped.
# The records get their own name so this cell does not overwrite the `dataset`
# DatasetDict that the split-selection cell indexes.
val_records = []
for row in tqdm(validation):
    wav_path = f"/dataset/audio-data/audiocaps/val/{row['youtube_id']}.wav"
    if not os.path.isfile(wav_path):
        continue
    val_records.append({
        'path': wav_path,
        'text': row['caption'],
    })
print(len(val_records))
with open("./val.json", "w", encoding="utf-8") as f:
    json.dump(val_records, f, indent=4)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2475/2475 [00:00<00:00, 24743.18it/s]

2335





In [12]:
# Build the English test manifest: one {'path', 'text'} record per caption
# whose clip exists on disk; captions whose .wav file is missing are skipped.
# The records get their own name so this cell does not overwrite the `dataset`
# DatasetDict that the split-selection cell indexes.
test_records = []
for row in tqdm(test):
    wav_path = f"/dataset/audio-data/audiocaps/test/{row['youtube_id']}.wav"
    if not os.path.isfile(wav_path):
        continue
    test_records.append({
        'path': wav_path,
        'text': row['caption'],
    })
print(len(test_records))
with open("./test.json", "w", encoding="utf-8") as f:
    json.dump(test_records, f, indent=4)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4875/4875 [00:00<00:00, 29305.87it/s]

4690





# Sanity Check

In [24]:
from pydub import AudioSegment

In [25]:
# Probe a single training clip and report its basic PCM parameters.
audio = AudioSegment.from_file("./train/-02sZMlMYIk.wav")

# Samples per second, as reported by pydub.
sampling_rate = audio.frame_rate

# pydub documents frame_width as the number of bytes per frame, so
# bytes/frame * 8 bits * frames/second gives bits per second.
bits_per_frame = audio.frame_width * 8
bitrate = bits_per_frame * audio.frame_rate

print(
    f"Frame Width: {audio.frame_width}",
    f"Sampling Rate: {sampling_rate/1000} kHz",
    f"Bitrate: {bitrate/1000} kbps",
    sep="\n",
)

Frame Width: 4
Sampling Rate: 16.0 kHz
Bitrate: 512.0 kbps


# AudioCaps translated (Thai)

In [14]:
import json

In [15]:
def read_jsonl(file_path):
    """Read a JSON Lines file into a list of decoded objects.

    Args:
        file_path: Path to a text file holding one JSON document per line.

    Returns:
        list: The decoded value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Decode as UTF-8 explicitly so non-ASCII text (e.g. Thai captions) is read
    # the same way regardless of the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Tolerate blank lines (such as a trailing newline at end of file).
            if not line:
                continue
            data.append(json.loads(line))
    return data

In [16]:
# Shards of the Thai translation output for the train split, concatenated in
# listed order. Presumably each file name encodes its row range; verify against
# the translation job.
paths = [
    "/workspace/translate-data/outputs/audiocaps/train/0_16000.jsonl",
    "/workspace/translate-data/outputs/audiocaps/train/16000_32000.jsonl",
    "/workspace/translate-data/outputs/audiocaps/train/32000_49838.jsonl",
]
train_th = []
for shard_path in paths:
    train_th += read_jsonl(shard_path)
print(len(train_th))

# Translated rows are matched to `train` purely by position, so fail early with
# a clear message if the two sequences differ in length.
assert len(train_th) == len(train), (
    f"translated rows ({len(train_th)}) != train rows ({len(train)})"
)

# Own name for the records so the `dataset` DatasetDict is not overwritten here.
train_th_records = []
# enumerate() hides the length from tqdm; pass it explicitly so the bar shows progress.
for i, row in tqdm(enumerate(train), total=len(train)):
    wav_path = f"/dataset/audio-data/audiocaps/train/{row['youtube_id']}.wav"
    if not os.path.isfile(wav_path):
        continue

    translated = train_th[i]
    # Guard against index drift between the split and the translation output.
    assert translated['youtube_id'] == row['youtube_id']
    assert translated['audiocap_id'] == row['audiocap_id']

    # Keep only the text that precedes the first "<|im_start|>" marker.
    text = translated['caption_th'].split("<|im_start|>")[0]

    train_th_records.append({
        'path': wav_path,
        'text': text,
    })
print(len(train_th_records))
# ensure_ascii=False writes raw Thai characters, so pin the file encoding to UTF-8
# instead of relying on the platform default.
with open("./train_th.json", "w", encoding="utf-8") as f:
    json.dump(train_th_records, f, indent=4, ensure_ascii=False)

49838


49838it [00:01, 27936.87it/s]


48267


In [17]:
# Thai translation output for the validation split.
val_th = read_jsonl("/workspace/translate-data/outputs/audiocaps/val/0_2475.jsonl")
print(len(val_th))

# Translated rows are matched to `validation` purely by position, so fail early
# with a clear message if the two sequences differ in length.
assert len(val_th) == len(validation), (
    f"translated rows ({len(val_th)}) != validation rows ({len(validation)})"
)

# Own name for the records so the `dataset` DatasetDict is not overwritten here.
val_th_records = []
# enumerate() hides the length from tqdm; pass it explicitly so the bar shows progress.
for i, row in tqdm(enumerate(validation), total=len(validation)):
    wav_path = f"/dataset/audio-data/audiocaps/val/{row['youtube_id']}.wav"
    if not os.path.isfile(wav_path):
        continue

    translated = val_th[i]
    # Guard against index drift between the split and the translation output.
    assert translated['youtube_id'] == row['youtube_id']
    assert translated['audiocap_id'] == row['audiocap_id']

    # Keep only the text that precedes the first "<|im_start|>" marker.
    text = translated['caption_th'].split("<|im_start|>")[0]

    val_th_records.append({
        'path': wav_path,
        'text': text,
    })
print(len(val_th_records))
# ensure_ascii=False writes raw Thai characters, so pin the file encoding to UTF-8
# instead of relying on the platform default.
with open("./val_th.json", "w", encoding="utf-8") as f:
    json.dump(val_th_records, f, indent=4, ensure_ascii=False)

2475


2475it [00:00, 27935.61it/s]

2335





In [18]:
# Thai translation output for the test split.
test_th = read_jsonl("/workspace/translate-data/outputs/audiocaps/test/0_4875.jsonl")
print(len(test_th))

# Translated rows are matched to `test` purely by position, so fail early with
# a clear message if the two sequences differ in length.
assert len(test_th) == len(test), (
    f"translated rows ({len(test_th)}) != test rows ({len(test)})"
)

# Own name for the records so the `dataset` DatasetDict is not overwritten here.
test_th_records = []
# enumerate() hides the length from tqdm; pass it explicitly so the bar shows progress.
for i, row in tqdm(enumerate(test), total=len(test)):
    wav_path = f"/dataset/audio-data/audiocaps/test/{row['youtube_id']}.wav"
    if not os.path.isfile(wav_path):
        continue

    translated = test_th[i]
    # Guard against index drift between the split and the translation output.
    assert translated['youtube_id'] == row['youtube_id']
    assert translated['audiocap_id'] == row['audiocap_id']

    # Keep only the text that precedes the first "<|im_start|>" marker.
    text = translated['caption_th'].split("<|im_start|>")[0]

    test_th_records.append({
        'path': wav_path,
        'text': text,
    })
print(len(test_th_records))
# ensure_ascii=False writes raw Thai characters, so pin the file encoding to UTF-8
# instead of relying on the platform default.
with open("./test_th.json", "w", encoding="utf-8") as f:
    json.dump(test_th_records, f, indent=4, ensure_ascii=False)

4875


4875it [00:00, 28503.09it/s]

4690





# Subset of Test set

In [20]:
import random

In [25]:
# Thai translation output for the test split.
test_th = read_jsonl("/workspace/translate-data/outputs/audiocaps/test/0_4875.jsonl")
print(len(test_th))

# Translated rows are matched to `test` purely by position, so fail early with
# a clear message if the two sequences differ in length.
assert len(test_th) == len(test), (
    f"translated rows ({len(test_th)}) != test rows ({len(test)})"
)

# Bilingual test records (Thai + English caption per existing clip). The list is
# kept under the name `dataset` because the subset-selection cell reads that name.
dataset = []
# enumerate() hides the length from tqdm; pass it explicitly so the bar shows progress.
for i, row in tqdm(enumerate(test), total=len(test)):
    wav_path = f"/dataset/audio-data/audiocaps/test/{row['youtube_id']}.wav"
    if not os.path.isfile(wav_path):
        continue

    translated = test_th[i]
    # Guard against index drift between the split and the translation output.
    assert translated['youtube_id'] == row['youtube_id']
    assert translated['audiocap_id'] == row['audiocap_id']

    # Keep only the text that precedes the first "<|im_start|>" marker.
    text = translated['caption_th'].split("<|im_start|>")[0]

    dataset.append({
        'path': wav_path,
        'text_th': text,
        'text_en': row['caption'],
    })
print(len(dataset))

4875


4875it [00:00, 28607.65it/s]

4690





In [26]:
# Pick the 1000-example subset with a fixed seed so the selection is reproducible
# across kernel restarts. A copy is shuffled so `dataset` keeps its order and
# re-running this cell always yields the same `test_1k`.
shuffled = list(dataset)
random.Random(42).shuffle(shuffled)
test_1k = shuffled[:1000]

In [27]:
# Confirm the size of the test subset.
len(test_1k)

1000

In [29]:
# Preview only the first few records; displaying all 1000 floods the notebook output.
test_1k[:5]

[{'path': '/dataset/audio-data/audiocaps/test/-EaZ7EJJUl0.wav',
  'text_th': 'มีคนพูดพร้อมเสียงฮัมเบาๆ และเสียงกริ๊งใกล้ตัว',
  'text_en': 'A person speaks with distant humming and nearby clinking'},
 {'path': '/dataset/audio-data/audiocaps/test/j1AiqT5oHZc.wav',
  'text_th': 'ผู้ชายที่เป็นผู้ใหญ่พูดอย่างลังเล และเสียงบี๊บแบบอิเล็กทรอนิกส์ก็เกิดขึ้นโดยสุ่ม',
  'text_en': 'An adult male speaks hesitantly, and electronic beeps randomly occur'},
 {'path': '/dataset/audio-data/audiocaps/test/-mhFGevxLUg.wav',
  'text_th': 'ผู้ชายพูดพร้อมเสียงพึมพำในเบื้องหลัง',
  'text_en': 'A man speaking with murmuring in the background'},
 {'path': '/dataset/audio-data/audiocaps/test/CvNAwby6Xos.wav',
  'text_th': 'ผู้ชายพูดเหมือนจักรเย็บผ้าทำงานอย่างรวดเร็วและฮัมเพลง',
  'text_en': 'A man talking as a sewing machine rapidly operates and hums'},
 {'path': '/dataset/audio-data/audiocaps/test/3kBlVLkN0zo.wav',
  'text_th': 'ลมพัดและผู้คนพูดก่อนแพะส่งเสียงร้องทั้งใกล้และไกล โดยมีนกที่อยู่ห่างไกลส่งเสียงร้อ

In [30]:
# ensure_ascii=False emits raw Thai characters, so write the file as UTF-8
# explicitly rather than relying on the platform's default encoding.
with open("./test_1000_enth.json", "w", encoding="utf-8") as f:
    json.dump(test_1k, f, indent=4, ensure_ascii=False)