In [1]:
from tqdm import tqdm
import json

In [2]:
from datasets import load_dataset
# Load the "d0rj/audiocaps" dataset through Hugging Face `datasets`.
# The result is a DatasetDict with train/validation/test splits holding
# caption metadata only (ids, start time, caption) -- no audio columns.
dataset = load_dataset("d0rj/audiocaps")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Show the available splits with their column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['audiocap_id', 'youtube_id', 'start_time', 'caption'],
        num_rows: 49838
    })
    validation: Dataset({
        features: ['audiocap_id', 'youtube_id', 'start_time', 'caption'],
        num_rows: 2475
    })
    test: Dataset({
        features: ['audiocap_id', 'youtube_id', 'start_time', 'caption'],
        num_rows: 4875
    })
})

In [4]:
# Bind every split of the DatasetDict to its own top-level name.
train, validation, test = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

In [5]:
# Row count of each split, one per line (train, validation, test).
print(len(train), len(validation), len(test), sep="\n")

49838
2475
4875


In [6]:
# Peek at the first training record to see the field layout.
train[0]

{'audiocap_id': 91139,
 'youtube_id': 'r1nicOVtvkQ',
 'start_time': 130,
 'caption': 'A woman talks nearby as water pours'}

In [7]:
# Peek at the first validation record to see the field layout.
validation[0]

{'audiocap_id': 97151,
 'youtube_id': 'vfY_TJq7n_U',
 'start_time': 130,
 'caption': 'Rustling occurs, ducks quack and water splashes, followed by an adult female and adult male speaking and duck calls being blown'}

In [10]:
# Build the English training manifest: one {"path", "text"} record per caption.
# The records get their own name instead of reusing `dataset`, so the loaded
# DatasetDict is not overwritten and the earlier cells that index
# `dataset[...]` remain re-runnable.
train_records = [
    {
        # Audio clips are addressed by YouTube id under the train directory.
        'path': f"/dataset/audio-data/audiocaps/train/{x['youtube_id']}.wav",
        'text': x['caption'],
    }
    for x in tqdm(train)
]
# Explicit encoding keeps the written file independent of the platform locale.
with open("./train.json", "w", encoding="utf-8") as f:
    json.dump(train_records, f, indent=4)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49838/49838 [00:01<00:00, 32493.38it/s]


In [11]:
# Build the English validation manifest: one {"path", "text"} record per caption.
# The records get their own name instead of reusing `dataset`, so the loaded
# DatasetDict is not overwritten and the earlier cells that index
# `dataset[...]` remain re-runnable.
val_records = [
    {
        # Audio clips are addressed by YouTube id under the val directory.
        'path': f"/dataset/audio-data/audiocaps/val/{x['youtube_id']}.wav",
        'text': x['caption'],
    }
    for x in tqdm(validation)
]
# Explicit encoding keeps the written file independent of the platform locale.
with open("./val.json", "w", encoding="utf-8") as f:
    json.dump(val_records, f, indent=4)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2475/2475 [00:00<00:00, 34397.65it/s]


In [12]:
# Build the English test manifest: one {"path", "text"} record per caption.
# The records get their own name instead of reusing `dataset`, so the loaded
# DatasetDict is not overwritten and the earlier cells that index
# `dataset[...]` remain re-runnable.
test_records = [
    {
        # Audio clips are addressed by YouTube id under the test directory.
        'path': f"/dataset/audio-data/audiocaps/test/{x['youtube_id']}.wav",
        'text': x['caption'],
    }
    for x in tqdm(test)
]
# Explicit encoding keeps the written file independent of the platform locale.
with open("./test.json", "w", encoding="utf-8") as f:
    json.dump(test_records, f, indent=4)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4875/4875 [00:00<00:00, 34660.03it/s]


# Sanity Check

In [24]:
from pydub import AudioSegment

In [25]:
# Probe a single training clip to confirm the on-disk audio format.
audio = AudioSegment.from_file("./train/-02sZMlMYIk.wav")

# Frames (samples per channel) per second.
sampling_rate = audio.frame_rate

# pydub reports frame_width in bytes per frame, so
# bytes/frame * frames/second * 8 bits/byte gives bits per second.
bitrate = audio.frame_width * audio.frame_rate * 8

print(
    f"Frame Width: {audio.frame_width}",
    f"Sampling Rate: {sampling_rate/1000} kHz",
    f"Bitrate: {bitrate/1000} kbps",
    sep="\n",
)

Frame Width: 4
Sampling Rate: 16.0 kHz
Bitrate: 512.0 kbps


# AudioCaps translated (Thai)

In [14]:
import json

In [15]:
def read_jsonl(file_path):
    """Read a JSON Lines file into a list of parsed objects.

    Args:
        file_path: Path of the ``.jsonl`` file holding one JSON document
            per line.

    Returns:
        list: The parsed value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8: the default encoding is locale dependent and
    # would corrupt or reject non-ASCII (e.g. Thai) text on non-UTF-8 systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines (such as a trailing newline at the end of a shard) carry
        # no record, so skip them instead of failing in json.loads("").
        return [json.loads(line) for line in stripped_lines if line]

In [24]:
# Translation shards for the train split, listed in row order.
paths = [
    "/workspace/translate-data/outputs/audiocaps/train/0_16000.jsonl",
    "/workspace/translate-data/outputs/audiocaps/train/16000_32000.jsonl",
    "/workspace/translate-data/outputs/audiocaps/train/32000_49838.jsonl",
]
# Concatenate the shards so index i lines up with row i of `train`.
train_th = []
for path in paths:
    train_th += read_jsonl(path)
print(len(train_th))

# Fail fast on a size mismatch: without this check, surplus translations
# would be silently ignored by the loop below.
assert len(train_th) == len(train), (
    f"expected {len(train)} translated rows, got {len(train_th)}"
)

# Use a dedicated list name rather than `dataset`, which refers to the loaded
# DatasetDict at the top of the notebook.
train_th_records = []
# Wrap the dataset itself (not enumerate) so tqdm knows the total and can
# render a real progress bar.
for i, x in enumerate(tqdm(train)):
    # Row-by-row alignment check between the translation and the source split.
    assert train_th[i]['youtube_id'] == x['youtube_id']
    assert train_th[i]['audiocap_id'] == x['audiocap_id']

    # Keep only the text before the first "<|im_start|>" marker
    # (presumably a chat-template token leaked by the translator -- verify).
    text = train_th[i]['caption_th'].split("<|im_start|>")[0]

    train_th_records.append({
        'path': f"/dataset/audio-data/audiocaps/train/{x['youtube_id']}.wav",
        'text': text,
    })
# ensure_ascii=False writes raw Thai characters, so the file must be opened
# as UTF-8 explicitly instead of relying on the platform default encoding.
with open("./train_th.json", "w", encoding="utf-8") as f:
    json.dump(train_th_records, f, indent=4, ensure_ascii=False)

49838


49838it [00:01, 32455.66it/s]


In [27]:
# Thai translations for the validation split, one JSON object per row.
val_th = read_jsonl("/workspace/translate-data/outputs/audiocaps/val/0_2475.jsonl")
print(len(val_th))

# Fail fast on a size mismatch: without this check, surplus translations
# would be silently ignored by the loop below.
assert len(val_th) == len(validation), (
    f"expected {len(validation)} translated rows, got {len(val_th)}"
)

# Use a dedicated list name rather than `dataset`, which refers to the loaded
# DatasetDict at the top of the notebook.
val_th_records = []
# Wrap the dataset itself (not enumerate) so tqdm knows the total and can
# render a real progress bar.
for i, x in enumerate(tqdm(validation)):
    # Row-by-row alignment check between the translation and the source split.
    assert val_th[i]['youtube_id'] == x['youtube_id']
    assert val_th[i]['audiocap_id'] == x['audiocap_id']

    # Keep only the text before the first "<|im_start|>" marker
    # (presumably a chat-template token leaked by the translator -- verify).
    text = val_th[i]['caption_th'].split("<|im_start|>")[0]

    val_th_records.append({
        'path': f"/dataset/audio-data/audiocaps/val/{x['youtube_id']}.wav",
        'text': text,
    })
# ensure_ascii=False writes raw Thai characters, so the file must be opened
# as UTF-8 explicitly instead of relying on the platform default encoding.
with open("./val_th.json", "w", encoding="utf-8") as f:
    json.dump(val_th_records, f, indent=4, ensure_ascii=False)

2475


2475it [00:00, 32965.50it/s]


In [28]:
# Thai translations for the test split, one JSON object per row.
test_th = read_jsonl("/workspace/translate-data/outputs/audiocaps/test/0_4875.jsonl")
print(len(test_th))

# Fail fast on a size mismatch: without this check, surplus translations
# would be silently ignored by the loop below.
assert len(test_th) == len(test), (
    f"expected {len(test)} translated rows, got {len(test_th)}"
)

# Use a dedicated list name rather than `dataset`, which refers to the loaded
# DatasetDict at the top of the notebook.
test_th_records = []
# Wrap the dataset itself (not enumerate) so tqdm knows the total and can
# render a real progress bar.
for i, x in enumerate(tqdm(test)):
    # Row-by-row alignment check between the translation and the source split.
    assert test_th[i]['youtube_id'] == x['youtube_id']
    assert test_th[i]['audiocap_id'] == x['audiocap_id']

    # Keep only the text before the first "<|im_start|>" marker
    # (presumably a chat-template token leaked by the translator -- verify).
    text = test_th[i]['caption_th'].split("<|im_start|>")[0]

    test_th_records.append({
        'path': f"/dataset/audio-data/audiocaps/test/{x['youtube_id']}.wav",
        'text': text,
    })
# ensure_ascii=False writes raw Thai characters, so the file must be opened
# as UTF-8 explicitly instead of relying on the platform default encoding.
with open("./test_th.json", "w", encoding="utf-8") as f:
    json.dump(test_th_records, f, indent=4, ensure_ascii=False)

4875


4875it [00:00, 33247.53it/s]
