In [1]:
from __future__ import annotations

import pathlib
from typing import Optional, Any

import transformers
import wandb
import torch
from torch.utils.data import DataLoader
import typer
import yaml
import torchdata.datapipes as dp
import json
import time
from tqdm import tqdm

import audiocap.metrics
import audiocap.data
import audiocap.callbacks
import audiocap.models

ModuleNotFoundError: No module named 'audiocap'

In [5]:
import sys
sys.executable
!/storage/plzen1/home/ahajek/miniconda3/envs/malach23/bin/python -m pip show auidocap

[0m

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
def get_whisper_model(
    config_name: str,
    config: transformers.WhisperConfig,
    load_checkpoint: pathlib.Path | None,
    use_pretrained_whisper_encoder: bool,
    use_pretrained_whisper_decoder: bool,
) -> audiocap.WhisperForAudioCaptioning:
    
    if load_checkpoint is not None:
        model = audiocap.WhisperForAudioCaptioning.from_pretrained(load_checkpoint)
        assert isinstance(model, audiocap.WhisperForAudioCaptioning)
        return model

    if use_pretrained_whisper_encoder and use_pretrained_whisper_decoder:
        model = audiocap.WhisperForAudioCaptioning.from_pretrained(config_name)
        assert isinstance(model, audiocap.WhisperForAudioCaptioning)
        return model
    
    if not use_pretrained_whisper_encoder and not use_pretrained_whisper_decoder:
        return audiocap.WhisperForAudioCaptioning(config)
    
    model_pretrained = audiocap.WhisperForAudioCaptioning.from_pretrained(config_name)
    assert isinstance(model_pretrained, audiocap.WhisperForAudioCaptioning)
    model = audiocap.WhisperForAudioCaptioning(config)

    if use_pretrained_whisper_encoder:
        model.model.encoder = model_pretrained.get_encoder()

    if use_pretrained_whisper_decoder:
        model.model.decoder = model_pretrained.get_decoder()
    
    del model_pretrained
    return model

In [None]:
import os
assert torch.cuda.is_available()
!nvidia-smi

Wed May  3 19:01:22 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Quadro RTX 5000                 On | 00000000:C1:00.0 Off |                  Off |
| 33%   26C    P8               13W / 230W|   6327MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
torch.cuda.device_count()

1

In [None]:
for i in range(torch.cuda.device_count()):
    print(i, torch.cuda.get_device_properties(i))

0 _CudaDeviceProperties(name='Quadro RTX 5000', major=7, minor=5, total_memory=16117MB, multi_processor_count=48)


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tensor([1, 2, 3], device='cuda:0')

In [None]:
ntb_dir = pathlib.Path("/storage/plzen1/home/ahajek/10semester/audio_challenge/audio-captioning/audiocap/")
load_checkpoint = ntb_dir / pathlib.Path("../checkpoints/stoic-totem-29/checkpoint-18900")
dataset_dir = ntb_dir / pathlib.Path("../data/clotho_v2.1/audiofolder/")
output_dir = ntb_dir / pathlib.Path("../inference_outputs")
generate_config = ntb_dir / pathlib.Path("../configs/generate_clotho_test.yaml")


In [None]:
with open(generate_config, "r") as f:
    generate_config_dict: dict = yaml.safe_load(f)

In [None]:
generate_args_dict = generate_config_dict["generate_args"]
architecture_config = generate_config_dict["architecture"]
architecture_name = architecture_config["name"]
use_pretrained_encoder = architecture_config["use_pretrained_whisper_encoder"]
use_pretrained_decoder = architecture_config["use_pretrained_whisper_decoder"]

only_allowed_tokens = generate_config_dict["only_allowed_tokens"]
batch_size = generate_config_dict["batch_size"]

data_config = generate_config_dict["data"]
dataset_name = data_config["dataset_name"]
task = data_config["task"]
dataset_type = data_config["dataset_type"]
data_limit = data_config["data_limit"]
run_name = load_checkpoint.parent.name  

print("run name: ", run_name)
print("dataset_path: ", dataset_dir)

run name:  stoic-totem-29
dataset_path:  /storage/plzen1/home/ahajek/10semester/audio_challenge/audio-captioning/audiocap/../data/clotho_v2.1/audiofolder


In [None]:
config = transformers.WhisperConfig.from_pretrained(architecture_name)
tokenizer = transformers.WhisperTokenizer.from_pretrained(architecture_name, language="en", task="transcribe")
feature_extractor = transformers.WhisperFeatureExtractor.from_pretrained(architecture_name)
collator = audiocap.data.DataCollatorAudioSeq2SeqWithPadding(tokenizer, feature_extractor, keep_filename=True)
assert isinstance(config, transformers.WhisperConfig)
model = get_whisper_model(architecture_name, config, load_checkpoint, use_pretrained_encoder, use_pretrained_decoder)

tuned_params = sum(p.shape.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.shape.numel() for p in model.parameters())
print(f"Number of trained parameters: {tuned_params}/{total_params} = {tuned_params/total_params*100:.2f}%")

Number of trained parameters: 1543304960/1543304960 = 100.00%


In [None]:
model.to(device)

WhisperForAudioCaptioning(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
   

In [None]:
# set eval dataset
audioset_dir = None
audiocaps_dir = None
clotho_dir = None

if dataset_name == "clotho":
    ds = audiocap.data.load_clotho(dataset_dir, tokenizer, feature_extractor, None, 0, 0, 0)
elif dataset_name == "audioset":
    ds = audiocap.data.load_audioset(dataset_dir, tokenizer, feature_extractor, None, 0, 0, 0)
elif dataset_name == "audiocaps":
    ds = audiocap.data.load_audiocaps(dataset_dir, tokenizer, feature_extractor, None, 0, 0, 0)
else:
    raise ValueError(f"Unknown dataset name: {dataset_name}")

In [None]:
# generate captions (batched)
dataloader = DataLoader(ds[dataset_type].pipe, batch_size=batch_size, collate_fn=collator, num_workers=4)
len(dataloader)

In [31]:
# generate captions (batched)
dataloader = DataLoader(ds[dataset_type].pipe, batch_size=batch_size, collate_fn=collator, num_workers=4)
all_predictions = []
for i, batch in enumerate(dataloader):
    if data_limit and i >= data_limit/batch_size:
        break
    filenames = batch.pop("file_name")
    prediction_ids = model.generate(batch["input_features"].to(device), 
                                    forced_ac_decoder_ids=batch["forced_ac_decoder_ids"].to(device), 
                                    **generate_args_dict)
    predictions = tokenizer.batch_decode(prediction_ids, skip_special_tokens=False)
    predictions_dict = [{"file_name": fn, "caption": p} for fn, p in zip(filenames, predictions)]
    all_predictions.extend([{"file_name": fn, "caption": p} for fn, p in zip(filenames, predictions)])
    [print(p) for p in predictions_dict]

{'file_name': '20070720.rbd.chicharras.02.wav', 'caption': '<|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: A motor vehicle engine is running and vibrating, and people are talking in the background<|endoftext|>'}
{'file_name': 'Appartment_Ambient_AC_TV_Fans.wav', 'caption': '<|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: A boat motor is running and fades<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'}
{'file_name': 'kikkers.wav', 'caption': '<|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: Wind blows and an animal breathes<|endoftext|><|endoftext|>'}
{'file_name': 'Rushing_water+wind-Rec_Samsung_HMX-F80_Camcorder.wav', 'caption': '<|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: Water is running and gurgling<|endoftext|>'}
{'file_name': 'Outdoors, Cars pass by

In [23]:
all_predictions

[{'file_name': 'People Working - Inner Courtyard - Church Bells.wav',
  'caption': '<|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: Music plays nearby as a person sprays on a surface<|endoftext|>'},
 {'file_name': 'Air door sound.wav',
  'caption': '<|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: Wind blows and a vehicle approaches<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'},
 {'file_name': 'Rainy weather.wav',
  'caption': '<|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: A large motor vehicle engine is running and vibrating<|endoftext|>'},
 {'file_name': 'Trsat_Afternoon_--.wav',
  'caption': '<|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: A bus engine hums<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'},
 {'file_name': 'fireworks.wav',
  'caption': '<|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption:

In [24]:
# generate captions (filename)
all_predictions = []
for i, batch in tqdm(enumerate(DataLoader(ds[dataset_type].pipe, batch_size=1))):
    if data_limit and i >= data_limit:
        break

    batch_features = [{"input_features": x["input_features"]} for x in [batch]]
    batch_forced_ac_decoder_ids = [x["forced_ac_decoder_ids"] for x in [batch]]

    col_batch = feature_extractor.pad(batch_features, return_tensors="pt")
    col_batch["forced_ac_decoder_ids"] = torch.tensor(batch_forced_ac_decoder_ids)

    prediction_ids = model.generate(col_batch["input_features"][0].to(device), 
                                    forced_ac_decoder_ids=col_batch["forced_ac_decoder_ids"].to(device), 
                                    **generate_args_dict)
    prediction = tokenizer.batch_decode(prediction_ids, skip_special_tokens=False)[0]
    all_predictions.append({"file_name": batch["file_name"], "caption": prediction})
    print(batch["file_name"], prediction)

['People Working - Inner Courtyard - Church Bells.wav'] <|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: Music plays nearby as a person sprays on a surface<|endoftext|>
['Rainy weather.wav'] <|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: A large motor vehicle engine is running and vibrating<|endoftext|>
['fireworks.wav'] <|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: Several explosions and fireworks with some whistling and humming<|endoftext|>
['TREADMILL_GYM-AMB.wav'] <|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: A machine is spraying a surface<|endoftext|>
['Air door sound.wav'] <|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: Wind blows and a vehicle approaches<|endoftext|>
['Trsat_Afternoon_--.wav'] <|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: A bus engine hums<|endoftext|>
['Cup_spoon_hotwater.wav'] <|startoftranscri

In [54]:
all_predictions

[{'file_name': ['People Working - Inner Courtyard - Church Bells.wav'],
  'caption': '<|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: Music plays nearby as a person sprays on a surface<|endoftext|>'},
 {'file_name': ['Rainy weather.wav'],
  'caption': '<|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: A large motor vehicle engine is running and vibrating<|endoftext|>'},
 {'file_name': ['fireworks.wav'],
  'caption': '<|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: Several explosions and fireworks with some whistling and humming<|endoftext|>'},
 {'file_name': ['TREADMILL_GYM-AMB.wav'],
  'caption': '<|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: A machine is spraying a surface<|endoftext|>'},
 {'file_name': ['Air door sound.wav'],
  'caption': '<|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: Wind blows and a vehicle approaches<|endoftext|>'},
 {'file_name

In [32]:
# save predictions as jsonl to output_dir
log_dir = output_dir / "logs"
output_dir.mkdir(parents=True, exist_ok=True)
log_dir.mkdir(parents=True, exist_ok=True)
timestamp = round(time.time())
with open(output_dir / f"{run_name}_{dataset_name}_{dataset_type}_{timestamp}.jsonl", "w") as f:
    for prediction in all_predictions:
        f.write(json.dumps(prediction) + "\n")

# copy generate_config to log_dir
generate_config_dst = log_dir / f"{timestamp}.yaml"
generate_config_dst.write_text(generate_config.read_text())

335

In [32]:
tokenizer.decode(prediction[0])

'<|startoftranscript|><|en|><|transcribe|><|notimestamps|>clotho > caption: A motor vehicle engine is running and vibrating, and people are talking in the background<|endoftext|>'

In [47]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

NameError: name 'device' is not defined

<audiocap.data.DataCollatorAudioSeq2SeqWithPadding at 0x1473d0ef2790>

In [None]:
dataloader