In [1]:
! pip install -q -U transformers==4.28.0 accelerate

[0m

In [2]:
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoFeatureExtractor, AutoTokenizer, AutoModelForAudioClassification, TrainingArguments, Trainer, pipeline
from datasets import Dataset, Features, Value, Audio, ClassLabel, DatasetInfo, NamedSplit, DatasetDict, load_dataset

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
# DAIC-WOZ working directory, resolved to an absolute path from the
# notebook's current working directory.
workspace = Path('../../Working/DAIC-WOZ').resolve()

# Folder holding one metadata CSV per split.
csv_root = workspace.joinpath('csv')

# Split name -> CSV path (insertion order: train, dev, test).
csvs = dict(
    train=csv_root.joinpath('train.csv'),
    dev=csv_root.joinpath('dev.csv'),
    test=csv_root.joinpath('test.csv'),
)

# Hub checkpoints used as candidate backbones for audio classification.
base_models = [
    'facebook/wav2vec2-base',
    'facebook/hubert-base-ls960',
    'microsoft/wavlm-base-plus',
]

In [4]:
def load_csv(csv, istest=False):
    """Read a split CSV and return it with the column layout used downstream.

    The source column ``audio_file`` is emitted three times: as ``file``,
    as ``audio`` and as ``id`` (the latter reduced to the file stem).

    Args:
        csv: Path or file-like object accepted by ``pandas.read_csv``.
        istest: When true, the participant column is read from
            ``participant_ID`` (lower-case ``p``) and no ``label`` column is
            produced.

    Returns:
        DataFrame with columns ``file``, ``audio``, ``text``, ``speaker_id``,
        ``chapter_id``, ``id`` and, unless ``istest``, ``label``.
    """
    # (output column, source column) pairs, in output order.
    column_pairs = [
        ('file', 'audio_file'),
        ('audio', 'audio_file'),
        ('text', 'script'),
        ('speaker_id', 'participant_ID' if istest else 'Participant_ID'),
        ('chapter_id', 'response_id'),
        ('id', 'audio_file'),
    ]
    if not istest:
        column_pairs.append(('label', 'PHQ8_Binary'))

    targets = [target for target, _ in column_pairs]
    sources = [source for _, source in column_pairs]

    df = pd.read_csv(csv)[sources]
    df.columns = targets

    # 'id' starts out as the audio path; keep only the file name without suffix.
    df['id'] = df['id'].map(lambda audio_path: Path(audio_path).stem)

    return df

In [5]:
def make_features_and_info(istest=False):
    """Build the ``features`` / ``info`` keyword arguments for a split.

    The ``audio`` column is declared as a plain string here; the notebook
    casts it to an ``Audio`` feature after the datasets are created.

    Args:
        istest: When true, the ``label`` feature is left out of the schema.

    Returns:
        Dict with keys ``'features'`` (a ``Features`` object) and ``'info'``
        (a ``DatasetInfo`` described as ``'DAIC-WOZ'`` carrying the same
        features).
    """
    schema = dict(
        file=Value(dtype='string'),
        audio=Value(dtype='string'),
        text=Value(dtype='string'),
        speaker_id=Value(dtype='int64'),
        chapter_id=Value(dtype='int64'),
        id=Value(dtype='string'),
    )

    # Only labelled splits carry the binary class column.
    if not istest:
        schema.update(
            label=ClassLabel(num_classes=2, names=['healthy', 'depressed']),
        )

    features = Features(schema)

    return dict(
        features=features,
        info=DatasetInfo(description='DAIC-WOZ', features=features),
    )

In [6]:
# Build one Dataset per split from its CSV, then decode the 'audio' column
# as 16 kHz mono audio.
splits = {}
for split_name, csv_path in tqdm(csvs.items(), total=len(csvs)):
    # The 'test' split has no label column and a differently named
    # participant column, so both helpers are told about it.
    is_test = split_name == 'test'
    splits[split_name] = Dataset.from_pandas(
        load_csv(csv_path, istest=is_test),
        **make_features_and_info(istest=is_test),
        split=NamedSplit(split_name),
    )

audio_feature = Audio(sampling_rate=16000, mono=True, decode=True)
dataset = DatasetDict(splits).cast_column('audio', audio_feature)

  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Display the DatasetDict: columns and row count of every split.
dataset

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id', 'label'],
        num_rows: 16906
    })
    dev: Dataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id', 'label'],
        num_rows: 6679
    })
    test: Dataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        num_rows: 8816
    })
})

In [8]:
# Display the training split on its own.
dataset['train']

Dataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id', 'label'],
    num_rows: 16906
})

In [9]:
# Check the schema: 'audio' should now be an Audio feature and 'label' a ClassLabel.
dataset['train'].features

{'file': Value(dtype='string', id=None),
 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
 'text': Value(dtype='string', id=None),
 'speaker_id': Value(dtype='int64', id=None),
 'chapter_id': Value(dtype='int64', id=None),
 'id': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=['healthy', 'depressed'], id=None)}

In [10]:
# Look at a single decoded example (indexing a row triggers audio decoding).
dataset['train'][0]

{'file': '/com.docker.devenvironments.code/Working/DAIC-WOZ/audio/train/303/segment/0.wav',
 'audio': {'path': '/com.docker.devenvironments.code/Working/DAIC-WOZ/audio/train/303/segment/0.wav',
  'array': array([ 0.01864624,  0.02023315,  0.02032471, ..., -0.00506592,
         -0.00537109, -0.00537109], dtype=float32),
  'sampling_rate': 16000},
 'text': "okay how 'bout yourself",
 'speaker_id': 303,
 'chapter_id': 1,
 'id': '0',
 'label': 0}

In [11]:
# Assemble an audio-classification pipeline around the first candidate
# checkpoint. The three from_pretrained calls do not depend on each other.
# The classification head is freshly initialised (see the load-time warning),
# so scores from this pipeline are not meaningful before fine-tuning.
feature_extractor = AutoFeatureExtractor.from_pretrained(base_models[0])
tokenizer = AutoTokenizer.from_pretrained(base_models[0])
model = AutoModelForAudioClassification.from_pretrained(base_models[0])

pipe = pipeline(
    task="audio-classification",
    model=model,
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceClassification: ['quantizer.weight_proj.weight', 'project_q.bias', 'project_q.weight', 'quantizer.weight_proj.bias', 'project_hid.weight', 'quantizer.codevectors', 'project_hid.bias']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.bias', 'classifier.weight', 'classifier

Downloading (…)okenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

In [12]:
# Re-cast the audio column to the sampling rate the pipeline's feature
# extractor reports, so decoded clips match what the model expects.
dataset = dataset.cast_column('audio', Audio(sampling_rate=pipe.feature_extractor.sampling_rate))

In [13]:
# Class names come from the ClassLabel feature of the training split.
labels = dataset["train"].features["label"].names

# Two-way lookup between class name and its index; indices are kept as strings.
label2id = {name: str(index) for index, name in enumerate(labels)}
id2label = {str(index): name for index, name in enumerate(labels)}

In [14]:
def get_array_from_audio(dataset):
    """Collect the ``"array"`` entry of every item under ``dataset["audio"]``.

    Args:
        dataset: Mapping whose ``"audio"`` value is an iterable of mappings,
            each with an ``"array"`` key (e.g. a sliced batch such as
            ``dataset['dev'][:5]``).

    Returns:
        List of the ``"array"`` values, in the same order as the input.
    """
    return [clip["array"] for clip in dataset["audio"]]

In [16]:
# Smoke-test the (not yet fine-tuned) pipeline on the first five dev clips.
pipe(get_array_from_audio(dataset['dev'][:5]))

[[{'score': 0.5021592378616333, 'label': 'LABEL_0'},
  {'score': 0.4978407323360443, 'label': 'LABEL_1'}],
 [{'score': 0.5028910040855408, 'label': 'LABEL_1'},
  {'score': 0.497109055519104, 'label': 'LABEL_0'}],
 [{'score': 0.5126659870147705, 'label': 'LABEL_0'},
  {'score': 0.4873340129852295, 'label': 'LABEL_1'}],
 [{'score': 0.5033653378486633, 'label': 'LABEL_1'},
  {'score': 0.4966345727443695, 'label': 'LABEL_0'}],
 [{'score': 0.5009957551956177, 'label': 'LABEL_1'},
  {'score': 0.4990042448043823, 'label': 'LABEL_0'}]]

In [17]:
# Dev-time docstring lookup via IPython's `?`; not needed for the analysis
# and safe to delete before sharing the notebook.
AutoModelForAudioClassification.from_pretrained?

[0;31mSignature:[0m [0mAutoModelForAudioClassification[0m[0;34m.[0m[0mfrom_pretrained[0m[0;34m([0m[0;34m*[0m[0mmodel_args[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Instantiate one of the model classes of the library (with a audio classification head) from a pretrained model.

The model class to instantiate is selected based on the `model_type` property of the config object (either
passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
falling back to using pattern matching on `pretrained_model_name_or_path`:

    - **audio-spectrogram-transformer** -- [`ASTForAudioClassification`] (Audio Spectrogram Transformer model)
    - **data2vec-audio** -- [`Data2VecAudioForSequenceClassification`] (Data2VecAudio model)
    - **hubert** -- [`HubertForSequenceClassification`] (Hubert model)
    - **sew** -- [`SEWForSequenceClassification`] (SEW model)
    - **sew-d** -- [

In [29]:
# Check that every candidate checkpoint can be loaded as an
# audio-classification pipeline; the pipelines themselves are discarded.
# The loop variable is deliberately not named `model`: that name is already
# bound to the loaded network used by `pipe`, and reusing it here would
# silently replace the network with a checkpoint-name string.
for checkpoint in base_models:
    pipeline(task='audio-classification', model=checkpoint)

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceClassification: ['quantizer.weight_proj.weight', 'project_q.bias', 'project_q.weight', 'quantizer.weight_proj.bias', 'project_hid.weight', 'quantizer.codevectors', 'project_hid.bias']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.bias', 'classifier.weight', 'classifier

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['projector.bias', 'classifier.weight', 'classifier.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)rocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.23k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of WavLMForSequenceClassification were not initialized from the model checkpoint at microsoft/wavlm-base-plus and are newly initialized: ['projector.bias', 'classifier.weight', 'classifier.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)rocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]