In [90]:
import os
import pandas as pd
import sys
import torchaudio
import torch
import numpy as np

In [76]:
main_folder = "../../.."
sys.path.append(main_folder)

# Next, we import the AudioRepresentation class from the audio_representation module.
from tools.audio_representation import AudioRepresentation

In [77]:
experiment_type = "pilot"

In [103]:
features_names = [
    {
        "name": "LogMelSpectrogram",
        "model_checkpoint": None,
        "extra_params": None
    },
    {
        "name": "HumanCochleagram",
        "model_checkpoint": None,
        "extra_params": None
    },
    {
        "name": "wav2vec2",
        "model_checkpoint": "facebook/wav2vec2-base-960h",
        "extra_params": {"layer_number": -1}
    },
    {
        "name": "EcapaTDNN",
        "model_checkpoint": None,
        "extra_params": None
    }
]

In [79]:
data_folder = f"../data/{experiment_type}/"

In [80]:
# Step 1: Read the CSV file
csv_file_path = f'{data_folder}/sample_scores.csv'
df = pd.read_csv(csv_file_path)
df

Unnamed: 0,filename,score
0,sub-3011_ses-t1_elision_brush_audio.wav,1
1,sub-3011_ses-t1_elision_air_audio.wav,1
2,sub-3011_ses-t1_elision_nut_audio.wav,1
3,sub-3011_ses-t1_elision_pop_audio.wav,1
4,sub-3011_ses-t1_elision_ball_audio.wav,1
...,...,...
61,sub-3011_ses-t1_nonword_chaseedoolid_audio.wav,1
62,sub-3011_ses-t1_nonword_bieleedoge_audio.wav,1
63,sub-3011_ses-t1_nonword_voesutoov_audio.wav,1
64,sub-3011_ses-t1_nonword_lisashrul_audio.wav,1


In [81]:
audio_files = []
for subdir, dirs, files in os.walk(data_folder):
    for file in files:
        if file.endswith(".wav"):
            #print(file)
            filepath = os.path.join(subdir, file)
            #print(filepath)
            participant = file.split('_')[0]
            #print(participant)
            subfolder_name = os.path.basename(subdir)
            #print(subfolder_name)

            df.loc[df['filename'] == file, 'filepath'] = filepath
            df.loc[df['filename'] == file, 'task'] = subfolder_name
            df.loc[df['filename'] == file, 'participant'] = participant

print(df.head())

                                  filename  score filepath task participant
0  sub-3011_ses-t1_elision_brush_audio.wav      1      NaN  NaN         NaN
1    sub-3011_ses-t1_elision_air_audio.wav      1      NaN  NaN         NaN
2    sub-3011_ses-t1_elision_nut_audio.wav      1      NaN  NaN         NaN
3    sub-3011_ses-t1_elision_pop_audio.wav      1      NaN  NaN         NaN
4   sub-3011_ses-t1_elision_ball_audio.wav      1      NaN  NaN         NaN


In [82]:
df = df.dropna()
df = df.reset_index(drop=True)
df

Unnamed: 0,filename,score,filepath,task,participant
0,sub-4069_ses-t1_blending_hammer_audio.wav,0,../data/pilot/Blending/sub-4069_ses-t1_blendin...,Blending,sub-4069
1,sub-4069_ses-t1_blending_hammer_audio.wav,0,../data/pilot/Blending/sub-4069_ses-t1_blendin...,Blending,sub-4069
2,sub-3011_ses-t1_blending_number_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011
3,sub-3011_ses-t1_blending_answer_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011
4,sub-3011_ses-t1_blending_it_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011
5,sub-3011_ses-t1_blending_toy_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011
6,sub-3011_ses-t1_blending_saw_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011
7,sub-3011_ses-t1_blending_she_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011
8,sub-3011_ses-t1_blending_nap_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011
9,sub-3011_ses-t1_blending_miss_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011


In [104]:
def extract_embeddings(filepath, model_name="EcapaTDNN", extra_params=None, model_checkpoint=None):
    audio_repr = AudioRepresentation(model_name=model_name, model_checkpoint=model_checkpoint, extra_params=extra_params)
    audio, sr = torchaudio.load(filepath)

    if audio_repr.contextual_encoding_exists:
        raw, filtered = audio_repr.contextual_encoding(audio)
    else:
        raw, filtered_encoder_response = audio_repr.pooled_temporal_encoding(audio)
        filtered = filtered_encoder_response[0]['global_mean_plus_max_pooling']
        print(filtered.shape)
    return filtered

for audio_representation_name in features_names:
    df[audio_representation_name['name']] = None 
    
    for index, row in df.iterrows():
        filename = row['filepath']
        print(filename)
        try:
            embedding = extract_embeddings(filename, model_name=audio_representation_name['name'], extra_params=audio_representation_name['extra_params'], model_checkpoint=audio_representation_name['model_checkpoint'])
            # Storing the embedding, which is assumed to be a numpy array, directly in the DataFrame
            df.at[index, audio_representation_name['name']] = [embedding.squeeze()]  # Embedding is assumed to be a numpy array
        except Exception as e:
            print(e)

../data/pilot/Blending/sub-4069_ses-t1_blending_hammer_audio.wav
torch.Size([128])
../data/pilot/Blending/sub-4069_ses-t1_blending_hammer_audio.wav
torch.Size([128])
../data/pilot/Blending/sub-3011_ses-t1_blending_number_audio.wav
torch.Size([128])
../data/pilot/Blending/sub-3011_ses-t1_blending_answer_audio.wav
torch.Size([128])
../data/pilot/Blending/sub-3011_ses-t1_blending_it_audio.wav
torch.Size([128])
../data/pilot/Blending/sub-3011_ses-t1_blending_toy_audio.wav
torch.Size([128])
../data/pilot/Blending/sub-3011_ses-t1_blending_saw_audio.wav
torch.Size([128])
../data/pilot/Blending/sub-3011_ses-t1_blending_she_audio.wav
torch.Size([128])
../data/pilot/Blending/sub-3011_ses-t1_blending_nap_audio.wav
torch.Size([128])
../data/pilot/Blending/sub-3011_ses-t1_blending_miss_audio.wav
torch.Size([128])
../data/pilot/Blending/sub-3011_ses-t1_blending_bone_audio.wav
cannot reshape tensor of 0 elements into shape [-1, 0] because the unspecified dimension size -1 can be any value and is ambi

  filts, hz_cutoffs, freqs = erb.make_erb_cos_filters_nx(batch_signal.shape[1],
  freqs_to_plot = np.log10(freqs)


torch.Size([85])
../data/pilot/Blending/sub-4069_ses-t1_blending_hammer_audio.wav
torch.Size([85])
../data/pilot/Blending/sub-3011_ses-t1_blending_number_audio.wav
torch.Size([85])
../data/pilot/Blending/sub-3011_ses-t1_blending_answer_audio.wav
torch.Size([85])
../data/pilot/Blending/sub-3011_ses-t1_blending_it_audio.wav
torch.Size([85])
../data/pilot/Blending/sub-3011_ses-t1_blending_toy_audio.wav
torch.Size([85])
../data/pilot/Blending/sub-3011_ses-t1_blending_saw_audio.wav
torch.Size([85])
../data/pilot/Blending/sub-3011_ses-t1_blending_she_audio.wav
torch.Size([85])
../data/pilot/Blending/sub-3011_ses-t1_blending_nap_audio.wav
torch.Size([85])
../data/pilot/Blending/sub-3011_ses-t1_blending_miss_audio.wav
torch.Size([85])
../data/pilot/Blending/sub-3011_ses-t1_blending_bone_audio.wav
min() arg is an empty sequence
../data/pilot/Blending/sub-3011_ses-t1_blending_moon_audio.wav
torch.Size([85])
../data/pilot/Blending/sub-3011_ses-t1_blending_stamp_audio.wav
torch.Size([85])
../data/

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Blending/sub-4069_ses-t1_blending_hammer_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Blending/sub-3011_ses-t1_blending_number_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Blending/sub-3011_ses-t1_blending_answer_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Blending/sub-3011_ses-t1_blending_it_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Blending/sub-3011_ses-t1_blending_toy_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Blending/sub-3011_ses-t1_blending_saw_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Blending/sub-3011_ses-t1_blending_she_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Blending/sub-3011_ses-t1_blending_nap_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Blending/sub-3011_ses-t1_blending_miss_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Blending/sub-3011_ses-t1_blending_bone_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Calculated padded input size per channel: (0). Kernel size: (10). Kernel size can't be greater than actual input size
../data/pilot/Blending/sub-3011_ses-t1_blending_moon_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Blending/sub-3011_ses-t1_blending_stamp_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Blending/sub-2009_ses-t1_blending_cowboy_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Blending/sub-2009_ses-t1_blending_seashell_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Blending/sub-2009_ses-t1_blending_answer_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Blending/sub-2009_ses-t1_blending_pencil_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Blending/sub-2009_ses-t1_blending_answer_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Blending/sub-2009_ses-t1_blending_mistake_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-2009_ses-t1_nonword_pate_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-2009_ses-t1_nonword_meb_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-2009_ses-t1_nonword_wudoip_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-2009_ses-t1_nonword_nigong_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-2009_ses-t1_nonword_bieleedoge_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-2009_ses-t1_nonword_voesutoov_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-2009_ses-t1_nonword_lisashrul_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-2009_ses-t1_nonword_wulanuwup_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-2009_ses-t1_nonword_teebudieshawlt_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-2009_ses-t1_nonword_viversoomouj_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-3105_ses-t1_nonword_pate_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-3105_ses-t1_nonword_meb_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-3105_ses-t1_nonword_wudoip_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-3105_ses-t1_nonword_nigong_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-3105_ses-t1_nonword_chaseedoolid_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-3105_ses-t1_nonword_bieleedoge_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-3105_ses-t1_nonword_voesutoov_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-3105_ses-t1_nonword_lisashrul_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-2047_ses-t1_nonword_wudoip_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-5260_ses-t1_nonword_jup_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-5260_ses-t1_nonword_zid_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-5260_ses-t1_nonword_chaseedoolid_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-3011_ses-t1_nonword_ral_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-3011_ses-t1_nonword_sart_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-3011_ses-t1_nonword_wudoip_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-3011_ses-t1_nonword_nigong_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-3011_ses-t1_nonword_chaseedoolid_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-3011_ses-t1_nonword_bieleedoge_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-3011_ses-t1_nonword_voesutoov_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Nonword Repetition/sub-3011_ses-t1_nonword_lisashrul_audio.wav


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.weight', 'projector.bias', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.masked_spec_embed', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([768])
../data/pilot/Blending/sub-4069_ses-t1_blending_hammer_audio.wav
../data/pilot/Blending/sub-4069_ses-t1_blending_hammer_audio.wav
../data/pilot/Blending/sub-3011_ses-t1_blending_number_audio.wav
../data/pilot/Blending/sub-3011_ses-t1_blending_answer_audio.wav
../data/pilot/Blending/sub-3011_ses-t1_blending_it_audio.wav
../data/pilot/Blending/sub-3011_ses-t1_blending_toy_audio.wav
../data/pilot/Blending/sub-3011_ses-t1_blending_saw_audio.wav
../data/pilot/Blending/sub-3011_ses-t1_blending_she_audio.wav
../data/pilot/Blending/sub-3011_ses-t1_blending_nap_audio.wav
../data/pilot/Blending/sub-3011_ses-t1_blending_miss_audio.wav
../data/pilot/Blending/sub-3011_ses-t1_blending_bone_audio.wav
Argument #4: Padding size should be less than the corresponding input dimension, but got: padding (2, 2) at dimension 2 of input [1, 80, 1]
../data/pilot/Blending/sub-3011_ses-t1_blending_moon_audio.wav
../data/pilot/Blending/sub-3011_ses-t1_blending_stamp_audio.wav
../data/pilot/Blendi

In [105]:
df

Unnamed: 0,filename,score,filepath,task,participant,LogMelSpectrogram,HumanCochleagram,wav2vec2,EcapaTDNN
0,sub-4069_ses-t1_blending_hammer_audio.wav,0,../data/pilot/Blending/sub-4069_ses-t1_blendin...,Blending,sub-4069,"[[tensor(7.8228), tensor(8.3147), tensor(8.922...","[[tensor(-10.0709, dtype=torch.float64), tenso...","[[tensor(0.1685), tensor(0.1591), tensor(0.109...","[[tensor(-32.3941), tensor(9.6529), tensor(-18..."
1,sub-4069_ses-t1_blending_hammer_audio.wav,0,../data/pilot/Blending/sub-4069_ses-t1_blendin...,Blending,sub-4069,"[[tensor(7.8228), tensor(8.3147), tensor(8.922...","[[tensor(-10.0709, dtype=torch.float64), tenso...","[[tensor(0.1685), tensor(0.1591), tensor(0.109...","[[tensor(-32.3941), tensor(9.6529), tensor(-18..."
2,sub-3011_ses-t1_blending_number_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011,"[[tensor(3.9143), tensor(4.6259), tensor(4.004...","[[tensor(-11.5520, dtype=torch.float64), tenso...","[[tensor(0.2022), tensor(0.1826), tensor(0.115...","[[tensor(-11.9478), tensor(7.8584), tensor(4.1..."
3,sub-3011_ses-t1_blending_answer_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011,"[[tensor(4.2990), tensor(2.9968), tensor(3.462...","[[tensor(-11.8145, dtype=torch.float64), tenso...","[[tensor(0.1569), tensor(0.1330), tensor(0.095...","[[tensor(-34.4127), tensor(-3.5166), tensor(-4..."
4,sub-3011_ses-t1_blending_it_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011,"[[tensor(3.5609), tensor(4.2532), tensor(5.632...","[[tensor(-11.8509, dtype=torch.float64), tenso...","[[tensor(0.2310), tensor(0.1953), tensor(0.364...","[[tensor(14.4482), tensor(-5.5010), tensor(-7...."
5,sub-3011_ses-t1_blending_toy_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011,"[[tensor(5.1996), tensor(3.8404), tensor(3.646...","[[tensor(-11.3315, dtype=torch.float64), tenso...","[[tensor(0.0100), tensor(0.0512), tensor(0.203...","[[tensor(25.8421), tensor(4.2384), tensor(32.8..."
6,sub-3011_ses-t1_blending_saw_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011,"[[tensor(5.4181), tensor(5.0872), tensor(5.217...","[[tensor(-11.4549, dtype=torch.float64), tenso...","[[tensor(0.0289), tensor(0.2130), tensor(0.014...","[[tensor(-5.9742), tensor(-24.8454), tensor(-1..."
7,sub-3011_ses-t1_blending_she_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011,"[[tensor(4.5519), tensor(5.1570), tensor(5.520...","[[tensor(-11.6660, dtype=torch.float64), tenso...","[[tensor(-0.0611), tensor(0.2238), tensor(0.29...","[[tensor(14.7106), tensor(48.8048), tensor(8.5..."
8,sub-3011_ses-t1_blending_nap_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011,"[[tensor(4.3947), tensor(4.3734), tensor(4.345...","[[tensor(-11.7231, dtype=torch.float64), tenso...","[[tensor(0.1571), tensor(0.1199), tensor(0.131...","[[tensor(-18.3186), tensor(-19.1605), tensor(-..."
9,sub-3011_ses-t1_blending_miss_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011,"[[tensor(4.3450), tensor(5.0774), tensor(5.967...","[[tensor(-11.5347, dtype=torch.float64), tenso...","[[tensor(0.0146), tensor(0.1114), tensor(0.181...","[[tensor(26.8308), tensor(30.4527), tensor(-0...."


In [110]:
df = df.dropna()
df = df.reset_index(drop=True)
df

Unnamed: 0,filename,score,filepath,task,participant,LogMelSpectrogram,HumanCochleagram,wav2vec2,EcapaTDNN
0,sub-4069_ses-t1_blending_hammer_audio.wav,0,../data/pilot/Blending/sub-4069_ses-t1_blendin...,Blending,sub-4069,"[[tensor(7.8228), tensor(8.3147), tensor(8.922...","[[tensor(-10.0709, dtype=torch.float64), tenso...","[[tensor(0.1685), tensor(0.1591), tensor(0.109...","[[tensor(-32.3941), tensor(9.6529), tensor(-18..."
1,sub-4069_ses-t1_blending_hammer_audio.wav,0,../data/pilot/Blending/sub-4069_ses-t1_blendin...,Blending,sub-4069,"[[tensor(7.8228), tensor(8.3147), tensor(8.922...","[[tensor(-10.0709, dtype=torch.float64), tenso...","[[tensor(0.1685), tensor(0.1591), tensor(0.109...","[[tensor(-32.3941), tensor(9.6529), tensor(-18..."
2,sub-3011_ses-t1_blending_number_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011,"[[tensor(3.9143), tensor(4.6259), tensor(4.004...","[[tensor(-11.5520, dtype=torch.float64), tenso...","[[tensor(0.2022), tensor(0.1826), tensor(0.115...","[[tensor(-11.9478), tensor(7.8584), tensor(4.1..."
3,sub-3011_ses-t1_blending_answer_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011,"[[tensor(4.2990), tensor(2.9968), tensor(3.462...","[[tensor(-11.8145, dtype=torch.float64), tenso...","[[tensor(0.1569), tensor(0.1330), tensor(0.095...","[[tensor(-34.4127), tensor(-3.5166), tensor(-4..."
4,sub-3011_ses-t1_blending_it_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011,"[[tensor(3.5609), tensor(4.2532), tensor(5.632...","[[tensor(-11.8509, dtype=torch.float64), tenso...","[[tensor(0.2310), tensor(0.1953), tensor(0.364...","[[tensor(14.4482), tensor(-5.5010), tensor(-7...."
5,sub-3011_ses-t1_blending_toy_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011,"[[tensor(5.1996), tensor(3.8404), tensor(3.646...","[[tensor(-11.3315, dtype=torch.float64), tenso...","[[tensor(0.0100), tensor(0.0512), tensor(0.203...","[[tensor(25.8421), tensor(4.2384), tensor(32.8..."
6,sub-3011_ses-t1_blending_saw_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011,"[[tensor(5.4181), tensor(5.0872), tensor(5.217...","[[tensor(-11.4549, dtype=torch.float64), tenso...","[[tensor(0.0289), tensor(0.2130), tensor(0.014...","[[tensor(-5.9742), tensor(-24.8454), tensor(-1..."
7,sub-3011_ses-t1_blending_she_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011,"[[tensor(4.5519), tensor(5.1570), tensor(5.520...","[[tensor(-11.6660, dtype=torch.float64), tenso...","[[tensor(-0.0611), tensor(0.2238), tensor(0.29...","[[tensor(14.7106), tensor(48.8048), tensor(8.5..."
8,sub-3011_ses-t1_blending_nap_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011,"[[tensor(4.3947), tensor(4.3734), tensor(4.345...","[[tensor(-11.7231, dtype=torch.float64), tenso...","[[tensor(0.1571), tensor(0.1199), tensor(0.131...","[[tensor(-18.3186), tensor(-19.1605), tensor(-..."
9,sub-3011_ses-t1_blending_miss_audio.wav,1,../data/pilot/Blending/sub-3011_ses-t1_blendin...,Blending,sub-3011,"[[tensor(4.3450), tensor(5.0774), tensor(5.967...","[[tensor(-11.5347, dtype=torch.float64), tenso...","[[tensor(0.0146), tensor(0.1114), tensor(0.181...","[[tensor(26.8308), tensor(30.4527), tensor(-0...."


In [None]:
# QUESTIONS:
# Do we have access to the ideal response? (the transcript of the nonword, sentence, ...)

In [112]:
# Function to convert a list of PyTorch tensors to a flattened NumPy array
def tensors_to_numpy(tensor_list):
    # Convert each tensor to a NumPy array, concatenate them, and then flatten the result
    numpy_array = np.concatenate([tensor.numpy() for tensor in tensor_list])
    return numpy_array

# Convert your 'LogMelSpectrogram' data to NumPy arrays
X = np.array([tensors_to_numpy(x) for x in df['LogMelSpectrogram']])
y = df['score'].values

# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(X, y)

In [None]:
## PREDICTION SHOULD BE DONE ON THE TEST SET, AS SOON AS WE HAVE MORE DATA I CAN WORK ON IT

In [113]:
# Predictions
y_pred = rf_classifier.predict(X)

In [116]:
# Evaluate the model
accuracy = accuracy_score(y, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Detailed classification report
print("Classification Report:")
print(classification_report(y, y_pred))

Accuracy: 97.92%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        22
           1       1.00      0.96      0.98        26

    accuracy                           0.98        48
   macro avg       0.98      0.98      0.98        48
weighted avg       0.98      0.98      0.98        48

