<a href="https://colab.research.google.com/github/rimbarbar/LL_Final_Project/blob/main/LL_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# Install dependencies
!apt-get install -y ffmpeg
!pip install --upgrade transformers==4.51.3 datasets==3.5.0 torch==2.6.0 evaluate==0.4.3 pydub==0.25.1 audiomentations==0.40.0 scikit-learn==1.6.1

import os
import shutil
import time
import numpy as np
from google.colab import files
from pydub import AudioSegment
from datasets import load_dataset
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, TrainingArguments, Trainer, Wav2Vec2Processor, Wav2Vec2CTCTokenizer
from audiomentations import Compose, AddGaussianNoise, PitchShift, TimeStretch, Gain
from sklearn.model_selection import KFold
import evaluate
import re
import librosa
import torch

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


In [14]:
# Clear disk space and Hugging Face cache
def clear_disk_and_cache():
    print("Clearing disk space and Hugging Face cache...")
    shutil.rmtree("/root/.cache/huggingface", ignore_errors=True)
    shutil.rmtree("/content/results", ignore_errors=True)
    shutil.rmtree("/content/logs", ignore_errors=True)
    for d in os.listdir("/content"):
        if d.startswith("facebook") or "wav2vec2" in d:
            shutil.rmtree(os.path.join("/content", d), ignore_errors=True)
    print("Disk and cache cleared.")

# Clear previous files
def clear_workspace():
    print("Clearing workspace...")
    shutil.rmtree("/content/LL_final_project", ignore_errors=True)
    shutil.rmtree("/content/data", ignore_errors=True)
    for f in os.listdir("/content"):
        if f.endswith(".m4a"):
            os.remove(os.path.join("/content", f))
    print("Workspace cleared.")

In [15]:
# Step 1: Upload and organize M4A files
def organize_m4a_files(m4a_dir):
    print("Creating flat directory...")
    os.makedirs(m4a_dir, exist_ok=True)
    print(f"Created directory: {m4a_dir}")
    if not os.path.exists(m4a_dir):
        print(f"Error: Failed to create {m4a_dir}")
        return False, []

    clear_workspace()
    print("Uploading files (select all 70 files, or upload in batches if prompted)...")
    uploaded = files.upload()
    print(f"Uploaded {len(uploaded)} files in first batch.")

    valid_files = []
    for filename in uploaded.keys():
        content_path = os.path.join("/content", filename)
        try:
            with open(content_path, "wb") as f:
                f.write(uploaded[filename])
            try:
                with open(content_path, "rb") as f:
                    f.read(1)
                print(f"Saved and readable: {filename}")
                valid_files.append(filename)
            except Exception as e:
                print(f"Warning: Cannot read {filename}: {e}")
        except Exception as e:
            print(f"Warning: Failed to save {filename}: {e}")

    while len(valid_files) < 70:
        print(f"Only {len(valid_files)} files uploaded. Need {70 - len(valid_files)} more.")
        response = input("Upload more files? (yes/no): ").strip().lower()
        if response != "yes":
            print("Proceeding with available files...")
            break
        print("Uploading additional batch...")
        uploaded = files.upload()
        print(f"Uploaded {len(uploaded)} files in additional batch.")
        for filename in uploaded.keys():
            content_path = os.path.join("/content", filename)
            try:
                with open(content_path, "wb") as f:
                    f.write(uploaded[filename])
                try:
                    with open(content_path, "rb") as f:
                        f.read(1)
                    print(f"Saved and readable: {filename}")
                    if filename not in valid_files:
                        valid_files.append(filename)
                except Exception as e:
                    print(f"Warning: Cannot read {filename}: {e}")
            except Exception as e:
                print(f"Warning: Failed to save {filename}: {e}")

    time.sleep(10)  # Delay for filesystem sync

    print(f"Total valid files in /content/: {len(valid_files)}")
    print("Files in /content/:")
    !ls -l /content/ | grep ".m4a"

    copied_files = []
    for filename in valid_files:
        content_path = os.path.join("/content", filename)
        dest_path = os.path.join(m4a_dir, filename)
        if not os.path.exists(content_path):
            print(f"Warning: Source file missing: {filename}")
            continue
        if os.path.exists(dest_path):
            print(f"Warning: Destination file exists: {dest_path}")
            copied_files.append(filename)
            continue
        print(f"Moving {filename} to {dest_path}")
        try:
            shutil.move(content_path, dest_path)
            print(f"Successfully moved {filename}")
            copied_files.append(filename)
        except Exception as e:
            print(f"Warning: Error moving {filename}: {e}")
            time.sleep(1)
            try:
                shutil.move(content_path, dest_path)
                print(f"Successfully moved {filename} on retry")
                copied_files.append(filename)
            except Exception as e:
                print(f"Skipping {filename} after retry: {e}")

    if not copied_files:
        print("No files moved. Using files in /content/ for conversion...")
        copied_files = valid_files.copy()

    print(f"Total files in {m4a_dir}: {len(copied_files)}")
    print("Verifying file structure:")
    !ls -l {m4a_dir}/* 2>/dev/null || echo "No files in directory"
    !ls -l /content/ | grep ".m4a" || echo "No .m4a files left in /content/"

    file_numbers = []
    for f in copied_files:
        match = re.match(r"New Recording (\d+)\.m4a", f)
        if match:
            file_numbers.append(int(match.group(1)))
        else:
            print(f"Skipping invalid file: {f}")

    expected_numbers = list(range(1, 71))
    if len(file_numbers) < 35:
        print(f"Warning: Too few files. Expected at least 35, got {len(file_numbers)}.")
        print("Detected file numbers:", sorted(file_numbers))
        print("Proceeding anyway...")
    if len(file_numbers) != 70:
        missing = sorted(set(expected_numbers) - set(file_numbers))
        extra = sorted(set(file_numbers) - set(expected_numbers))
        print(f"Warning: Expected 70 files, got {len(file_numbers)}.")
        print("Detected file numbers:", sorted(file_numbers))
        if missing:
            print("Missing files:", [f"New Recording {n}.m4a" for n in missing])
        if extra:
            print("Extra files:", [f"New Recording {n}.m4a" for n in extra])
        print("Continuing with available files...")

    return True, copied_files

In [16]:
# Step 2: Convert M4A to WAV and validate audio
def convert_m4a_to_wav(m4a_dir, data_dir, copied_files):
    print(f"Converting files and validating audio...")
    commands = ["startrecipe", "nextstep", "repeatstep", "timer", "substitute", "scale", "done"]
    command_counters = {cmd: 0 for cmd in commands}

    for filename in sorted(copied_files):
        if filename.endswith(".m4a"):
            match = re.match(r"New Recording (\d+)\.m4a", filename)
            if match:
                num = int(match.group(1))
                if 1 <= num <= 10:
                    cmd = "startrecipe"
                elif 11 <= num <= 20:
                    cmd = "nextstep"
                elif 21 <= num <= 30:
                    cmd = "repeatstep"
                elif 31 <= num <= 40:
                    cmd = "timer"
                elif 41 <= num <= 50:
                    cmd = "substitute"
                elif 51 <= num <= 60:
                    cmd = "scale"
                else:  # 61-70
                    cmd = "done"

                command_dir = os.path.join(data_dir, cmd)
                os.makedirs(command_dir, exist_ok=True)
                command_counters[cmd] += 1
                wav_filename = f"{cmd}_{command_counters[cmd]:02d}.wav"
                m4a_path = os.path.join(m4a_dir, filename)
                if not os.path.exists(m4a_path):
                    m4a_path = os.path.join("/content", filename)
                if not os.path.exists(m4a_path):
                    print(f"Warning: File missing: {filename}")
                    continue
                wav_path = os.path.join(command_dir, wav_filename)

                try:
                    audio = AudioSegment.from_file(m4a_path, format="m4a")
                    audio = audio.set_frame_rate(16000).set_channels(1)  # Mono, 16kHz
                    audio.export(wav_path, format="wav")
                    print(f"Converted {filename} to {wav_filename}")

                    # Validate audio
                    y, sr = librosa.load(wav_path, sr=16000)
                    if len(y) == 0 or np.max(np.abs(y)) < 1e-4:
                        print(f"Warning: {wav_filename} is silent or nearly silent.")
                    else:
                        print(f"Validated {wav_filename}: Duration={len(y)/sr:.2f}s, Max amplitude={np.max(np.abs(y)):.4f}")
                except Exception as e:
                    print(f"Warning: Failed to convert or validate {filename}: {e}")

In [17]:
# Step 3: Load the dataset
def load_audio_dataset(data_dir):
    try:
        dataset = load_dataset("audiofolder", data_dir=data_dir)
        print("Dataset loaded successfully.")
        # integer labels
        label_map = {
            "startrecipe": 0,
            "nextstep": 1,
            "repeatstep": 2,
            "timer": 3,
            "substitute": 4,
            "scale": 5,
            "done": 6
        }
        def assign_labels(example):
            folder_name = os.path.basename(os.path.dirname(example["audio"]["path"]))
            example["label"] = label_map[folder_name]
            return example
        dataset = dataset.map(assign_labels)
        print("Labels assigned:", set(dataset["train"]["label"]))
        return dataset
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

In [18]:
# Step 4: Preprocess the audio data
def preprocess_audio(dataset):
    try:
        processor = Wav2Vec2Processor.from_pretrained(
            "facebook/wav2vec2-base",
            tokenizer_class="Wav2Vec2CTCTokenizer"
        )
        feature_extractor = processor.feature_extractor
    except Exception as e:
        print(f"Error loading processor: {e}")
        return None

    augment = Compose([
        AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.02, p=0.5),
        PitchShift(min_semitones=-2, max_semitones=2, p=0.5),
        TimeStretch(min_rate=0.8, max_rate=1.2, p=0.5),
        Gain(min_gain_db=-6, max_gain_db=6, p=0.5)
    ])

    def preprocess_function(examples):
        audio_arrays = [x["array"] for x in examples["audio"]]
        # Normalize audio
        audio_arrays = [x / np.max(np.abs(x)) if np.max(np.abs(x)) > 0 else x for x in audio_arrays]
        # Apply augmentations
        augmented_arrays = [augment(samples=x, sample_rate=16000) for x in audio_arrays]
        inputs = feature_extractor(
            augmented_arrays,
            sampling_rate=16000,
            max_length=16000,  # Reduced for shorter commands
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        inputs["labels"] = examples["label"]
        return inputs

    encoded_dataset = dataset.map(preprocess_function, remove_columns=["audio"], batched=True)
    return encoded_dataset

In [19]:
# Step 5: Load the model
def load_model():
    model = AutoModelForAudioClassification.from_pretrained(
        "facebook/wav2vec2-base",
        num_labels=7,
        label2id={
            "startrecipe": 0,
            "nextstep": 1,
            "repeatstep": 2,
            "timer": 3,
            "substitute": 4,
            "scale": 5,
            "done": 6
        },
        id2label={str(i): label for i, label in enumerate(["startrecipe", "nextstep", "repeatstep", "timer", "substitute", "scale", "done"])}
    )
    model.classifier.dropout = torch.nn.Dropout(0.3)  # Add dropout
    return model

In [20]:
# Step 6: Define evaluation metrics
def compute_metrics(eval_pred):
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")

    predictions = np.argmax(eval_pred.predictions, axis=1)
    labels = eval_pred.label_ids

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")

    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"],
        "precision": precision["precision"],
        "recall": recall["recall"]
    }

In [23]:
# Step 7: Train and evaluate with k-fold cross-validation
def train_model(encoded_dataset):
    dataset = encoded_dataset["train"]
    from sklearn.model_selection import StratifiedKFold
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    labels = [dataset[i]["label"] for i in range(len(dataset))]

    fold_results = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(range(len(dataset)), labels)):
        print(f"\nTraining fold {fold + 1}/5...")
        train_dataset = dataset.select(train_idx)
        val_dataset = dataset.select(val_idx)

        training_args = TrainingArguments(
            output_dir=f"./results/fold_{fold}",
            num_train_epochs=20,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            eval_strategy="epoch",
            save_strategy="no",
            logging_steps=5,
            learning_rate=5e-5,
            lr_scheduler_type="cosine",
            load_best_model_at_end=False,
            metric_for_best_model="accuracy",
            logging_dir=f"./logs/fold_{fold}",
            weight_decay=0.01,
            gradient_accumulation_steps=2,
            label_smoothing_factor=0.1,  # Prevent numerical issues
            max_grad_norm=1.0  # Clip gradients
        )

        trainer = Trainer(
            model=load_model(),
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics
        )

        try:
            trainer.train()
        except Exception as e:
            print(f"Warning: Training failed in fold {fold + 1}: {e}")
            continue

        eval_results = trainer.evaluate()
        print(f"Fold {fold + 1} evaluation results:", eval_results)
        fold_results.append(eval_results)

    if fold_results:
        avg_results = {
            "eval_accuracy": np.mean([r["eval_accuracy"] for r in fold_results]),
            "eval_f1": np.mean([r["eval_f1"] for r in fold_results]),
            "eval_precision": np.mean([r["eval_precision"] for r in fold_results]),
            "eval_recall": np.mean([r["eval_recall"] for r in fold_results])
        }
        print("Average cross-validation results:", avg_results)
    else:
        print("No folds completed successfully.")

    return trainer

# Main function
def main():
    clear_disk_and_cache()

    m4a_dir = "/content/LL_final_project"
    data_dir = "/content/data"

    success, copied_files = organize_m4a_files(m4a_dir)
    if not copied_files:
        print("No files available. Cannot proceed.")
        return None

    convert_m4a_to_wav(m4a_dir, data_dir, copied_files)
    dataset = load_audio_dataset(data_dir)
    if dataset is None:
        return None
    encoded_dataset = preprocess_audio(dataset)
    if encoded_dataset is None:
        return None
    trainer = train_model(encoded_dataset)
    return trainer

if __name__ == "__main__":
    main()

Clearing disk space and Hugging Face cache...
Disk and cache cleared.
Creating flat directory...
Created directory: /content/LL_final_project
Clearing workspace...
Workspace cleared.
Uploading files (select all 70 files, or upload in batches if prompted)...


Saving New Recording 70.m4a to New Recording 70.m4a
Saving New Recording 69.m4a to New Recording 69.m4a
Saving New Recording 68.m4a to New Recording 68.m4a
Saving New Recording 67.m4a to New Recording 67.m4a
Saving New Recording 66.m4a to New Recording 66.m4a
Saving New Recording 65.m4a to New Recording 65.m4a
Saving New Recording 64.m4a to New Recording 64.m4a
Saving New Recording 63.m4a to New Recording 63.m4a
Saving New Recording 62.m4a to New Recording 62.m4a
Saving New Recording 61.m4a to New Recording 61.m4a
Saving New Recording 60.m4a to New Recording 60.m4a
Saving New Recording 59.m4a to New Recording 59.m4a
Saving New Recording 58.m4a to New Recording 58.m4a
Saving New Recording 57.m4a to New Recording 57.m4a
Saving New Recording 56.m4a to New Recording 56.m4a
Saving New Recording 55.m4a to New Recording 55.m4a
Saving New Recording 54.m4a to New Recording 54.m4a
Saving New Recording 53.m4a to New Recording 53.m4a
Saving New Recording 52.m4a to New Recording 52.m4a
Saving New R

Resolving data files:   0%|          | 0/70 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/70 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset loaded successfully.


Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Labels assigned: {0, 1, 2, 3, 4, 5, 6}


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]




Training fold 1/5...




pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.9408,1.927461,0.357143,0.261905,0.242857,0.357143
2,1.932,1.927186,0.214286,0.142857,0.119048,0.214286
3,1.8414,1.967477,0.214286,0.155102,0.12381,0.214286
4,1.8533,1.95657,0.214286,0.174603,0.163265,0.214286
5,1.7612,1.910063,0.285714,0.231293,0.242857,0.285714
6,1.6663,1.925874,0.214286,0.188095,0.214286,0.214286
7,1.6963,1.879345,0.285714,0.212245,0.171429,0.285714
8,1.6007,1.822809,0.285714,0.231293,0.242857,0.285714
9,1.4849,1.717955,0.357143,0.283673,0.242857,0.357143
10,1.3737,1.709336,0.571429,0.538095,0.547619,0.571429


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 1 evaluation results: {'eval_loss': 1.6553285121917725, 'eval_accuracy': 0.42857142857142855, 'eval_f1': 0.3666666666666667, 'eval_precision': 0.369047619047619, 'eval_recall': 0.42857142857142855, 'eval_runtime': 1.9957, 'eval_samples_per_second': 7.015, 'eval_steps_per_second': 2.004, 'epoch': 20.0}

Training fold 2/5...


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.9417,1.933253,0.142857,0.040816,0.02381,0.142857
2,1.9282,1.901986,0.142857,0.083333,0.059524,0.142857
3,1.872,1.888977,0.214286,0.1678,0.191837,0.214286
4,1.8455,1.88741,0.214286,0.171429,0.196429,0.214286
5,1.7663,1.881599,0.214286,0.171429,0.196429,0.214286
6,1.7169,1.833106,0.214286,0.171429,0.196429,0.214286
7,1.6833,1.799099,0.285714,0.17619,0.142857,0.285714
8,1.6752,1.783283,0.214286,0.143991,0.120408,0.214286
9,1.5078,1.76352,0.214286,0.15,0.119048,0.214286
10,1.4529,1.721424,0.285714,0.177778,0.136054,0.285714


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 2 evaluation results: {'eval_loss': 1.4685747623443604, 'eval_accuracy': 0.5, 'eval_f1': 0.4340136054421769, 'eval_precision': 0.4738095238095238, 'eval_recall': 0.5, 'eval_runtime': 2.2298, 'eval_samples_per_second': 6.278, 'eval_steps_per_second': 1.794, 'epoch': 20.0}

Training fold 3/5...


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.9448,1.952041,0.071429,0.019048,0.010989,0.071429
2,1.9327,1.941934,0.214286,0.112245,0.095238,0.214286
3,1.8788,1.946036,0.071429,0.047619,0.035714,0.071429
4,1.8823,1.923618,0.285714,0.193197,0.147619,0.285714
5,1.8271,1.912531,0.214286,0.145578,0.111905,0.214286
6,1.783,1.903953,0.071429,0.040816,0.028571,0.071429
7,1.768,1.868804,0.214286,0.153061,0.128571,0.214286
8,1.6727,1.832643,0.357143,0.328571,0.309524,0.357143
9,1.5766,1.808052,0.357143,0.352381,0.380952,0.357143
10,1.5033,1.749912,0.285714,0.261905,0.25,0.285714


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Fold 3 evaluation results: {'eval_loss': 1.5497591495513916, 'eval_accuracy': 0.5714285714285714, 'eval_f1': 0.5428571428571428, 'eval_precision': 0.5952380952380951, 'eval_recall': 0.5714285714285714, 'eval_runtime': 2.3657, 'eval_samples_per_second': 5.918, 'eval_steps_per_second': 1.691, 'epoch': 20.0}

Training fold 4/5...


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.9483,1.938683,0.142857,0.038095,0.021978,0.142857
2,1.9399,1.93329,0.214286,0.176871,0.2,0.214286
3,1.8894,1.86849,0.214286,0.209524,0.238095,0.214286
4,1.8701,1.829125,0.285714,0.209524,0.22619,0.285714
5,1.8042,1.788098,0.357143,0.304762,0.369048,0.357143
6,1.7877,1.732147,0.428571,0.37619,0.345238,0.428571
7,1.7465,1.685153,0.428571,0.342857,0.309524,0.428571
8,1.6467,1.622888,0.428571,0.42381,0.452381,0.428571
9,1.5618,1.604736,0.357143,0.3,0.261905,0.357143
10,1.4771,1.588221,0.5,0.457143,0.5,0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fold 4 evaluation results: {'eval_loss': 1.4003946781158447, 'eval_accuracy': 0.7142857142857143, 'eval_f1': 0.7000000000000001, 'eval_precision': 0.6904761904761905, 'eval_recall': 0.7142857142857143, 'eval_runtime': 2.2238, 'eval_samples_per_second': 6.295, 'eval_steps_per_second': 1.799, 'epoch': 20.0}

Training fold 5/5...


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.9518,1.923407,0.142857,0.035714,0.020408,0.142857
2,1.9482,1.910814,0.214286,0.095238,0.064286,0.214286
3,1.919,1.917766,0.142857,0.071429,0.05,0.142857
4,1.8968,1.909592,0.357143,0.297959,0.302381,0.357143
5,1.8485,1.900391,0.357143,0.215873,0.159864,0.357143
6,1.7857,1.883788,0.428571,0.404762,0.428571,0.428571
7,1.7192,1.891268,0.428571,0.4,0.416667,0.428571
8,1.6144,1.892384,0.285714,0.27381,0.309524,0.285714
9,1.557,1.86802,0.428571,0.402381,0.428571,0.428571
10,1.4671,1.834386,0.428571,0.461905,0.547619,0.428571


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Fold 5 evaluation results: {'eval_loss': 1.7855628728866577, 'eval_accuracy': 0.5, 'eval_f1': 0.4809523809523809, 'eval_precision': 0.5, 'eval_recall': 0.5, 'eval_runtime': 2.4105, 'eval_samples_per_second': 5.808, 'eval_steps_per_second': 1.659, 'epoch': 20.0}
Average cross-validation results: {'eval_accuracy': 0.5428571428571429, 'eval_f1': 0.5048979591836735, 'eval_precision': 0.5257142857142856, 'eval_recall': 0.5428571428571429}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
!zip -r /content/data.zip /content/data/
from google.colab import files
files.download("/content/data.zip")

  adding: content/data/ (stored 0%)
  adding: content/data/substitute/ (stored 0%)
  adding: content/data/substitute/substitute_06.wav (deflated 34%)
  adding: content/data/substitute/substitute_04.wav (deflated 31%)
  adding: content/data/substitute/substitute_10.wav (deflated 35%)
  adding: content/data/substitute/substitute_05.wav (deflated 33%)
  adding: content/data/substitute/substitute_08.wav (deflated 30%)
  adding: content/data/substitute/substitute_03.wav (deflated 30%)
  adding: content/data/substitute/substitute_01.wav (deflated 34%)
  adding: content/data/substitute/substitute_09.wav (deflated 30%)
  adding: content/data/substitute/substitute_07.wav (deflated 29%)
  adding: content/data/substitute/substitute_02.wav (deflated 33%)
  adding: content/data/timer/ (stored 0%)
  adding: content/data/timer/timer_02.wav (deflated 36%)
  adding: content/data/timer/timer_08.wav (deflated 30%)
  adding: content/data/timer/timer_05.wav (deflated 37%)
  adding: content/data/timer/timer

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>