In [5]:
import os
import sys
import soundfile
import random
import pandas as pd
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import Dataset
from sklearn.model_selection import KFold
from peft import LoraConfig, get_peft_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### Load Charsiu model

## From Fiona's code
# Location of the charsiu package checkout. Set the CHARSIU_DIR environment
# variable to use a different location; when it is unset, the original
# lab-computer path is used.
charsiu_dir = os.environ.get(
    'CHARSIU_DIR', '/Users/cogsci-lasrlab1/Downloads/charsiu-main'
)

# NOTE: this changes the working directory of the whole kernel, so every
# relative path used after this point is resolved against charsiu_dir.
os.chdir(charsiu_dir)

# Make the package's src/ folder importable; the membership check keeps
# sys.path from collecting a duplicate entry each time this cell is re-run.
charsiu_src = os.path.join(charsiu_dir, 'src')
if charsiu_src not in sys.path:
    sys.path.append(charsiu_src)

from Charsiu import charsiu_forced_aligner



NameError: name 'os' is not defined

In [None]:
# Set random seed (one value shared by Python's and PyTorch's generators)
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)

# Load participant sets CSV and keep the lower-cased IDs of every row whose
# 'set' column reads 'training' (case-insensitive)
participant_df = pd.read_csv("participant_sets.csv")
is_training_row = participant_df['set'].str.lower() == 'training'
training_ids = participant_df.loc[is_training_row, 'ParticipantID'].str.lower().tolist()

# Load csv of incorrect utterances and build the set of lower-cased file names
# whose 'Subject inclusion' column reads 'include' (case-insensitive).
# Rows without a file name are dropped first: a missing value is read as a
# float NaN, which has no .lower() and cannot match any file on disk anyway.
inclusion_df = pd.read_csv("incorrect_utterances.csv")
is_included_row = inclusion_df['Subject inclusion'].str.lower() == 'include'
included_files = {
    name.lower()
    for name in inclusion_df.loc[is_included_row, 'Filename'].dropna()
}

In [None]:
# Extract info from KT1 files: walk every sub-folder of base_dir, keep the
# .wav recordings that belong to a training participant and are listed in
# included_files, and write a matching .txt transcript next to each kept file.
base_dir = "KT1"
data = []
excluded_count = 0
malformed_count = 0

# Set membership keeps the per-file participant check cheap
training_id_set = set(training_ids)

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of df the same on every run and machine.
for folder in sorted(os.listdir(base_dir)):
    folder_path = os.path.join(base_dir, folder)
    if not os.path.isdir(folder_path):
        continue

    for file in sorted(os.listdir(folder_path)):
        if not (file.endswith('.wav') and "participant_" in file):
            continue

        base = file[:-4]  # remove '.wav'

        # Expect exactly one 'participant_' separator: <prefix>participant_<word>
        parts = base.split("participant_")
        if len(parts) != 2:
            # Name cannot be split unambiguously; skip it but keep a tally
            # instead of dropping it silently.
            malformed_count += 1
            continue
        before, word = parts

        # Strip 'K1' and drop the last character of the prefix (per the
        # original note, that trailing character is the researcher number).
        ppt_id = before.replace("K1", "")[:-1]

        # Files from participants outside the training set are ignored
        if ppt_id.lower() not in training_id_set:
            continue

        # Training-participant files that are not on the inclusion list are
        # counted as excluded
        if file.lower() not in included_files:
            excluded_count += 1
            continue

        data.append({
            "file_name": os.path.join(folder_path, file),
            "ppt_id": ppt_id,
            "transcription": word
        })

        # Write .txt file holding the transcription next to the recording
        txt_path = os.path.join(folder_path, f"{base}.txt")
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(word)

# Convert to DataFrame; naming the columns keeps them present even when no
# file was kept.
df = pd.DataFrame(data, columns=["file_name", "ppt_id", "transcription"])

# Report
print(f"{excluded_count} files were excluded due to incorrect word.")
print(f"{len(df)} files included.")
if malformed_count:
    print(f"{malformed_count} files were skipped because their name could not be parsed.")

FileNotFoundError: [Errno 2] No such file or directory: 'KT1'

In [7]:
# Pull the pretrained CTC model and its processor from the same checkpoint
checkpoint_id = "facebook/wav2vec2-base-960h"
base_model = Wav2Vec2ForCTC.from_pretrained(checkpoint_id)
processor = Wav2Vec2Processor.from_pretrained(checkpoint_id)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# LoRA hyperparameters, gathered in one mapping before the config is built
lora_settings = {
    "r": 8,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    "bias": "none",
    "target_modules": ["q_proj", "v_proj"],
}
lora_config = LoraConfig(**lora_settings)

# Attach the LoRA adapters to the base Wav2Vec2 model and report how many
# parameters remain trainable
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 94,691,232 || trainable%: 0.3114


In [1]:
def preprocess(batch):
    """Turn one dataset row into model inputs and CTC label ids.

    Args:
        batch: a single row with a "file_name" key (path to an audio file)
            and a "transcription" key (the spoken text as a string).

    Returns:
        dict with "input_values" (1-D processed audio) and "labels" (list of
        token ids). "attention_mask" is added only when the processor returns
        one.
    """
    waveform, sr = torchaudio.load(batch["file_name"])

    # torchaudio.load yields (channels, frames); the processor expects a single
    # 1-D signal per example, so average the channels down to mono.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform = resampler(waveform)

    inputs = processor(waveform.numpy(), sampling_rate=16000)

    # Call the tokenizer directly rather than through the deprecated
    # as_target_processor() context manager.
    # NOTE(review): the text is upper-cased because the
    # facebook/wav2vec2-base-960h vocabulary only has upper-case letters, so
    # lower-case characters would all become <unk>. Revisit this if the
    # checkpoint is swapped for one with a lower-case vocabulary.
    labels = processor.tokenizer(batch["transcription"].upper()).input_ids

    example = {
        "input_values": inputs["input_values"][0],
        "labels": labels
    }
    # Some feature extractors (this checkpoint's included, by default) return
    # no attention mask; indexing it unconditionally raises KeyError.
    if "attention_mask" in inputs:
        example["attention_mask"] = inputs["attention_mask"][0]
    return example

In [None]:
# Convert data from pandas dataframe to transformers dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.map(preprocess, remove_columns=["file_name", "ppt_id", "transcription"])

NameError: name 'Dataset' is not defined

In [None]:
from transformers import TrainingArguments, Trainer


def ctc_data_collator(features):
    """Pad a list of preprocessed examples into one CTC training batch.

    Audio and labels have different lengths from example to example, so each
    is padded separately: audio with the processor's feature extractor, labels
    with its tokenizer. Padded label positions are set to -100 so the loss
    ignores them.

    Args:
        features: list of dicts, each with "input_values" and "labels".

    Returns:
        The padded batch (PyTorch tensors) with a "labels" entry added.
    """
    audio_features = [{"input_values": f["input_values"]} for f in features]
    label_features = [{"input_ids": f["labels"]} for f in features]

    batch = processor.feature_extractor.pad(
        audio_features, padding=True, return_tensors="pt"
    )
    label_batch = processor.tokenizer.pad(
        label_features, padding=True, return_tensors="pt"
    )
    batch["labels"] = label_batch["input_ids"].masked_fill(
        label_batch["attention_mask"].ne(1), -100
    )
    return batch


training_args = TrainingArguments(
    output_dir="./lora_charsiu_finetuned",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # No eval_dataset is handed to the Trainer below, so evaluation is left at
    # its default (disabled) instead of being requested every epoch.
    save_strategy="no",
    learning_rate=5e-4,
    weight_decay=0.01,
    num_train_epochs=1,
    # Mixed precision via fp16 needs a CUDA device; fall back to full
    # precision when none is available.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=8,
    logging_steps=10,
    logging_dir="./logs",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=ctc_data_collator,
    tokenizer=processor
)

In [None]:
# Run fine-tuning, then write the model and the processor to the same folder
trainer.train()
for artifact in (model, processor):
    artifact.save_pretrained("./lora_charsiu_finetuned")