In [1]:
!pip install transformers
!pip install librosa
!pip install torch
import transformers
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import librosa
import librosa.display


[0m

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!pip install standard-aifc standard-sunau

[0m

In [198]:
class StutterCNN(nn.Module):
    def __init__(self):
        super(StutterCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(64*32*25, 128)  # Adjust based on new input shape
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(128, 2)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        #print(x.shape)
        x = self.pool(torch.relu(self.conv2(x)))
        #print(x.shape)
        x = x.view(x.size(0), -1)
        #print(x.shape)
        x = torch.relu(self.fc1(x))
        #print(x.shape)
        x = self.dropout(x)
        #print(x.shape)
        x = self.fc2(x)
        return x

In [199]:
class StutterDataset(Dataset):
    def __init__(self, max_pad_length=100):
        self.path = '../data/clips/stuttering-clips/clips/'        
        self.df = pd.read_csv('../data/normalized_data.csv').sample(frac=1).reset_index(drop=True)
        
    def extract_features(self, file_path, max_pad_length=100):
        y, sr = librosa.load(file_path, sr=16000)
        if len(y) == 0:
            #print(f"Warning: {file_path} is empty. Skipping.")
            return None
        
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
            
        zcr = librosa.feature.zero_crossing_rate(y)  # Detects blocking

        spectral_flatness = librosa.feature.spectral_flatness(y=y)  # Detects prolongation

        rms = librosa.feature.rms(y=y)
        
        features = np.vstack([mel_spec_db, zcr, spectral_flatness, rms])
        if features.shape[1] > max_pad_length:
            features = features[:, :max_pad_length]
        else:
            pad_width = max_pad_length - features.shape[1]
            features = np.pad(features, ((0, 0), (0, pad_width)), mode='constant') 
        return torch.tensor(features, dtype=torch.float32).unsqueeze(0)
    
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        data = self.df.iloc[idx]
        
        return self.extract_features(self.path+data.fil_name), data.label


In [103]:
dataset = StutterDataset()

dataset.__getitem__(0)

(tensor([[[-5.0005e+01, -5.5248e+01, -5.5482e+01,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [-4.4287e+01, -4.6208e+01, -5.2384e+01,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [-4.1035e+01, -4.0995e+01, -4.2649e+01,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          ...,
          [ 3.7109e-02,  5.1270e-02,  6.7871e-02,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 2.0582e-02,  1.7694e-02,  1.6451e-02,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [ 3.0891e-03,  3.5823e-03,  4.0877e-03,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00]]]),
 np.int64(0))

In [200]:
def train_model(model, train_loader, criterion, optimizer, epochs=1):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            #print(inputs.shape)
            inputs, labels = inputs.to('cpu', dtype=torch.float32), labels.to('cpu')
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}')

def evaluate_model(model, test_loader):
    model.eval()
    correct, total = 0, 0
    stutter_count=0
    non_stutter_count=0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to('cpu', dtype=torch.float32), labels.to('cpu')
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            for val in predicted:
                if val==1:
                    stutter_count+=1
                else:
                    non_stutter_count+=1
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f'Test Accuracy: {100 * correct / total:.2f}%')
    print(f'stutters predicted: {stutter_count}, non stutters: {non_stutter_count}')


In [201]:
dataset = StutterDataset()
train_size = int(0.8 * len(dataset))

print(train_size, len(dataset))

test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = StutterCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_model(model, train_loader, criterion, optimizer,10)
evaluate_model(model, test_loader)

2188 2735
Epoch 1, Loss: 3.7145
Epoch 2, Loss: 0.6583
Epoch 3, Loss: 0.6577
Epoch 4, Loss: 0.6557
Epoch 5, Loss: 0.6537
Epoch 6, Loss: 0.6534
Epoch 7, Loss: 0.6550
Epoch 8, Loss: 0.6537
Epoch 9, Loss: 0.6529
Epoch 10, Loss: 0.6514
Test Accuracy: 63.07%
stutters predicted: 1, non stutters: 546


In [202]:
torch.save(model.state_dict(), './stutter_cnn')

In [203]:
model2 = StutterCNN()
model2.load_state_dict(torch.load('./stutter_cnn'))

<All keys matched successfully>

In [150]:
df = pd.read_csv('../data/processed_data.csv')

In [151]:
row = df.iloc[7509]
path = '../data/clips/stuttering-clips/clips/'

In [122]:
def extract_features(file_path, max_pad_length=100):
        y, sr = librosa.load(file_path, sr=16000)
        if len(y) == 0:
            #print(f"Warning: {file_path} is empty. Skipping.")
            return None
        
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
            
        zcr = librosa.feature.zero_crossing_rate(y)  # Detects blocking

        spectral_flatness = librosa.feature.spectral_flatness(y=y)  # Detects prolongation

        rms = librosa.feature.rms(y=y)
        
        features = np.vstack([mel_spec_db, zcr, spectral_flatness, rms])
        if features.shape[1] > max_pad_length:
            features = features[:, :max_pad_length]
        else:
            pad_width = max_pad_length - features.shape[1]
            features = np.pad(features, ((0, 0), (0, pad_width)), mode='constant') 
        return torch.tensor(features, dtype=torch.float32).unsqueeze(0)

In [152]:
features = extract_features(path+row.fil_name)

In [153]:
features.shape

torch.Size([1, 131, 100])

In [156]:
features = features.unsqueeze(0)
features.shape

torch.Size([1, 1, 131, 100])

In [157]:
result = model(features)

In [158]:
_, predicted = torch.max(result, 1)


In [206]:
int(predicted[0])

0

# Wav2Vec

In [163]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

In [172]:
type(librosa.load(path+row.fil_name))

tuple

In [None]:
class Wav2Vec2StutterDataset(Dataset):
    def __init__(self, processor, max_pad_length=100):
        self.path = '../data/clips/stuttering-clips/clips/'        
        self.df = pd.read_csv('../data/normalized_data.csv').sample(frac=1).reset_index(drop=True)
        
    #def extract_features(self, file_path, max_pad_length=100):
    #    y, sr = librosa.load(file_path, sr=16000)
    #    if len(y) == 0:
    #        #print(f"Warning: {file_path} is empty. Skipping.")
    #        return None
    #    
    #    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
    #    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    #        
    #    zcr = librosa.feature.zero_crossing_rate(y)  # Detects blocking
#
    #    spectral_flatness = librosa.feature.spectral_flatness(y=y)  # Detects prolongation
#
    #    rms = librosa.feature.rms(y=y)
    #    
    #    features = np.vstack([mel_spec_db, zcr, spectral_flatness, rms])
    #    if features.shape[1] > max_pad_length:
    #        features = features[:, :max_pad_length]
    #    else:
    #        pad_width = max_pad_length - features.shape[1]
    #        features = np.pad(features, ((0, 0), (0, pad_width)), mode='constant') 
    #    return torch.tensor(features, dtype=torch.float32).unsqueeze(0)
    
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        data = self.df.iloc[idx]
        
        return librosa.load(self.path+data.fil_name)[0], data.label


In [166]:
class Wav2VecStutterClassifier(nn.Module):
    def __init__(self, model_name="facebook/wav2vec2-base-960h"):
        super(Wav2VecStutterClassifier, self).__init__()
        self.wav2vec = Wav2Vec2Model.from_pretrained(model_name)
        self.fc = nn.Linear(self.wav2vec.config.hidden_size, 2)  # Binary classification
    
    def forward(self, x):
        x = self.wav2vec(x).last_hidden_state  # Extract embeddings
        x = x.mean(dim=1)  # Average pooling over time
        x = self.fc(x)
        return x

In [167]:
dataset = Wav2Vec2StutterDataset(processor = processor)
train_size = int(0.8 * len(dataset))

print(train_size, len(dataset))

test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Wav2VecStutterClassifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_model(model, train_loader, criterion, optimizer,5)
evaluate_model(model, test_loader)

1564 1956


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: stack expects each tensor to be equal size, but got [42410] at entry 0 and [66150] at entry 1

In [168]:
model = Wav2VecStutterClassifier()

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [174]:
!pip install torchaudio

Collecting torchaudio
  Downloading torchaudio-2.6.0-cp313-cp313-manylinux1_x86_64.whl.metadata (6.6 kB)
Downloading torchaudio-2.6.0-cp313-cp313-manylinux1_x86_64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchaudio
Successfully installed torchaudio-2.6.0
[0m

In [175]:
import torch
import torchaudio

In [176]:
bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H

print("Sample Rate:", bundle.sample_rate)

print("Labels:", bundle.get_labels())

Sample Rate: 16000
Labels: ('-', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z')


In [177]:
model = bundle.get_model().to('cpu')
waveform, sr = torchaudio.load(path+row.fil_name)

Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth" to /root/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960_asr_ls960.pth
100%|██████████| 360M/360M [00:03<00:00, 116MB/s]  


In [180]:
if sr != bundle.sample_rate:
    waveform = torchaudio.functional.resample(waveform, sr, bundle.sample_rate)

In [187]:
with torch.inference_mode():
    features, _ = model.extract_features(waveform, num_layers=1)

In [197]:
len(features[0][0][0])

768

# XGBOOS

In [None]:
import librosa
import numpy as np

# Load audio file
file_path = 'audio_file.wav'
y, sr = librosa.load(file_path, sr=None)  # y: audio time series, sr: sample rate

# Generate Mel-spectrogram
n_mels = 128  # Number of Mel bands
n_fft = 2048  # Number of FFT points
hop_length = 512  # Hop length
mel_spectrogram = librosa.feature.melspectrogram(y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)

# Convert to log scale (log Mel spectrogram)
log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)

# Flatten the spectrogram to create a feature vector
features = log_mel_spectrogram.flatten()


In [231]:
norm_df = pd.read_csv('../data/normalized_data.csv')

In [232]:
norm_df.head()

Unnamed: 0.1,Unnamed: 0,Show,label,fil_name
0,1644,StutterTalk,0,StutterTalk_76_1.wav
1,4792,StutteringIsCool,1,StutteringIsCool_8_30.wav
2,561,WomenWhoStutter,0,WomenWhoStutter_21_179.wav
3,703,StrongVoices,0,StrongVoices_32_158.wav
4,3645,StutterTalk,1,StutterTalk_32_5.wav


In [210]:
norm_df.shape

(2735, 4)

In [216]:
2735/50

54.7

In [220]:
2737*.75

2052.75

In [222]:
2735-2050

685

In [224]:
2050/50

41.0

In [233]:
norm_df=norm_df.sample(frac=1).reset_index(drop=True)

In [234]:
norm_df.head()

Unnamed: 0.1,Unnamed: 0,Show,label,fil_name
0,1287,StutterTalk,0,StutterTalk_0_20.wav
1,1123,HeStutters,0,HeStutters_12_28.wav
2,3012,StrongVoices,1,StrongVoices_34_34.wav
3,639,StutterTalk,0,StutterTalk_6_112.wav
4,1066,FluencyBank,0,FluencyBank_179_21.wav


In [294]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
classifier2 = RandomForestClassifier(n_estimators=50, warm_start=True, class_weight='balanced')
pca = PCA(n_components=50, random_state=42)

In [239]:
path = '../data/clips/stuttering-clips/clips/'

In [295]:
X_test=[]
y_test=[]
for index in range(2050, norm_df.shape[0]):
    #print(index)
    row = norm_df.iloc[index]
    
    file_path = path + row.fil_name
    label = row.label
    
    y, sr = librosa.load(file_path, sr=16000)
        
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    
    features= mel_spec_db.flatten()
    X_test.append(features)
    y_test.append(y_test)

In [296]:
X_test_reduced = pca.fit_transform(X_test[:64])

In [297]:
X_val = X_test_reduced[:50]
y_val = y_test[:50]



In [289]:
X_val.shape

(50, 50)

In [298]:
for i in range(41):
    start_index = i * 50
    end_index = start_index + 50
    X_train=[]
    y_train=[]
    for j in range(start_index, end_index):
        #print(j)
        file_path = path + norm_df.iloc[j].fil_name
        label = norm_df.iloc[j].label
        
        y, sr = librosa.load(file_path, sr=16000)
            
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        
        features= mel_spec_db.flatten()

        #print(mel_spec.shape)
        X_train.append(features)
        y_train.append(label)
        #print(start_index, end_index)
        
    X_train_reduced = pca.fit_transform(X_train)
    print(X_train_reduced.shape)
    #print(X_train[0].shape, type(X_train), type(X_test))
        
    classifier2.fit(X_train_reduced, y_train)
    print(classifier2.score(X_val, y_val))
    

(50, 50)


  warn(


KeyboardInterrupt: 

In [301]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import torch

# Load processor and model
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Adjust num_labels for your task

# Send model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


In [302]:
from peft import LoraConfig, get_peft_model

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,   # Low-rank dimension
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout for regularization
    bias="none",
    target_modules=["q_proj", "v_proj"]  # Apply LoRA to attention layers
)

# Wrap the wav2vec2 model with LoRA
model = get_peft_model(model, lora_config)

# Print model to check LoRA integration
model.print_trainable_parameters()


trainable params: 294,912 || all params: 94,864,002 || trainable%: 0.3109


In [303]:
norm_df.head()

Unnamed: 0.1,Unnamed: 0,Show,label,fil_name
0,1287,StutterTalk,0,StutterTalk_0_20.wav
1,1123,HeStutters,0,HeStutters_12_28.wav
2,3012,StrongVoices,1,StrongVoices_34_34.wav
3,639,StutterTalk,0,StutterTalk_6_112.wav
4,1066,FluencyBank,0,FluencyBank_179_21.wav


In [304]:
norm_df.shape

(2735, 4)

In [305]:
X_train=[]
y_train=[]

In [306]:
pca = PCA(n_components=50, random_state=42)

In [307]:
for j in range(0, 100):
        #print(j)
    file_path = path + norm_df.iloc[j].fil_name
    label = norm_df.iloc[j].label
    
    y, sr = librosa.load(file_path, sr=16000)
        
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    
    features= mel_spec_db.flatten()

    #print(mel_spec.shape)
    X_train.append(features)
    y_train.append(label)
        #print(start_index, end_index)
        
X_train_reduced = pca.fit_transform(X_train)

In [308]:
X_test=[]
y_test=[]

In [309]:
for j in range(101, 150):
        #print(j)
    file_path = path + norm_df.iloc[j].fil_name
    label = norm_df.iloc[j].label
    
    y, sr = librosa.load(file_path, sr=16000)
        
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    
    features= mel_spec_db.flatten()

    #print(mel_spec.shape)
    X_test.append(features)
    y_test.append(label)
        #print(start_index, end_index)
        
X_test_reduced = pca.fit_transform(X_train)

In [323]:
type(X_train_reduced)

numpy.ndarray

In [321]:
from torch.utils.data import DataLoader, TensorDataset
dataset = TensorDataset(X_train_reduced, y_train)

TypeError: 'int' object is not callable

In [None]:
from transformers import TrainingArguments, Trainer

# Define training arguments
training_args = TrainingArguments(
    output_dir="./wav2vec2-lora",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=2,
    logging_dir="./logs",
    fp16=True  # Enable mixed precision training
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

# Train the model
trainer.train()


In [325]:
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

In [326]:
model.classifier.out_proj = torch.nn.Linear(X_train_reduced.shape[1], 2)

In [327]:
loss_fn = torch.nn.CrossEntropyLoss()

In [332]:
X_train_reduced = torch.tensor(X_train_reduced, dtype=torch.float32)
type(y_train)

  X_train_reduced = torch.tensor(X_train_reduced, dtype=torch.float32)


list

In [333]:
X_train_reduced.dtype

torch.float32

In [334]:
for epoch in range(2):
    optimizer.zero_grad()
    logits=model(X_train_reduced).logits
    loss = loss_fn(logits, y_train )
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}: Loss = {loss.item()}")

RuntimeError: Calculated padded input size per channel: (1). Kernel size: (3). Kernel size can't be greater than actual input size