In [1]:
import os
import librosa
import soundfile as sf
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import accuracy_score
from transformers import Wav2Vec2Processor, Wav2Vec2Model

In [2]:
# Optional resampling backends: torchaudio is preferred for speed/quality,
# librosa is the fallback. Each backend is probed independently so that BOTH
# availability flags are always defined, whichever import succeeds.
# `except Exception` (rather than ImportError) is deliberate: a present but
# broken install should also be treated as "not available".
try:
    import torchaudio
    TORCHAUDIO_AVAILABLE = True
except Exception:
    TORCHAUDIO_AVAILABLE = False

try:
    import librosa
    LIBROSA_AVAILABLE = True
except Exception:
    LIBROSA_AVAILABLE = False

In [3]:
# Load the processor and the encoder once at notebook level (do not reload for
# each file); the feature-extraction function below takes them as defaults.
# Choose a checkpoint appropriate for your needs; "facebook/wav2vec2-base" is a common base model.
# NOTE(review): the recorded output of this cell warns that the
# `pos_conv_embed.conv.parametrizations.weight.original0/1` weights were newly
# initialized — confirm whether that is only a parameter-name mismatch between
# the checkpoint and the installed transformers/torch versions.
MODEL_NAME = "facebook/wav2vec2-base"  # or "facebook/wav2vec2-base-960h", or a finetuned checkpoint
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2Model.from_pretrained(MODEL_NAME)
# Inference only: switch to eval mode (the model contains Dropout layers).
# As the last expression of the cell this also prints the module tree.
model.eval()



Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

In [4]:
# Run on the GPU when CUDA is present, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Move the encoder's weights to the selected device.
model.to(device)


Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

In [5]:
def _resample_waveform(waveform: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    if orig_sr == target_sr:
        return waveform
    if TORCHAUDIO_AVAILABLE:
        # waveform shape expected: (channels, time). Our waveform is 1D mono so expand dims
        wav_t = torch.from_numpy(waveform).float().unsqueeze(0)  # (1, time)
        resampler = torchaudio.transforms.Resample(orig_freq=orig_sr, new_freq=target_sr)
        wav_resampled = resampler(wav_t)
        return wav_resampled.squeeze(0).numpy()
    elif LIBROSA_AVAILABLE:
        # librosa.resample expects float32
        return librosa.resample(waveform.astype("float32"), orig_sr, target_sr)
    else:
        raise RuntimeError(
            "No resampler available: install torchaudio or librosa to enable resampling."
        )

In [6]:
def extract_features(file_path: str, model_processor=processor, model_w2v=model, device=device) -> np.ndarray:
    """
    Extract a fixed-size wav2vec2 embedding for one audio file.

    The file is read with soundfile, down-mixed to mono, resampled to the
    processor's sampling rate, run through the encoder and mean-pooled over
    the time axis.

    Args:
        file_path: Path to an audio file readable by ``soundfile``.
        model_processor: Processor that builds the model inputs. Defaults to
            the notebook-level ``processor`` (bound when this cell is run).
        model_w2v: wav2vec2 encoder. Defaults to the notebook-level ``model``.
        device: Torch device the inputs are moved to. The model is not moved
            here, so it must already be on that device.

    Returns:
        1D numpy array of size = model.hidden_size (e.g. 768).

    Raises:
        ValueError: If the file contains no audio samples.
    """
    # 1) Load audio; shape is (n_samples,) or (n_samples, channels).
    audio, sr = sf.read(file_path)
    # Multi-channel input: average the channels to get a mono signal.
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)
    if audio.size == 0:
        # Fail with a clear message instead of an obscure error deep inside the model.
        raise ValueError(f"Audio file contains no samples: {file_path}")

    # 2) Resample if needed to the processor's sampling rate.
    target_sr = model_processor.feature_extractor.sampling_rate
    if sr != target_sr:
        audio = _resample_waveform(audio, sr, target_sr)
        sr = target_sr

    # 3) Preprocess into a batch of one.
    inputs = model_processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
    input_values = inputs.input_values.to(device)  # (batch=1, n_samples)
    # Not every checkpoint's processor returns an attention mask.
    attention_mask = inputs.attention_mask.to(device) if "attention_mask" in inputs else None

    # 4) Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = model_w2v(input_values, attention_mask=attention_mask)
    last_hidden_state = outputs.last_hidden_state  # (batch, n_frames, hidden_size)

    # 5) Mean-pool over time.
    # The processor's attention mask is per input SAMPLE, while the hidden
    # states are per encoder FRAME (the conv front-end downsamples), so the
    # mask cannot be multiplied with the hidden states directly. Convert it to
    # a frame-level mask with the model's own helper first.
    to_frame_mask = getattr(model_w2v, "_get_feature_vector_attention_mask", None)
    if attention_mask is not None and to_frame_mask is not None:
        frame_mask = to_frame_mask(last_hidden_state.shape[1], attention_mask)  # (batch, n_frames)
        mask = frame_mask.unsqueeze(-1).to(last_hidden_state.dtype)             # (batch, n_frames, 1)
        summed = (last_hidden_state * mask).sum(dim=1)                          # (batch, hidden)
        lengths = mask.sum(dim=1)                                               # valid frames -> (batch, 1)
        mean_pooled = summed / lengths.clamp(min=1e-9)
    else:
        # A single clip is never padded, so the plain mean is exact here.
        mean_pooled = last_hidden_state.mean(dim=1)  # (batch, hidden_size)

    # Drop the batch axis and hand back a CPU numpy vector.
    return mean_pooled.squeeze(0).cpu().numpy()


In [7]:
# Smoke test: embed a single file and check the length of the returned vector.
# NOTE(review): the path is absolute and machine-specific, and the recorded run
# of this cell ended with "AttributeError: module 'tensorflow' has no attribute
# 'Tensor'" — the Python environment should be checked before relying on it.
example_file = "/Users/shanoonissaka/Documents/school/thesis-project/datasets/audio/for-norm/testing/fake/file1_wav_16k_wav_norm_wav_mono_wav_silence.wav"
emb = extract_features(example_file)
print("Embedding shape:", emb.shape) 

AttributeError: module 'tensorflow' has no attribute 'Tensor'

In [3]:
def extract_features(file_path):
    """Return the time-averaged MFCC vector of an audio file.

    The file is loaded at its native sampling rate (``sr=None``), 13 MFCCs are
    computed per frame, and the frames are averaged into one value per
    coefficient.

    NOTE(review): this re-uses the name of the wav2vec2 ``extract_features``
    defined earlier in the notebook; whichever cell ran last wins.
    """
    signal, native_sr = librosa.load(file_path, sr=None)
    coeffs = librosa.feature.mfcc(y=signal, sr=native_sr, n_mfcc=13)
    # Transpose to (frames, coefficients) and average over the frames.
    frames_first = coeffs.T
    return frames_first.mean(axis=0)

In [4]:
# Root folder that holds the `real/` and `fake/` sub-folders of .wav files.
# Set the AUDIO_DATA_DIR environment variable to run the notebook on another
# machine; the default keeps the original location.
AUDIO_DATA_DIR = os.environ.get(
    "AUDIO_DATA_DIR",
    "/Users/shanoonissaka/Documents/school/thesis-project/datasets/audio/for-norm/testing",
)
# Directories of genuine and spoofed recordings.
real_audio_path = os.path.join(AUDIO_DATA_DIR, "real")
fake_audio_path = os.path.join(AUDIO_DATA_DIR, "fake")


In [5]:
def _list_wav_files(directory):
    """Return the full paths of the ``.wav`` files directly inside ``directory``.

    The names are sorted because ``os.listdir`` returns entries in arbitrary
    order; a stable order is needed for the seeded train/test split to select
    the same files on every run.
    """
    return [
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.endswith('.wav')
    ]


# Gather all the .wav files from the two class directories.
real_audio_files = _list_wav_files(real_audio_path)
fake_audio_files = _list_wav_files(fake_audio_path)


In [6]:
# Start the dataset with the genuine recordings: one feature vector per file,
# each labelled 0 (real audio).
data = [extract_features(wav_path) for wav_path in real_audio_files]
labels = [0 for _ in data]

In [7]:
# Add the spoofed recordings, labelled 1 (fake audio), after the real ones.
# The lists are rebuilt from their real-only prefix instead of appended to in
# place, so re-running this cell cannot add the fake samples a second time
# (duplicates could otherwise land on both sides of the train/test split).
# Features are computed first so a failure leaves `data`/`labels` untouched.
n_real = len(real_audio_files)
fake_features = [extract_features(file) for file in fake_audio_files]
data = data[:n_real] + fake_features
labels = labels[:n_real] + [1] * len(fake_features)

In [8]:
# Convert the collected per-file feature vectors and class ids to numpy arrays
# (X has one row per file when all feature vectors share the same length).
X, y = np.array(data), np.array(labels)

In [9]:
# # save to CSV
# import pandas as pd
# df = pd.DataFrame(X)
# df['label'] = y
# df.to_csv('/Users/shanoonissaka/Documents/school/thesis-project/code/audio-detect/data/features/audio_features_labels.csv', index=False)

In [10]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# repeatable for a given ordering of X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [11]:
# Benchmark LazyPredict's bundled classifiers on the held-out split.
# `models` is the per-model score table; `predictions` is the second value
# returned by `fit`.
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

 97%|█████████▋| 28/29 [00:02<00:00,  9.65it/s]

[LightGBM] [Info] Number of positive: 1916, number of negative: 1791
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000268 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3315
[LightGBM] [Info] Number of data points in the train set: 3707, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.516860 -> initscore=0.067466
[LightGBM] [Info] Start training from score 0.067466


100%|██████████| 29/29 [00:03<00:00,  8.57it/s]


In [12]:
# Show the score table through the notebook's rich display (a bare last
# expression) instead of print(), which wraps wide frames across lines.
# NOTE(review): the recorded run shows 1.00 on every metric for every listed
# model — check for duplicated clips or train/test leakage before trusting it.
models

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
KNeighborsClassifier               1.00               1.00     1.00      1.00   
SVC                                1.00               1.00     1.00      1.00   
QuadraticDiscriminantAnalysis      1.00               1.00     1.00      1.00   
ExtraTreesClassifier               1.00               1.00     1.00      1.00   
AdaBoostClassifier                 1.00               1.00     1.00      1.00   
SGDClassifier                      1.00               1.00     1.00      1.00   
PassiveAggressiveClassifier        1.00               1.00     1.00      1.00   
LogisticRegression                 1.00               1.00     1.00      1.00   
LinearSVC                          1.00               1.00     1.00      1.00   
CalibratedClassifierCV             1.00               1.00     1.00      1.00   
Perceptron                  