## extracting mfcc features of audio

In [1]:
import librosa
import numpy as np

In [2]:
def mfcc_extraction(path):
    x,sr=librosa.load(path)
    mfcc=np.mean(librosa.feature.mfcc(y=x, sr=sr).T, axis=0)
    return mfcc
path=r"C:\Users\U$ER\Documents\audio processing\check_data\checkaudio.wav"

In [3]:
features=mfcc_extraction(path)

In [4]:
features.shape

(20,)

## extract features using wav2vec2

In [5]:
from transformers import Wav2Vec2Model, Wav2Vec2Processor
import torch
import transformers
transformers.logging.set_verbosity_error()

In [6]:
m="facebook/wav2vec2-base-960h"
preprocessor=Wav2Vec2Processor.from_pretrained(m)
model=Wav2Vec2Model.from_pretrained(m)

In [56]:
def wav2vec2_extraction(path):
    m="facebook/wav2vec2-base-960h"
    preprocessor=Wav2Vec2Processor.from_pretrained(m)
    model=Wav2Vec2Model.from_pretrained(m)
    audio,sr=librosa.load(path, sr=16000)
    inputs=preprocessor(audio, sampling_rate=sr, padding=True,return_tensors='pt')
    with torch.no_grad():
        features=model(**inputs).last_hidden_state
    np_features=features.squeeze().cpu().numpy()
    return np_features

In [8]:
path=r"C:\Users\U$ER\Documents\audio processing\check_data\checkaudio.wav"
features=wav2vec2_extraction(path)

In [9]:
features.shape

(892, 768)

In [10]:
import numpy as np
def truncate(features, target_size):
    current_size=features.shape[0]
    if current_size>=target_size:
        return features[:target_size, :]
    else:
        #padding with zeros
        pad_width=target_size-current_size
        padding=np.zeros((pad_width, features.shape[1]))
        return np.vstack((features, padding))

In [11]:
truncate(features, 50).shape

(50, 768)

In [12]:
from sklearn.decomposition import PCA


def reduce_features(features, max_components=50):
    n_samples = features.shape[0]
    n_components = min(n_samples, max_components)  # Avoid exceeding available samples
    
    pca = PCA(n_components=n_components)
    reduced_features = pca.fit_transform(features)  # Shape (n_samples, n_components)
    return np.mean(reduced_features, axis=0) 

## saving functions in pickle file

In [57]:
import dill

# Save each function separately
with open('mfcc_extraction.pkl', 'wb') as f:
    dill.dump(mfcc_extraction, f)

with open('wav2vec2_extraction.pkl', 'wb') as f:
    dill.dump(wav2vec2_extraction, f)

with open('reduce_features.pkl', 'wb') as f:
    dill.dump(reduce_features, f)


## combining mfcc and wav2vec2 features and preparing them to train ML model

In [13]:
real_audios="Audio Dataset/real"
fake_audios="Audio Dataset/fake"

In [14]:
# Initialize lists to store MFCC features and corresponding labels
final_features = []
labels = []



In [15]:
# Process real audio files
import os
for filename in os.listdir(real_audios):
    file_path = os.path.join(real_audios, filename)

    #extract mfcc features
    features1 = mfcc_extraction(file_path)
    #extract wav2vec2 features of audio
    features2=wav2vec2_extraction(file_path)
    features2=reduce_features(features2, max_components=50)
    #combine features in horizontal stack
    combine_features=np.hstack((features1, features2))
    final_features.append(combine_features)
    labels.append(0)  # Label 0 for real audio

In [16]:
# Process fake audio files
import os
for filename in os.listdir(fake_audios):
    file_path = os.path.join(fake_audios, filename)

    #extract mfcc features
    features1 = mfcc_extraction(file_path)
    #extract wav2vec2 features of audio
    features2=wav2vec2_extraction(file_path)
    features2=reduce_features(features2, max_components=50)
    #combine features in horizontal stack
    combine_features=np.hstack((features1, features2))
    final_features.append(combine_features)
    labels.append(1)  # Label 1 for fake audio

## saving features in a pickle file

In [48]:
import pickle
with open('features.pkl', 'wb') as f:
    pickle.dump(final_features, f)


In [50]:
with open('labels.pkl', 'wb') as f:
    pickle.dump(labels, f)

In [17]:
feature_lengths = [len(f) for f in final_features]
print("Min length:", min(feature_lengths))
print("Max length:", max(feature_lengths))


Min length: 49
Max length: 70


In [18]:
import numpy as np

max_len = max(len(f) for f in final_features)  # Find the longest feature vector

padded_features = np.array([
    np.pad(f, (0, max_len - len(f)), mode='constant') for f in final_features
])

print("Padded Features Shape:", padded_features.shape)  # Should be (num_samples, max_len)


Padded Features Shape: (9988, 70)


In [19]:
final_features = padded_features  # or processed_features if truncated
labels = np.array(labels)  # Convert labels to NumPy

print("Final Shape:", final_features.shape)


Final Shape: (9988, 70)


In [20]:
type(final_features)

numpy.ndarray

In [21]:
type(final_features[0])

numpy.ndarray

In [22]:
final_features[1].shape

(70,)