In [None]:
import os
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
from scipy.stats import skew, kurtosis, entropy
from scipy.signal import hilbert
from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed
from sklearn.preprocessing import normalize
from scipy import signal
from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
import librosa.display



# MFCC Features

In [None]:
# Settings read by the functions defined in this cell.
# Sampling rate handed to librosa.load.
SAMPLE_RATE = 16000
# Number of MFCC coefficients requested from librosa.feature.mfcc.
N_MFCC = 40
# Hop length passed to the MFCC and chroma calls.
HOP_LENGTH = 512
# Number of metadata rows processed between CSV writes.
BATCH_SIZE = 1000

def extract_acoustic_features(file_path):
    """
    Compute MFCC-centred summary statistics for one audio file.

    The file is loaded with librosa at SAMPLE_RATE. The returned dictionary
    contains, in insertion order:
      * ``mfcc_<k>_mean`` / ``mfcc_<k>_std`` for k = 1..N_MFCC,
      * ``chroma_mean`` / ``chroma_std`` over the whole chromagram,
      * ``delta_mfcc_<k>_mean`` / ``delta2_mfcc_<k>_mean`` for k = 1..N_MFCC.

    Args:
        file_path: Path of the audio file passed to ``librosa.load``.

    Returns:
        dict mapping feature name to a scalar.
    """
    y, sr = librosa.load(file_path, sr=SAMPLE_RATE)

    # Frame-level representations; everything below only summarises these.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC, hop_length=HOP_LENGTH)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=HOP_LENGTH)
    delta_mfcc = librosa.feature.delta(mfcc)
    delta2_mfcc = librosa.feature.delta(mfcc, order=2)

    features = {}

    # Per-coefficient MFCC statistics (keys are 1-based).
    for i in range(N_MFCC):
        features.update({
            f'mfcc_{i+1}_mean': np.mean(mfcc[i]),
            f'mfcc_{i+1}_std': np.std(mfcc[i]),
        })

    # Chroma is summarised over all pitch classes and frames at once.
    features.update({
        'chroma_mean': np.mean(chroma),
        'chroma_std': np.std(chroma),
    })

    # Means of the first- and second-order MFCC derivatives.
    for i in range(N_MFCC):
        features.update({
            f'delta_mfcc_{i+1}_mean': np.mean(delta_mfcc[i]),
            f'delta2_mfcc_{i+1}_mean': np.mean(delta2_mfcc[i]),
        })

    return features

def process_audio_files_with_batching(metadata_path, output_csv):
    """
    Extract features for every row of a metadata CSV and write them to a CSV.

    Rows are handled in chunks of BATCH_SIZE. After each chunk, the rows that
    were extracted successfully are written to ``output_csv``. Files whose
    extraction raises are reported through ``tqdm.write`` and skipped.

    Args:
        metadata_path: CSV read with ``pd.read_csv``; the columns
            ``file_path``, ``filename``, ``label`` and ``split`` are read
            from every row.
        output_csv: Destination CSV. The first batch that produces rows
            creates or overwrites it (with a header); later batches append.

    Returns:
        None.
    """
    df = pd.read_csv(metadata_path)
    total_files = len(df)
    batches = total_files // BATCH_SIZE + (1 if total_files % BATCH_SIZE else 0)

    # dirname is '' for a bare filename, and os.makedirs('') raises.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    first_write = True   # the first write truncates and emits the header
    saved_total = 0      # rows actually written so far

    master_pbar = tqdm(total=total_files, desc="Total progress", position=0)

    for batch_num in range(batches):
        batch_start = batch_num * BATCH_SIZE
        batch_end = min((batch_num + 1) * BATCH_SIZE, total_files)
        batch_files = df.iloc[batch_start:batch_end]

        batch_features = []
        batch_errors = 0

        batch_pbar = tqdm(total=len(batch_files), desc=f"Batch {batch_num+1}/{batches}", position=1, leave=False)

        for _, row in batch_files.iterrows():
            try:
                features = extract_acoustic_features(row['file_path'])
                features['file_path'] = row['file_path']
                features['filename'] = row['filename']
                features['label'] = row['label']
                features['split'] = row['split']
                batch_features.append(features)
            except Exception as e:
                # Best effort: one unreadable file must not abort the run.
                batch_errors += 1
                tqdm.write(f"Error in {row['file_path']}: {str(e)}")
            finally:
                batch_pbar.update(1)
                master_pbar.update(1)

        batch_pbar.close()

        if batch_features:
            batch_df = pd.DataFrame(batch_features)
            # Overwrite on the first write so a stale file from an earlier run
            # cannot end up with a second header row in the middle.
            batch_df.to_csv(
                output_csv,
                mode='w' if first_write else 'a',
                header=first_write,
                index=False,
            )
            first_write = False
            saved_total += len(batch_features)

        tqdm.write(f"\nBatch {batch_num+1}/{batches} completed:")
        tqdm.write(f" - Processed: {len(batch_files)} files")
        tqdm.write(f" - Successful: {len(batch_files) - batch_errors}")
        tqdm.write(f" - Errors: {batch_errors}")
        # Report rows really saved, not batch_index * BATCH_SIZE (which
        # overshoots on a partial last batch and ignores failures).
        tqdm.write(f" - Cumulative saved: {saved_total} files")
        tqdm.write("-" * 50)

        time.sleep(0.1)

    master_pbar.close()
    print(f"\nFeature extraction complete. Results saved to {output_csv}")

if __name__ == "__main__":
    metadata_path = "/content/drive/MyDrive/project/metadata.csv"
    output_csv = "/content/drive/MyDrive/project/features/mfcc.csv"

    # Remove leftovers from earlier runs: the previous CSV and any NumPy
    # dumps in the working directory.
    for stale_path in (output_csv, 'features.npy', 'labels.npy', 'splits.npy'):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    process_audio_files_with_batching(metadata_path, output_csv)

Total progress:   0%|          | 0/1600 [00:00<?, ?it/s]
Batch 1/2:   0%|          | 0/1000 [00:00<?, ?it/s][A
Total progress:   0%|          | 1/1600 [00:23<10:35:16, 23.84s/it]
Total progress:   0%|          | 2/1600 [00:24<4:26:13, 10.00s/it] 
Total progress:   0%|          | 3/1600 [00:24<2:29:02,  5.60s/it]
Total progress:   0%|          | 4/1600 [00:24<1:32:00,  3.46s/it]
Total progress:   0%|          | 5/1600 [00:24<1:01:13,  2.30s/it]
Total progress:   0%|          | 6/1600 [00:25<42:33,  1.60s/it]  
Total progress:   0%|          | 7/1600 [00:25<30:15,  1.14s/it]
Total progress:   0%|          | 8/1600 [00:25<22:19,  1.19it/s]
Total progress:   1%|          | 9/1600 [00:25<17:48,  1.49it/s]
Total progress:   1%|          | 10/1600 [00:26<13:45,  1.93it/s]
Total progress:   1%|          | 11/1600 [00:26<11:33,  2.29it/s]
  return pitch_tuning(

Total progress:   1%|          | 13/1600 [00:26<09:22,  2.82it/s]
Total progress:   1%|          | 14/1600 [00:27<08:07,  3.26it/s]
T


Batch 1/2 completed:
 - Processed: 1000 files
 - Successful: 1000
 - Errors: 0
 - Cumulative saved: 1000 files
--------------------------------------------------



Batch 2/2:   0%|          | 0/600 [00:00<?, ?it/s][A
Total progress:  63%|██████▎   | 1001/1600 [04:19<05:00,  1.99it/s]
Total progress:  63%|██████▎   | 1002/1600 [04:19<04:27,  2.24it/s]
Total progress:  63%|██████▎   | 1003/1600 [04:19<04:03,  2.45it/s]
Total progress:  63%|██████▎   | 1004/1600 [04:20<03:35,  2.76it/s]
Total progress:  63%|██████▎   | 1005/1600 [04:20<03:12,  3.09it/s]
Total progress:  63%|██████▎   | 1006/1600 [04:20<03:03,  3.24it/s]
Total progress:  63%|██████▎   | 1007/1600 [04:20<02:39,  3.71it/s]
Total progress:  63%|██████▎   | 1008/1600 [04:20<02:22,  4.16it/s]
Total progress:  63%|██████▎   | 1009/1600 [04:21<02:15,  4.35it/s]
Total progress:  63%|██████▎   | 1010/1600 [04:21<02:33,  3.86it/s]
Total progress:  63%|██████▎   | 1011/1600 [04:21<02:46,  3.55it/s]
Total progress:  63%|██████▎   | 1012/1600 [04:22<02:45,  3.56it/s]
Total progress:  63%|██████▎   | 1013/1600 [04:22<02:33,  3.82it/s]
Total progress:  63%|██████▎   | 1014/1600 [04:22<02:26,  4.0


Batch 2/2 completed:
 - Processed: 600 files
 - Successful: 600
 - Errors: 0
 - Cumulative saved: 2000 files
--------------------------------------------------

Feature extraction complete. Results saved to /content/drive/MyDrive/project/features/mfcc.csv





# Acoustic Features

In [None]:
# Settings read by the functions defined in this cell.
# Sampling rate handed to librosa.load.
SAMPLE_RATE = 16000
# Not read by the active code in this cell (the MFCC section is commented out).
N_MFCC = 40
# Hop length passed to the STFT and chroma calls.
HOP_LENGTH = 512
# Number of metadata rows processed between CSV writes.
BATCH_SIZE = 1000

def extract_acoustic_features(file_path):
    """
    Compute time-domain and spectral summary statistics for one audio file.

    The file is loaded with librosa at SAMPLE_RATE. The returned dictionary
    contains, in insertion order: ``zcr_mean``, ``rms_mean``, then mean/std of
    spectral centroid, bandwidth and rolloff, chroma mean/std, spectral
    contrast mean/std and tonnetz mean/std.

    Args:
        file_path: Path of the audio file passed to ``librosa.load``.

    Returns:
        dict mapping feature name to a scalar.
    """
    y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    features = {}

    # 1. Time-domain features
    features['zcr_mean'] = np.mean(librosa.feature.zero_crossing_rate(y=y))
    features['rms_mean'] = np.mean(librosa.feature.rms(y=y))

    # 2. Spectral features, all derived from one magnitude spectrogram.
    S = librosa.magphase(librosa.stft(y=y, hop_length=HOP_LENGTH))[0]

    # When a precomputed S is supplied, librosa cannot know the sampling rate
    # and falls back to its default of 22050 Hz. The audio here is at `sr`
    # (SAMPLE_RATE), so `sr` must be passed explicitly or every
    # frequency-valued feature is mis-scaled.
    spectral_centroid = librosa.feature.spectral_centroid(S=S, sr=sr)
    features['spectral_centroid_mean'] = np.mean(spectral_centroid)
    features['spectral_centroid_std'] = np.std(spectral_centroid)

    spectral_bandwidth = librosa.feature.spectral_bandwidth(S=S, sr=sr)
    features['spectral_bandwidth_mean'] = np.mean(spectral_bandwidth)
    features['spectral_bandwidth_std'] = np.std(spectral_bandwidth)

    spectral_rolloff = librosa.feature.spectral_rolloff(S=S, sr=sr)
    features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
    features['spectral_rolloff_std'] = np.std(spectral_rolloff)

    # 3. Chroma features
    chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=HOP_LENGTH)
    features['chroma_mean'] = np.mean(chroma)
    features['chroma_std'] = np.std(chroma)

    # 4. Spectral contrast (band edges also depend on the sampling rate)
    spectral_contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
    features['spectral_contrast_mean'] = np.mean(spectral_contrast)
    features['spectral_contrast_std'] = np.std(spectral_contrast)

    # 5. Tonnetz features
    tonnetz = librosa.feature.tonnetz(y=y, sr=sr)
    features['tonnetz_mean'] = np.mean(tonnetz)
    features['tonnetz_std'] = np.std(tonnetz)

    return features

def process_audio_files_with_batching(metadata_path, output_csv):
    """
    Extract features for every row of a metadata CSV and write them to a CSV.

    Rows are handled in chunks of BATCH_SIZE. After each chunk, the rows that
    were extracted successfully are written to ``output_csv``. Files whose
    extraction raises are reported through ``tqdm.write`` and skipped.

    Args:
        metadata_path: CSV read with ``pd.read_csv``; the columns
            ``file_path``, ``label`` and ``split`` are read from every row.
        output_csv: Destination CSV. The first batch that produces rows
            creates or overwrites it (with a header); later batches append.

    Returns:
        None.
    """
    df = pd.read_csv(metadata_path)
    total_files = len(df)
    batches = total_files // BATCH_SIZE + (1 if total_files % BATCH_SIZE else 0)

    # dirname is '' for a bare filename, and os.makedirs('') raises.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    first_write = True   # the first write truncates and emits the header
    saved_total = 0      # rows actually written so far

    master_pbar = tqdm(total=total_files, desc="Total progress", position=0)

    for batch_num in range(batches):
        batch_start = batch_num * BATCH_SIZE
        batch_end = min((batch_num + 1) * BATCH_SIZE, total_files)
        batch_files = df.iloc[batch_start:batch_end]

        batch_features = []
        batch_errors = 0

        batch_pbar = tqdm(total=len(batch_files), desc=f"Batch {batch_num+1}/{batches}", position=1, leave=False)

        for _, row in batch_files.iterrows():
            try:
                features = extract_acoustic_features(row['file_path'])
                features['label'] = row['label']
                features['split'] = row['split']
                batch_features.append(features)
            except Exception as e:
                # Best effort: one unreadable file must not abort the run.
                batch_errors += 1
                tqdm.write(f"Error in {row['file_path']}: {str(e)}")
            finally:
                batch_pbar.update(1)
                master_pbar.update(1)

        batch_pbar.close()

        if batch_features:
            batch_df = pd.DataFrame(batch_features)
            # Overwrite on the first write so a stale file from an earlier run
            # cannot end up with a second header row in the middle.
            batch_df.to_csv(
                output_csv,
                mode='w' if first_write else 'a',
                header=first_write,
                index=False,
            )
            first_write = False
            saved_total += len(batch_features)

        tqdm.write(f"\nBatch {batch_num+1}/{batches} completed:")
        tqdm.write(f" - Processed: {len(batch_files)} files")
        tqdm.write(f" - Successful: {len(batch_files) - batch_errors}")
        tqdm.write(f" - Errors: {batch_errors}")
        # Report rows really saved, not batch_index * BATCH_SIZE (which
        # overshoots on a partial last batch and ignores failures).
        tqdm.write(f" - Cumulative saved: {saved_total} files")
        tqdm.write("-" * 50)

        time.sleep(0.1)

    master_pbar.close()
    print(f"\nFeature extraction complete. Results saved to {output_csv}")

if __name__ == "__main__":
    metadata_path = "/content/drive/MyDrive/project/metadata.csv"
    output_csv = "/content/drive/MyDrive/project/features/Spectral.csv"

    # Remove leftovers from earlier runs: the previous CSV and any NumPy
    # dumps in the working directory.
    for stale_path in (output_csv, 'features.npy', 'labels.npy', 'splits.npy'):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    process_audio_files_with_batching(metadata_path, output_csv)

Total progress:   0%|          | 0/1600 [00:00<?, ?it/s]

Total progress:   0%|          | 1/1600 [00:00<06:31,  4.08it/s]
Total progress:   0%|          | 2/1600 [00:00<06:46,  3.93it/s]
Total progress:   0%|          | 3/1600 [00:00<06:59,  3.80it/s]
Total progress:   0%|          | 4/1600 [00:01<06:59,  3.81it/s]
Total progress:   0%|          | 5/1600 [00:01<06:57,  3.82it/s]
Total progress:   0%|          | 6/1600 [00:01<08:02,  3.31it/s]
Total progress:   0%|          | 7/1600 [00:01<07:59,  3.32it/s]
Total progress:   0%|          | 8/1600 [00:02<06:53,  3.85it/s]
Total progress:   1%|          | 9/1600 [00:02<06:14,  4.24it/s]
Total progress:   1%|          | 10/1600 [00:02<05:33,  4.77it/s]
Total progress:   1%|          | 11/1600 [00:02<05:05,  5.20it/s]
  return pitch_tuning(

Total progress:   1%|          | 13/1600 [00:03<05:30,  4.81it/s]
Total progress:   1%|          | 14/1600 [00:03<06:23,  4.14it/s]
Total progress:   1%|          | 15/1600 [00:03<06:19,  4.18it/s]
Tot


Batch 1/2 completed:
 - Processed: 1000 files
 - Successful: 1000
 - Errors: 0
 - Cumulative saved: 1000 files
--------------------------------------------------



Batch 2/2:   0%|          | 0/600 [00:00<?, ?it/s][A
Total progress:  63%|██████▎   | 1001/1600 [03:30<02:37,  3.79it/s]
Total progress:  63%|██████▎   | 1002/1600 [03:31<02:17,  4.34it/s]
Total progress:  63%|██████▎   | 1003/1600 [03:31<02:06,  4.74it/s]
Total progress:  63%|██████▎   | 1004/1600 [03:31<01:55,  5.18it/s]
Total progress:  63%|██████▎   | 1005/1600 [03:31<01:47,  5.55it/s]
Total progress:  63%|██████▎   | 1006/1600 [03:31<01:40,  5.89it/s]
Total progress:  63%|██████▎   | 1007/1600 [03:31<01:39,  5.99it/s]
Total progress:  63%|██████▎   | 1008/1600 [03:32<01:36,  6.12it/s]
Total progress:  63%|██████▎   | 1009/1600 [03:32<01:36,  6.10it/s]
Total progress:  63%|██████▎   | 1010/1600 [03:32<01:35,  6.18it/s]
Total progress:  63%|██████▎   | 1011/1600 [03:32<01:32,  6.37it/s]
Total progress:  63%|██████▎   | 1012/1600 [03:32<01:30,  6.48it/s]
Total progress:  63%|██████▎   | 1013/1600 [03:32<01:30,  6.52it/s]
Total progress:  63%|██████▎   | 1014/1600 [03:32<01:31,  6.4


Batch 2/2 completed:
 - Processed: 600 files
 - Successful: 600
 - Errors: 0
 - Cumulative saved: 2000 files
--------------------------------------------------

Feature extraction complete. Results saved to /content/drive/MyDrive/project/features/Spectral.csv





In [None]:
# Quick look at the schema of the spectrogram metadata table.
df = pd.read_csv('/content/drive/MyDrive/project/features/spectrogram_features/spectrogram_metadata.csv')
column_names = df.columns.to_frame(index=False, name="Column Names")
print(column_names)
# Other checks used while exploring:
# print(df)
# print(df.describe())
# print(df['label'].value_counts())
# print(df['split'].value_counts())

    Column Names
0  original_path
1       npy_path
2     image_path
3          label
4          split


In [None]:
# Convert the numeric columns of the temporal/prosodic feature CSV to a .npy array.
try:
    df = pd.read_csv('/content/drive/MyDrive/project/features/temporal_prosodic_features.csv')
    df = df.select_dtypes(include=[np.number])  # Auto-filter numeric columns
    # np.save appends ".npy" to any name that lacks it, so targeting the
    # ".csv" path wrote "temporal_prosodic_features.csv.npy". Save under the
    # intended ".npy" name instead.
    np.save('/content/drive/MyDrive/project/features/temporal_prosodic_features.npy', df.to_numpy())
    print("Conversion successful!")
except Exception as e:
    print(f"Error: {e}")

Conversion successful!


In [None]:
import numpy as np

data = np.load('/content/drive/MyDrive/data/features/acoustic_features.npy')

if isinstance(data, np.ndarray):
    # A .npy file loads as a plain ndarray, which has no `.files` attribute
    # (that belongs to the archive object returned for .npz files).
    print(data)
    print(data.shape)
    print(data.dtype)
else:
    # .npz archive: list the stored array names instead.
    print(data.files)

# For an .npz archive the individual arrays are read by name:
# loaded_data = np.load('/content/drive/MyDrive/data/features/acoustic_features.npy')

# print(loaded_data.files)  # e.g., ['arr_0', 'arr_1']

# array1 = loaded_data['arr_0']
# array2 = loaded_data['arr_1']

[[ 1.40516493e-01  1.45221280e-01  2.18591683e+03 ... -2.38883980e-02
  -8.14859200e-02  0.00000000e+00]
 [ 1.00570437e-01  5.67095170e-02  2.02415159e+03 ...  3.68334760e-02
  -2.21861800e-02  0.00000000e+00]
 [ 1.25232515e-01  8.02200140e-02  2.37859848e+03 ... -6.37733900e-03
   2.13560370e-02  0.00000000e+00]
 ...
 [ 1.40640501e-01  9.51769400e-02  2.27405670e+03 ...  1.44496990e-01
  -5.96113130e-02  1.00000000e+00]
 [ 6.27867684e-02  1.14963170e-01  1.40512040e+03 ...  4.11573950e-01
  -6.05035350e-02  1.00000000e+00]
 [ 1.09708271e-01  2.12499290e-01  1.58242994e+03 ...  2.86795010e-02
   1.41147790e-01  1.00000000e+00]]


AttributeError: 'numpy.ndarray' object has no attribute 'files'

# Temporal Features

In [None]:
# Settings read by the functions defined in this cell.
# Sampling rate handed to librosa.load.
SAMPLE_RATE = 16000
# Frame length passed to the ZCR, RMS and pYIN calls.
FRAME_LENGTH = 2048
# Hop length passed to the ZCR, RMS and pYIN calls.
HOP_LENGTH = 512
# Lower bound passed to librosa.pyin as fmin.
FMIN = 80
# Upper bound passed to librosa.pyin as fmax.
FMAX = 400
# Number of metadata rows processed between CSV writes.
BATCH_SIZE = 1000
def extract_temporal_features(y):
    """
    Summarise time-domain behaviour of an audio signal.

    Args:
        y: Audio samples, passed to librosa's zero-crossing-rate, RMS and
            autocorrelation routines.

    Returns:
        dict with, in order: ``zcr_mean``, ``zcr_std``, ``rms_mean``,
        ``rms_std``, ``autocorr_peak`` and ``autocorr_lag_<n>`` for
        n in (1, 10, 100).
    """
    zcr = librosa.feature.zero_crossing_rate(y, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)
    rms = librosa.feature.rms(y=y, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)
    autocorr = librosa.autocorrelate(y, max_size=1000)

    features = {
        'zcr_mean': np.mean(zcr),
        'zcr_std': np.std(zcr),
        'rms_mean': np.mean(rms),
        'rms_std': np.std(rms),
        # Lag 0 is skipped when searching for the peak.
        'autocorr_peak': np.max(autocorr[1:]),
    }

    # Individual lags; a lag beyond the computed range is reported as 0.
    for lag in [1, 10, 100]:
        if lag < len(autocorr):
            features[f'autocorr_lag_{lag}'] = autocorr[lag]
        else:
            features[f'autocorr_lag_{lag}'] = 0

    return features

def extract_prosodic_features(y, sr):
    """
    Extract pitch, pause and voicing statistics from an audio signal.

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y``; used by pYIN and to convert frame counts
            to seconds.

    Returns:
        dict with the keys ``pitch_mean``, ``pitch_std``, ``pitch_range``,
        ``pitch_kurtosis``, ``pitch_skewness``, ``intonation_slope``,
        ``intonation_intercept``, ``pause_duration_total``, ``pause_count``
        and ``speaking_rate``. All pitch/intonation keys are 0 when no frame
        is voiced, so every call yields the same set of keys.
    """
    features = {}

    # Pitch (F0) estimation using PYIN
    f0, voiced_flag, _ = librosa.pyin(y, sr=sr, fmin=FMIN, fmax=FMAX,
                                    frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)

    if np.any(voiced_flag):
        f0_voiced = f0[voiced_flag]

        # Pitch statistics
        features['pitch_mean'] = np.mean(f0_voiced)
        features['pitch_std'] = np.std(f0_voiced)
        features['pitch_range'] = np.ptp(f0_voiced)
        features['pitch_kurtosis'] = kurtosis(f0_voiced)
        features['pitch_skewness'] = skew(f0_voiced)

        # Intonation contour: straight-line fit of F0 over voiced-frame index
        x = np.arange(len(f0_voiced)).reshape(-1, 1)
        model = LinearRegression().fit(x, f0_voiced)
        features['intonation_slope'] = model.coef_[0]
        features['intonation_intercept'] = model.intercept_
    else:
        # Same keys as the voiced branch (kurtosis/skewness included) so the
        # resulting table never has holes for unvoiced clips.
        features.update({k: 0 for k in ['pitch_mean', 'pitch_std', 'pitch_range',
                                      'pitch_kurtosis', 'pitch_skewness',
                                      'intonation_slope', 'intonation_intercept']})

    # Pause analysis using RMS energy: frames below the clip's 10th
    # percentile are treated as silent.
    rms = librosa.feature.rms(y=y, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]
    threshold = np.percentile(rms, 10)
    silent = rms < threshold

    # A pause is a run of consecutive silent frames. It starts wherever a
    # silent frame is not preceded by another silent frame. (Differencing the
    # silent-frame indices would measure the gaps between silent frames, not
    # the pauses themselves.)
    previous_silent = np.concatenate(([False], silent[:-1]))
    pause_starts = silent & ~previous_silent

    features['pause_duration_total'] = np.sum(silent) * HOP_LENGTH / sr
    features['pause_count'] = int(np.sum(pause_starts))

    # Speaking rate estimation: fraction of frames pYIN marked as voiced
    voiced_frames = np.sum(voiced_flag)
    total_frames = len(voiced_flag)
    features['speaking_rate'] = voiced_frames / total_frames if total_frames > 0 else 0

    return features

def extract_features(file_path):
    """
    Load one audio file and return its temporal and prosodic features.

    Args:
        file_path: Path of the audio file passed to ``librosa.load``
            (loaded at SAMPLE_RATE).

    Returns:
        A single dict: temporal features first, then prosodic features.
        On a key clash the prosodic value wins.
    """
    y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    return {
        **extract_temporal_features(y),
        **extract_prosodic_features(y, sr),
    }

def process_dataset_with_batching(metadata_path, output_path):
    """
    Extract features for every row of a metadata CSV and write them to a CSV.

    Rows are handled in chunks of BATCH_SIZE. After each chunk, the rows that
    were extracted successfully are written to ``output_path``. Files whose
    extraction raises are reported through ``tqdm.write`` and skipped.

    Args:
        metadata_path: CSV read with ``pd.read_csv``; the columns
            ``file_path``, ``filename``, ``label`` and ``split`` are read
            from every row.
        output_path: Destination CSV. The first batch that produces rows
            creates or overwrites it (with a header); later batches append.
            Re-running therefore replaces old results instead of appending a
            second copy behind a duplicate header.

    Returns:
        None.
    """
    df = pd.read_csv(metadata_path)
    total_files = len(df)
    batches = total_files // BATCH_SIZE + (1 if total_files % BATCH_SIZE else 0)

    # dirname is '' for a bare filename, and os.makedirs('') raises.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    master_pbar = tqdm(total=total_files, desc="Total progress", position=0)
    first_write = True   # the first write truncates and emits the header
    saved_total = 0      # rows actually written so far

    for batch_num in range(batches):
        batch_start = batch_num * BATCH_SIZE
        batch_end = min((batch_num + 1) * BATCH_SIZE, total_files)
        batch_files = df.iloc[batch_start:batch_end]

        batch_features = []
        batch_errors = 0
        batch_pbar = tqdm(total=len(batch_files), desc=f"Batch {batch_num+1}/{batches}", position=1, leave=False)

        for _, row in batch_files.iterrows():
            try:
                features = extract_features(row['file_path'])
                features['file_path'] = row['file_path']
                features['filename'] = row['filename']
                features['label'] = row['label']
                features['split'] = row['split']
                batch_features.append(features)
            except Exception as e:
                # Best effort: one unreadable file must not abort the run.
                batch_errors += 1
                tqdm.write(f"Error processing {row['file_path']}: {str(e)}")
            finally:
                batch_pbar.update(1)
                master_pbar.update(1)

        batch_pbar.close()

        if batch_features:
            batch_df = pd.DataFrame(batch_features)
            batch_df.to_csv(
                output_path,
                mode='w' if first_write else 'a',
                header=first_write,
                index=False,
            )
            first_write = False
            saved_total += len(batch_features)

        tqdm.write(f"\nBatch {batch_num+1}/{batches} completed:")
        tqdm.write(f" - Processed: {len(batch_files)} files")
        tqdm.write(f" - Successful: {len(batch_files) - batch_errors}")
        tqdm.write(f" - Errors: {batch_errors}")
        # Report rows really saved, not batch_index * BATCH_SIZE (which
        # overshoots on a partial last batch and ignores failures).
        tqdm.write(f" - Cumulative saved: {saved_total} files")
        tqdm.write("-" * 50)

        time.sleep(0.1)

    master_pbar.close()
    print(f"\nFeature extraction complete. Results saved to {output_path}")

if __name__ == "__main__":
    # Arguments are (metadata CSV, output CSV).
    process_dataset_with_batching(
        "/content/drive/MyDrive/project/metadata.csv",
        "/content/drive/MyDrive/project/features/temporal.csv",
    )

Total progress:   0%|          | 0/1600 [00:00<?, ?it/s]
Batch 1/2:   0%|          | 0/1000 [00:00<?, ?it/s][A
Total progress:   0%|          | 1/1600 [00:00<09:54,  2.69it/s]
Total progress:   0%|          | 2/1600 [00:00<07:06,  3.75it/s]
Total progress:   0%|          | 3/1600 [00:00<06:39,  3.99it/s]
Total progress:   0%|          | 4/1600 [00:01<06:29,  4.09it/s]
Total progress:   0%|          | 5/1600 [00:01<06:30,  4.08it/s]
Total progress:   0%|          | 6/1600 [00:01<06:16,  4.24it/s]
Total progress:   0%|          | 7/1600 [00:01<06:06,  4.35it/s]
Total progress:   0%|          | 8/1600 [00:01<05:52,  4.51it/s]
Total progress:   1%|          | 9/1600 [00:02<05:39,  4.69it/s]
Total progress:   1%|          | 10/1600 [00:02<05:30,  4.81it/s]
Total progress:   1%|          | 11/1600 [00:02<05:12,  5.08it/s]
Total progress:   1%|          | 12/1600 [00:02<04:59,  5.30it/s]
Total progress:   1%|          | 13/1600 [00:02<05:09,  5.13it/s]
Total progress:   1%|          | 14/160


Batch 1/2 completed:
 - Processed: 1000 files
 - Successful: 1000
 - Errors: 0
 - Cumulative saved: 1000 files
--------------------------------------------------



Batch 2/2:   0%|          | 0/600 [00:00<?, ?it/s][A
Total progress:  63%|██████▎   | 1001/1600 [14:11<10:43,  1.07s/it]
Total progress:  63%|██████▎   | 1002/1600 [14:12<10:17,  1.03s/it]
Total progress:  63%|██████▎   | 1003/1600 [14:13<09:35,  1.04it/s]
Total progress:  63%|██████▎   | 1004/1600 [14:14<09:17,  1.07it/s]
Total progress:  63%|██████▎   | 1005/1600 [14:15<09:08,  1.08it/s]
Total progress:  63%|██████▎   | 1006/1600 [14:15<08:41,  1.14it/s]
Total progress:  63%|██████▎   | 1007/1600 [14:16<08:14,  1.20it/s]
Total progress:  63%|██████▎   | 1008/1600 [14:17<08:34,  1.15it/s]
Total progress:  63%|██████▎   | 1009/1600 [14:18<08:35,  1.15it/s]
Total progress:  63%|██████▎   | 1010/1600 [14:19<08:27,  1.16it/s]
Total progress:  63%|██████▎   | 1011/1600 [14:20<08:15,  1.19it/s]
Total progress:  63%|██████▎   | 1012/1600 [14:20<07:58,  1.23it/s]
Total progress:  63%|██████▎   | 1013/1600 [14:21<08:02,  1.22it/s]
Total progress:  63%|██████▎   | 1014/1600 [14:23<10:03,  1.0


Batch 2/2 completed:
 - Processed: 600 files
 - Successful: 600
 - Errors: 0
 - Cumulative saved: 2000 files
--------------------------------------------------

Feature extraction complete. Results saved to /content/drive/MyDrive/project/features/temporal.csv





# wav2vec2

In [None]:
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model

class Wav2Vec2EmbeddingExtractor:
    """
    Turn audio into a fixed-size vector with a pretrained Wav2Vec2 model.

    The model's last hidden state is averaged over the time dimension, giving
    one vector per input.
    """

    def __init__(self, model_name="facebook/wav2vec2-base-960h", device="cuda"):
        """
        Load the processor and model.

        Args:
            model_name: Checkpoint name handed to ``from_pretrained``.
            device: Device to use when CUDA is available; otherwise "cpu"
                is used regardless of this value.
        """
        if torch.cuda.is_available():
            self.device = device
        else:
            self.device = "cpu"
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2Model.from_pretrained(model_name).to(self.device)
        self.sample_rate = 16000

    def load_audio(self, file_path):
        """Load ``file_path`` with librosa at ``self.sample_rate`` and return the samples."""
        waveform, _ = librosa.load(file_path, sr=self.sample_rate)
        return waveform

    def extract_embeddings(self, audio_array):
        """
        Run the model on ``audio_array`` and mean-pool over time.

        Returns:
            NumPy array of the pooled last hidden state with singleton
            dimensions removed.
        """
        encoded = self.processor(
            audio_array,
            sampling_rate=self.sample_rate,
            return_tensors="pt",
            padding=True,
        )
        encoded = encoded.to(self.device)

        # Inference only: no gradients are needed for the forward pass.
        with torch.no_grad():
            outputs = self.model(**encoded)

        # Average the last hidden state along dim=1 (the time axis).
        pooled = outputs.last_hidden_state.mean(dim=1)
        return pooled.cpu().numpy().squeeze()

    def process_file(self, file_path):
        """Load one audio file and return its pooled embedding."""
        return self.extract_embeddings(self.load_audio(file_path))

def extract_wav2vec_features(metadata_path, output_dir, batch_size=16):
    """
    Compute a Wav2Vec2 embedding for every file listed in a metadata CSV.

    Each embedding is saved as ``<output_dir>/embeddings/<name>.npy`` and an
    index of the successfully processed files is written to
    ``<output_dir>/wav2vec_metadata.csv``. Files that raise are reported with
    ``print`` and skipped.

    Args:
        metadata_path: CSV read with ``pd.read_csv``; the columns
            ``file_path``, ``label`` and ``split`` are read from every row.
        output_dir: Directory that receives the embeddings folder and the
            index CSV; created if missing.
        batch_size: Accepted for interface compatibility but currently
            unused: files are processed one at a time.

    Returns:
        DataFrame with the columns ``file_path``, ``embedding_path``,
        ``label`` and ``split`` (one row per saved embedding).
    """
    os.makedirs(output_dir, exist_ok=True)
    embeddings_dir = os.path.join(output_dir, "embeddings")
    os.makedirs(embeddings_dir, exist_ok=True)

    extractor = Wav2Vec2EmbeddingExtractor()

    df = pd.read_csv(metadata_path)
    results = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        # Bound before the try so the error message below always names the
        # row being processed (never an unbound or stale variable).
        file_path = row['file_path']
        try:
            embeddings = extractor.process_file(file_path)

            # NOTE(review): the name is cut at the FIRST dot and directories
            # are dropped, so "a.b.wav" and "x/a.wav" both map to "a.npy".
            # Confirm that metadata basenames are unique and dot-free.
            filename = os.path.basename(file_path).split('.')[0]
            save_path = os.path.join(embeddings_dir, f"{filename}.npy")
            np.save(save_path, embeddings)

            results.append({
                'file_path': file_path,
                'embedding_path': save_path,
                'label': row['label'],
                'split': row['split']
            })
        except Exception as e:
            # Best effort: one bad file must not abort the run.
            print(f"Error processing {file_path}: {str(e)}")

    metadata_df = pd.DataFrame(results)
    metadata_df.to_csv(os.path.join(output_dir, "wav2vec_metadata.csv"), index=False)
    return metadata_df

if __name__ == "__main__":
    # Keyword arguments for extract_wav2vec_features, kept in one place.
    config = dict(
        metadata_path="/content/drive/MyDrive/project/metadata.csv",
        output_dir="/content/drive/MyDrive/project/features/wav2vec2_embeddings",
        batch_size=32,
    )

    extract_wav2vec_features(**config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1600/1600 [16:11<00:00,  1.65it/s]


# Spectrogram

In [None]:
# Settings read by the spectrogram functions in this cell.
# Sampling rate handed to librosa.load.
SAMPLE_RATE = 16000
# FFT size passed to librosa.feature.melspectrogram.
N_FFT = 512
# Hop length for the mel spectrogram and its display (256 here; the earlier
# feature cells set HOP_LENGTH to 512).
HOP_LENGTH = 256
# Number of mel bands.
N_MELS = 128
# Passed as `duration=` to librosa.load, limiting how much audio is read.
DURATION = 2.0
# With DPI, sets the figure size: figsize = IMG_SIZE / DPI inches at DPI dots per inch.
IMG_SIZE = (224, 224)
DPI = 100

def create_spectrogram(audio_path, output_base_path):
    """
    Compute a log-mel spectrogram for one audio file and save it to disk.

    The audio is loaded with librosa (resampled to SAMPLE_RATE, limited to
    DURATION), converted to a mel power spectrogram, and scaled to dB relative
    to its maximum. The matrix is written to ``<output_base_path>.npy`` and a
    rendering without axes to ``<output_base_path>.png``.

    Args:
        audio_path: Path of the audio file to load.
        output_base_path: Output path without extension; ".npy" and ".png"
            are appended to it.

    Returns:
        True if both files were written. False if any step failed; the error
        is printed instead of being raised so a batch run can continue.
    """
    try:
        y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, duration=DURATION)
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=N_FFT,
                                           hop_length=HOP_LENGTH, n_mels=N_MELS)
        log_S = librosa.power_to_db(S, ref=np.max)

        # dirname is '' for a bare file name, and os.makedirs('') raises.
        output_dir = os.path.dirname(output_base_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        np.save(f"{output_base_path}.npy", log_S)

        fig = plt.figure(figsize=(IMG_SIZE[0]/DPI, IMG_SIZE[1]/DPI), dpi=DPI)
        try:
            librosa.display.specshow(log_S, sr=sr, hop_length=HOP_LENGTH,
                                     x_axis='time', y_axis='mel')
            plt.axis('off')
            fig.savefig(f"{output_base_path}.png", bbox_inches='tight', pad_inches=0)
        finally:
            # Always release the figure: if rendering or saving fails, an
            # unclosed figure would otherwise stay alive for every bad file.
            plt.close(fig)
        return True
    except Exception as e:
        print(f"Error processing {audio_path}: {str(e)}")
        return False

def process_spectrograms(metadata_path, output_root):
    """
    Generate spectrogram files for every row of a metadata CSV.

    The CSV is read with pandas and must provide the columns ``file_path``,
    ``label`` (0 or 1, mapped to 'real' / 'fake') and ``split``. For each row
    create_spectrogram writes ``<output_root>/<split>/<label>/<stem>.npy`` and
    ``.png``, where ``<stem>`` is the audio file name without its extension.
    Rows that fail are reported on stdout and skipped. A summary of the
    successfully processed rows is written to
    ``<output_root>/spectrogram_metadata.csv``.

    Args:
        metadata_path: Path of the input metadata CSV.
        output_root: Root directory for the generated files and the summary CSV.

    Returns:
        pandas.DataFrame with one row per successfully processed file
        (columns: original_path, npy_path, image_path, label, split).
    """
    df = pd.read_csv(metadata_path)
    results = []

    LABEL_MAP = {0: 'real', 1: 'fake'}
    RESULT_COLUMNS = ['original_path', 'npy_path', 'image_path', 'label', 'split']

    df['full_path'] = df['file_path'].apply(os.path.abspath)

    for split in ['train', 'val', 'test']:
        for label in LABEL_MAP.values():
            os.makedirs(os.path.join(output_root, split, label), exist_ok=True)

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing"):
        try:
            full_audio_path = row['full_path']
            if not os.path.exists(full_audio_path):
                raise FileNotFoundError(f"Audio file not found: {full_audio_path}")

            label = LABEL_MAP[row['label']]
            split = row['split']

            # splitext strips only the final extension, so names containing
            # dots ("clip.v2.wav") keep their full stem and do not collide.
            filename = os.path.splitext(os.path.basename(full_audio_path))[0]
            output_base = os.path.join(output_root, split, label, filename)

            if create_spectrogram(full_audio_path, output_base):
                results.append({
                    'original_path': full_audio_path,
                    'npy_path': f"{output_base}.npy",
                    'image_path': f"{output_base}.png",
                    'label': label,
                    'split': split
                })
        except Exception as e:
            print(f"Skipping {row['file_path']}: {str(e)}")

    # Explicit columns keep the CSV header even when nothing was processed.
    metadata_df = pd.DataFrame(results, columns=RESULT_COLUMNS)
    output_metadata_path = os.path.join(output_root, 'spectrogram_metadata.csv')
    metadata_df.to_csv(output_metadata_path, index=False)
    print(f"\nSuccessfully processed {len(results)}/{len(df)} files")
    return metadata_df

if __name__ == "__main__":
    # Source metadata CSV and the root folder that receives the spectrograms.
    METADATA_PATH = "/content/drive/MyDrive/project/metadata.csv"
    OUTPUT_ROOT = "/content/drive/MyDrive/project/features/spectrogram_features"

    process_spectrograms(metadata_path=METADATA_PATH, output_root=OUTPUT_ROOT)

Processing: 100%|██████████| 1600/1600 [03:39<00:00,  7.30it/s]


Successfully processed 1600/1600 files





# raw

In [None]:
import csv

# Open the CSV file; newline='' leaves line-ending handling to the csv module
with open('/content/drive/MyDrive/data/features/acoustic_features.csv', mode='r', newline='') as file:
    csv_reader = csv.reader(file)

    # Extract and print the header (first row); an empty file yields []
    # instead of raising StopIteration
    headers = next(csv_reader, [])
    print("CSV Headers:", headers)


CSV Headers: ['zcr_mean', 'rms_mean', 'spectral_centroid_mean', 'spectral_centroid_std', 'spectral_bandwidth_mean', 'spectral_bandwidth_std', 'spectral_rolloff_mean', 'spectral_rolloff_std', 'mfcc_1_mean', 'mfcc_1_std', 'mfcc_2_mean', 'mfcc_2_std', 'mfcc_3_mean', 'mfcc_3_std', 'mfcc_4_mean', 'mfcc_4_std', 'mfcc_5_mean', 'mfcc_5_std', 'mfcc_6_mean', 'mfcc_6_std', 'mfcc_7_mean', 'mfcc_7_std', 'mfcc_8_mean', 'mfcc_8_std', 'mfcc_9_mean', 'mfcc_9_std', 'mfcc_10_mean', 'mfcc_10_std', 'mfcc_11_mean', 'mfcc_11_std', 'mfcc_12_mean', 'mfcc_12_std', 'mfcc_13_mean', 'mfcc_13_std', 'mfcc_14_mean', 'mfcc_14_std', 'mfcc_15_mean', 'mfcc_15_std', 'mfcc_16_mean', 'mfcc_16_std', 'mfcc_17_mean', 'mfcc_17_std', 'mfcc_18_mean', 'mfcc_18_std', 'mfcc_19_mean', 'mfcc_19_std', 'mfcc_20_mean', 'mfcc_20_std', 'mfcc_21_mean', 'mfcc_21_std', 'mfcc_22_mean', 'mfcc_22_std', 'mfcc_23_mean', 'mfcc_23_std', 'mfcc_24_mean', 'mfcc_24_std', 'mfcc_25_mean', 'mfcc_25_std', 'mfcc_26_mean', 'mfcc_26_std', 'mfcc_27_mean', 'mfcc

In [None]:
import numpy as np

# Read one saved embedding back from disk for a quick sanity check
data = np.load("/content/drive/MyDrive/project/features/wav2vec2_embeddings/embeddings/fake_auidofile_001.npy")

# dtype.names is only populated for structured arrays
if not data.dtype.names:
    print("No column names found. Data might be a regular NumPy array.")
else:
    print("Column names:", data.dtype.names)

# Shape first, then the raw values
print(data.shape)
print(data)


No column names found. Data might be a regular NumPy array.
(768,)
[-4.74379472e-02 -3.07522174e-02 -1.08682953e-01 -8.14765692e-02
  1.20355517e-01 -7.83930793e-02  3.10691446e-02 -2.86291204e-02
  1.05977781e-01 -1.85622245e-01 -3.16884257e-02  3.57182464e-03
  5.25382310e-02  2.04946063e-02  2.96878349e-02 -7.96278864e-02
 -3.19083422e-01  2.44651332e-01  2.92555690e-02  4.66702431e-02
 -1.81778952e-01 -9.55576636e-03  4.74340856e-01  1.44042419e-02
 -3.75416689e-02 -5.97259365e-02 -2.93933839e-01  2.28721742e-02
 -2.00615842e-02 -1.42160535e-01  1.44390479e-01 -8.44883360e-03
 -3.54958791e-03 -8.81628469e-02 -2.51110554e-01  1.78400129e-01
 -5.02005816e-02 -1.06683791e-01 -1.21191747e-01  1.14302248e-01
 -1.50113091e-01 -1.95147961e-01 -1.03774175e-01  1.55207068e-01
 -1.87137857e-01 -1.78701639e-01 -2.15686690e-02 -4.32625040e-02
 -5.95304510e-03  2.39631552e-02 -5.64083531e-02  8.15982558e-03
  5.89641072e-02  6.97425902e-02 -5.59270615e-03 -4.43193316e-02
  6.27297759e-02 -3.757