In [1]:
!pip install pydub
# !pip install librosa
# !pip install soundfile
# !pip install resemblyzer
# !apt install -y ffmpeg

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [2]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
from IPython.display import Audio, display
import numpy as np
import soundfile as sf
from tqdm import tqdm
from multiprocessing import Pool
from pydub import AudioSegment
from pydub.silence import split_on_silence
import pandas as pd
import datetime
import csv
import shutil
from sklearn.model_selection import train_test_split
import mimetypes

# Reaname files

In [None]:
real = '/content/drive/MyDrive/data/raw/real'
fake = '/content/drive/MyDrive/data/raw/fake'

def rename(directory, prefix):
  files = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]
  num_files = len(files)
  padding = len(str(num_files))

  files.sort()

  for idx, filename in enumerate(files, start=1):
    ext = os.path.splitext(filename)[1]

    #base, ext=os.path.splitext(filename)

    newName = f'{prefix}_{str(idx).zfill(padding)}{ext}'
    oldPath = os.path.join(directory, filename)
    newPath = os.path.join(directory, newName)

    os.rename(oldPath, newPath)
    #print(f"renamed:{filename} - {newName}")

    if idx % 1000==0:
      print(f'renamed{idx} files in {directory}')

print("Staring renaming real fils...")
rename(real, 'real')
print("completed!\n")

print("Staring renaming real fils...")
rename(fake, 'fake')
print("completed!\n")

Staring renaming real fils...
renamed1000 files in /content/drive/MyDrive/data/chunks/real
renamed2000 files in /content/drive/MyDrive/data/chunks/real
renamed3000 files in /content/drive/MyDrive/data/chunks/real
renamed4000 files in /content/drive/MyDrive/data/chunks/real
renamed5000 files in /content/drive/MyDrive/data/chunks/real
renamed6000 files in /content/drive/MyDrive/data/chunks/real
renamed7000 files in /content/drive/MyDrive/data/chunks/real
renamed8000 files in /content/drive/MyDrive/data/chunks/real
completed!

Staring renaming real fils...
renamed1000 files in /content/drive/MyDrive/data/chunks/fake
renamed2000 files in /content/drive/MyDrive/data/chunks/fake
renamed3000 files in /content/drive/MyDrive/data/chunks/fake
renamed4000 files in /content/drive/MyDrive/data/chunks/fake
renamed5000 files in /content/drive/MyDrive/data/chunks/fake
renamed6000 files in /content/drive/MyDrive/data/chunks/fake
renamed7000 files in /content/drive/MyDrive/data/chunks/fake
renamed8000 f

# load files and display

In [None]:
data='/content/drive/MyDrive/sample_data/processed'
real = '/content/drive/MyDrive/sample_data/processed/real'
fake = '/content/drive/MyDrive/sample_data/processed/fake'

def load_display(directory, num_example=2):
  plt.figure(figsize=(15,10))

  files= sorted(os.listdir(directory))[:num_example]

  for i, filename in enumerate(files):
    file_path = os.path.join(directory, filename)
    try:
      #loadfiles
      y, sr=librosa.load(file_path, sr=None)

      #display wave
      plt.subplot(num_example, 1, i+1)
      librosa.display.waveshow(y, sr=sr)
      plt.title(f'{os.path.basename(directory)}-{filename}')

      print(f'Play{filename}')
      display(Audio(data=y, rate=sr))
    except Exception as e:
      print(f'Error wiht{filename}: {str(e)}')
  plt.tight_layout()
  plt.show()

print('real audio samples')
load_display(real)

print('fake audio samples')
load_display(fake)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
data='/content/drive/MyDrive/sample_data/converted'
real = '/content/drive/MyDrive/sample_data/converted/real'
fake = '/content/drive/MyDrive/sample_data/converted/fake'
sample_rate = 16000
Sample_display=3

def load_visualize(folder_path, label, num_samples=1):
  files = [f for f in os.listdir(folder_path) if f.endswith(('.wav', '.mp3', '.flac'))]

  selectedFiles = np.random.choice(files, size=min(num_samples, len(files)), replace=False)

  for idx, file in enumerate(selectedFiles):
    try:
      file_path= os.path.join(folder_path, file)
      audio, sr = librosa.load(file_path, sr=sample_rate, duration=10)

      plt.figure(figsize=(15,5))

      plt.subplot(1,2,1)
      librosa.display.waveshow(audio, sr=sr)
      plt.title(f'waveform-{label}\n{file}')
      plt.xlabel("time")
      plt.ylabel('amplitude')

      plt.subplot(1,2,2)
      X=librosa.stft(audio)
      Xdb=librosa.amplitude_to_db(abs(X))
      librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')
      plt.colorbar(format='%+2.0f db')
      plt.title(f'Spectogram - {label}\n{file}')
      plt.tight_layout()
      plt.show

      print(f'playing {label} audio:{file}')
      display(Audio(audio, rate=sr))

    except Exception as e:
      print(f'error processing {file}:{str(e)}')

print('='*60)
print('visulizing real audio files')
print('='*60)
load_visualize(real, 'Real', num_samples=Sample_display)

print('='*60)
print('visulizing fake audio files')
print('='*60)
load_visualize(fake, 'Fake', num_samples=Sample_display)

Output hidden; open in https://colab.research.google.com to view.

# Convert Format

In [None]:
inputReal = '/content/drive/MyDrive/data/raw/real'
inputFake = '/content/drive/MyDrive/data/raw/fake'
outputReal = '/content/drive/MyDrive/data/converted/real'
outputFake = '/content/drive/MyDrive/data/converted/fake'

target_sr = 16000
processes = 4

def convert_file(args):
    """Fixed variable name and logic"""
    input_path, output_path = args
    try:
        if os.path.exists(output_path):
            return

        # Load and convert audio
        y, _ = librosa.load(input_path, sr=target_sr, mono=True)
        sf.write(output_path, y, target_sr, subtype='PCM_16')

    except Exception as e:
        print(f'Error processing {input_path}: {str(e)}')

def process_directory(input_dir, output_dir):
    file_pairs = []
    os.makedirs(output_dir, exist_ok=True)

    for root, _, files in os.walk(input_dir):
        for file in files:
            if file.split(".")[-1].lower() in ['mp3', 'wav', 'ogg', 'flac']:
                input_path = os.path.join(root, file)
                rel_path = os.path.relpath(input_path, input_dir)
                output_path = os.path.join(output_dir, rel_path)
                output_path = os.path.splitext(output_path)[0] + '.wav'
                os.makedirs(os.path.dirname(output_path), exist_ok=True)
                file_pairs.append((input_path, output_path))


    with Pool(processes) as pool:
        list(tqdm(pool.imap(convert_file, file_pairs),
                 total=len(file_pairs),
                 desc=f'Processing {os.path.basename(input_dir)}'))

if __name__ == '__main__':
    print('Converting real audio files...')
    process_directory(inputReal, outputReal)

    print('\nConverting fake audio files...')
    process_directory(inputFake, outputFake)

    print('\nConversion complete!')

Converting real audio files...


Processing real: 100%|██████████| 3004/3004 [02:09<00:00, 23.24it/s]



Converting fake audio files...


  y, _ = librosa.load(input_path, sr=target_sr, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing /content/drive/MyDrive/data/raw/fake/fake_3070.mp3: 


Processing fake: 100%|██████████| 3506/3506 [02:06<00:00, 27.78it/s]


Conversion complete!





# Remove Silence

In [None]:
!pip install --upgrade pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
INPUT_REAL = '/content/drive/MyDrive/data/converted/real'
INPUT_FAKE = '/content/drive/MyDrive/data/converted/fake'
OUTPUT_REAL = '/content/drive/MyDrive/data/processed/real'
OUTPUT_FAKE = '/content/drive/MyDrive/data/processed/fake'
PROCESSES = 4

"""Silence removal parameters (adjust these based on your needs)
ms (minimum duration of silence to split on)
ms (minimum duration of silence to split on)
ms (analysis window size)
ms (silence to leave at start/end of chunks)
ms (safety margin around speech segments)"""
SILENCE_THRESH = -40
MIN_SILENCE_LEN = 300
CHUNK_SIZE = 10
KEEP_SILENCE = 50
BUFFER_MS = 100

def process_file(args):
    """Process single audio file with enhanced error handling"""
    input_path, output_path = args
    try:
        # Skip existing files
        if os.path.exists(output_path):
            return

        audio = AudioSegment.from_file(input_path)

        # Split on silence with aggressive parameters
        chunks = split_on_silence(
            audio,
            silence_thresh=SILENCE_THRESH,
            min_silence_len=MIN_SILENCE_LEN,
            keep_silence=KEEP_SILENCE,
            seek_step=CHUNK_SIZE
        )

        # Merge chunks with buffer
        processed = AudioSegment.empty()
        for chunk in chunks:
            # Add buffer before and after each chunk
            processed += chunk[-BUFFER_MS:] if len(processed) > 0 else chunk
            processed += chunk
            processed += chunk[:BUFFER_MS]

        # Remove buffers from start/end
        processed = processed[BUFFER_MS:-BUFFER_MS] if len(processed) > 2*BUFFER_MS else processed

        processed.export(output_path,
                       format="wav",
                       parameters=["-ac", "1", "-ar", "16000", "-sample_fmt", "s16"]
        )

    except Exception as e:
        print(f"Error processing {input_path}: {str(e)}")

def process_directory(input_dir, output_dir):
    """Process directory with structure preservation"""
    file_pairs = []

    os.makedirs(output_dir, exist_ok=True)

    for root, _, files in os.walk(input_dir):
        for file in files:
            if file.lower().endswith(('.wav', '.mp3', '.ogg', '.flac')):
                input_path = os.path.join(root, file)
                rel_path = os.path.relpath(input_path, input_dir)
                output_path = os.path.join(output_dir, rel_path)
                output_path = os.path.splitext(output_path)[0] + ".wav"
                os.makedirs(os.path.dirname(output_path), exist_ok=True)
                file_pairs.append((input_path, output_path))

    with Pool(PROCESSES) as pool:
        list(tqdm(pool.imap(process_file, file_pairs),
                total=len(file_pairs),
                desc=f"Processing {os.path.basename(input_dir)}"))

if __name__ == "__main__":
    print("Processing real audio...")
    process_directory(INPUT_REAL, OUTPUT_REAL)

    print("\nProcessing fake audio...")
    process_directory(INPUT_FAKE, OUTPUT_FAKE)

    print("\nSilence removal complete!")

Processing real audio...


Processing real:  37%|███▋      | 1107/3004 [02:41<04:29,  7.04it/s]

Error processing /content/drive/MyDrive/data/converted/real/real_2923.wav: You should never be filling in    more than 2 ms with silence here, missing frames: 1024


Processing real:  43%|████▎     | 1293/3004 [03:08<03:31,  8.08it/s]

Error processing /content/drive/MyDrive/data/converted/real/real_0044.wav: You should never be filling in    more than 2 ms with silence here, missing frames: 1120


Processing real:  54%|█████▍    | 1622/3004 [03:54<02:47,  8.25it/s]

Error processing /content/drive/MyDrive/data/converted/real/real_0794.wav: You should never be filling in    more than 2 ms with silence here, missing frames: 1520


Processing real:  94%|█████████▎| 2810/3004 [06:44<00:25,  7.63it/s]

Error processing /content/drive/MyDrive/data/converted/real/real_0405.wav: You should never be filling in    more than 2 ms with silence here, missing frames: 1280


Processing real: 100%|██████████| 3004/3004 [07:11<00:00,  6.96it/s]



Processing fake audio...


Processing fake: 100%|██████████| 3505/3505 [08:32<00:00,  6.84it/s]



Silence removal complete!


# Segmentation

In [None]:
input_real = '/content/drive/MyDrive/data/processed/real'
input_fake = '/content/drive/MyDrive/data/processed/fake'
output_real = '/content/drive/MyDrive/data/chunks/real'
output_fake = '/content/drive/MyDrive/data/chunks/fake'
processes = 4

"""Segmentation parameters
Seconds per chunk
Overlap between chunks (seconds)
Must match your audio files
Minimum chunk length to keep (seconds)
Pad short files with silence"""
chunk_length = 2.0
overlap = 0.5
sample_rate = 16000
min_chunk_length = 1.0
pad_short = True

def segment_audio(args):
    """Segment a single audio file into chunks"""
    input_path, output_dir = args
    try:

        y, sr = librosa.load(input_path, sr=sample_rate)


        samples_per_chunk = int(chunk_length * sr)
        samples_overlap = int(overlap * sr)
        step_size = samples_per_chunk - samples_overlap


        chunks = []
        start = 0
        while start + samples_per_chunk <= len(y):
            chunks.append(y[start:start+samples_per_chunk])
            start += step_size

        # Handle remaining audio
        remaining = len(y) - start
        if remaining > 0:
            if pad_short and remaining < samples_per_chunk:
                pad_size = samples_per_chunk - remaining
                last_chunk = np.pad(y[start:], (0, pad_size), mode='constant')
                chunks.append(last_chunk)
            elif remaining >= min_chunk_length * sr:
                chunks.append(y[start:])

        base_name = os.path.splitext(os.path.basename(input_path))[0]
        for i, chunk in enumerate(chunks):
            # Skip chunks below minimum length
            if len(chunk)/sr < min_chunk_length:
                continue

            output_path = os.path.join(output_dir, f"{base_name}_{i:04d}.wav")
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            sf.write(output_path, chunk, sr, subtype='PCM_16')

        return len(chunks)

    except Exception as e:
        print(f"Error processing {input_path}: {str(e)}")
        return 0

def process_directory(input_dir, output_dir):
    """Process all files in directory with batch processing"""
    file_pairs = []

    os.makedirs(output_dir, exist_ok=True)

    for root, _, files in os.walk(input_dir):
        for file in files:
            if file.endswith('.wav'):
                input_path = os.path.join(root, file)
                file_pairs.append((input_path, output_dir))

    num_batches = processes * 4
    batch_size = max(1, len(file_pairs) // num_batches)
    batches = [file_pairs[i:i+batch_size] for i in range(0, len(file_pairs), batch_size)]

    total_chunks = 0
    with Pool(processes) as pool:
        for batch in tqdm(
            batches,
            desc=f'Processing {os.path.basename(input_dir)} batches',
            total=len(batches)
        ):
            results = list(tqdm(
                pool.imap(segment_audio, batch),
                total=len(batch),
                desc='  Current batch',
                leave=False
            ))
            total_chunks += sum(results)

    print(f"Created {total_chunks} chunks from {len(file_pairs)} files")


if __name__ == '__main__':
    print("Segmenting real audio files...")
    process_directory(input_real, output_real)

    print("\nSegmenting fake audio files...")
    process_directory(input_fake, output_fake)

    print("\nSegmentation complete!")

Segmenting real audio files...


Processing real batches:   0%|          | 0/17 [00:00<?, ?it/s]
  Current batch:   0%|          | 0/187 [00:00<?, ?it/s][A
  Current batch:   1%|          | 1/187 [00:00<01:24,  2.21it/s][A
  Current batch:   3%|▎         | 6/187 [00:00<00:16, 10.94it/s][A
  Current batch:   4%|▍         | 8/187 [00:01<00:22,  7.98it/s][A
  Current batch:   8%|▊         | 15/187 [00:01<00:12, 13.94it/s][A
  Current batch:  10%|▉         | 18/187 [00:01<00:11, 14.84it/s][A
  Current batch:  11%|█         | 20/187 [00:01<00:12, 13.80it/s][A
  Current batch:  13%|█▎        | 24/187 [00:01<00:11, 14.15it/s][A
  Current batch:  13%|█▎        | 24/187 [00:18<00:11, 14.15it/s][A
  Current batch:  16%|█▌        | 29/187 [00:44<09:05,  3.45s/it][A
  Current batch:  18%|█▊        | 34/187 [00:44<05:38,  2.21s/it][A
  Current batch:  19%|█▉        | 36/187 [00:44<04:38,  1.84s/it][A
  Current batch:  20%|██        | 38/187 [00:45<03:42,  1.50s/it][A
  Current batch:  22%|██▏       | 41/187 [00:45<02:

Created 17760 chunks from 3000 files

Segmenting fake audio files...



Processing fake batches:   0%|          | 0/17 [00:00<?, ?it/s]
  Current batch:   0%|          | 0/219 [00:00<?, ?it/s][A
  Current batch:   0%|          | 1/219 [00:00<00:33,  6.52it/s][A
  Current batch:   2%|▏         | 5/219 [00:00<00:11, 18.32it/s][A
  Current batch:   4%|▎         | 8/219 [00:00<00:10, 19.61it/s][A
  Current batch:   5%|▌         | 11/219 [00:00<00:10, 19.96it/s][A
  Current batch:   6%|▋         | 14/219 [00:00<00:11, 17.09it/s][A
  Current batch:   9%|▊         | 19/219 [00:00<00:08, 22.70it/s][A
  Current batch:  11%|█         | 24/219 [00:01<00:08, 23.94it/s][A
  Current batch:  12%|█▏        | 27/219 [00:01<00:08, 21.54it/s][A
  Current batch:  14%|█▎        | 30/219 [00:01<00:08, 22.05it/s][A
  Current batch:  15%|█▌        | 33/219 [00:02<00:17, 10.61it/s][A
  Current batch:  15%|█▌        | 33/219 [00:20<00:17, 10.61it/s][A
  Current batch:  16%|█▌        | 34/219 [00:56<19:50,  6.43s/it][A
  Current batch:  16%|█▋        | 36/219 [00:56<14

Created 8995 chunks from 3505 files

Segmentation complete!





# CSV

In [None]:
Real ='/content/drive/MyDrive/data/chunks/real'
Fake ='/content/drive/MyDrive/data/chunks/fake'
Metadata_CSV = '/content/drive/MyDrive/data/metadata.csv'
processes = 4
sample_rate=16000

csv_columns=[
    'filename', 'file_path', 'label', 'duration', 'sample_rate',
    'num_channels', 'bit_depth', 'format', 'spectral_centroid',
    'zero_crossing_rate', 'rms_energy', 'timestamp', 'notes'
]

def get_properties(file_path):
  try:
    y, sr = librosa.load(file_path, sr=sample_rate, mono=False, duration=10)

    duration=librosa.get_duration(y=y, sr=sr)
    num_channels= 1 if y.ndim == 1 else y.shape[0]

    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr).mean()
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y).mean()
    rms_energy = librosa.feature.rms(y=y).mean()

    return {
        'duration':duration,
        'sample_rate':sr,
        'num_channels':num_channels,
        'spectral_centroid':spectral_centroid,
        'zero_crossing_rate':zero_crossing_rate,
        'rms_energy':rms_energy,
        'notes':'success'
    }
  except Exception as e:
    return{'notes':f'errorL{str(e)}'}

import mimetypes

def process_file(args):
  file_path, label = args
  try:
    metadata = {
        'filename': os.path.basename(file_path),
        'file_path': file_path,
        'label': label,
        'timestamp': datetime.datetime.now().isoformat(),
    }

    audio_props = get_properties(file_path)
    metadata.update(audio_props)

    mime_type, encoding = mimetypes.guess_type(file_path)
    if mime_type:
        format = mime_type.split('/')[1]
    else:
        format = 'unknown'

    metadata['format'] = format

    with open(file_path, 'rb') as f:
      f.seek(34, 0)
      metadata['bit_depth'] = int.from_bytes(f.read(2), byteorder="little")

    return metadata

  except Exception as e:
    return {
        'filename': os.path.basename(file_path),
        'file_path': file_path,
        'label': label,
        'notes': f'critical error: {str(e)}'
    }
def generate_metadata():
  files = []
  for label, directory in [('0', Real), ('1', Fake)]:
    for filename in os.listdir(directory):
      files.append((os.path.join(directory, filename), label))

  with open(Metadata_CSV, 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=csv_columns)
    writer.writeheader()

  with Pool(processes) as pool:
    for batch in tqdm(
        [files[i::processes*4] for i in range(processes*4)],
        desc='processing batches',
        total=processes*4
    ):
        results = list(tqdm(
            pool.imap(process_file, batch),
            total=len(batch),
            desc='current batch',
            leave =False
        ))

        with open(Metadata_CSV, 'a', newline='') as f :
          writer = csv.DictWriter(f, fieldnames=csv_columns)
          writer.writerows(results)

  print(f'Metadata genration complete. saved to {Metadata_CSV}')

if __name__== '__main__':
    generate_metadata()

processing batches:   0%|          | 0/16 [00:00<?, ?it/s]
current batch:   0%|          | 0/1125 [00:00<?, ?it/s][A
current batch:   0%|          | 1/1125 [00:00<06:41,  2.80it/s][A
current batch:   2%|▏         | 20/1125 [00:00<00:20, 53.78it/s][A
current batch:   3%|▎         | 30/1125 [00:00<00:16, 65.41it/s][A
current batch:   4%|▎         | 40/1125 [00:00<00:14, 74.96it/s][A
current batch:   5%|▍         | 53/1125 [00:00<00:12, 87.81it/s][A
current batch:   6%|▌         | 65/1125 [00:00<00:13, 78.09it/s][A
current batch:   7%|▋         | 75/1125 [00:01<00:14, 73.77it/s][A
current batch:   8%|▊         | 86/1125 [00:01<00:12, 80.29it/s][A
current batch:   9%|▊         | 96/1125 [00:01<00:12, 83.25it/s][A
current batch:  10%|▉         | 108/1125 [00:01<00:11, 89.36it/s][A
current batch:  10%|█         | 118/1125 [00:01<00:11, 90.96it/s][A
current batch:  11%|█▏        | 129/1125 [00:01<00:10, 94.32it/s][A
current batch:  12%|█▏        | 139/1125 [00:01<00:11, 89.12it/s

Metadata genration complete. saved to /content/drive/MyDrive/data/metadata.csv





In [None]:
df = pd.read_csv('/content/drive/MyDrive/data/metadata_split.csv')
print(df)
print(df.describe())
print(df['label'].value_counts())
print(df['split'].value_counts())
print(df['notes'].value_counts())

            filename                                          file_path  \
0      real_7573.wav  /content/drive/MyDrive/data/split/train/real/r...   
1      real_7596.wav  /content/drive/MyDrive/data/split/train/real/r...   
2      real_7621.wav  /content/drive/MyDrive/data/split/train/real/r...   
3      real_7643.wav  /content/drive/MyDrive/data/split/train/real/r...   
4      real_7669.wav  /content/drive/MyDrive/data/split/train/real/r...   
...              ...                                                ...   
17985  fake_1974.wav  /content/drive/MyDrive/data/split/val/fake/fak...   
17986  fake_2050.wav  /content/drive/MyDrive/data/split/val/fake/fak...   
17987  fake_2173.wav  /content/drive/MyDrive/data/split/val/fake/fak...   
17988  fake_2344.wav  /content/drive/MyDrive/data/split/val/fake/fak...   
17989  fake_2456.wav  /content/drive/MyDrive/data/split/val/fake/fak...   

       label  split format  duration  sample_rate  num_channels  bit_depth  \
0          0  train  

# Split train test validate

In [None]:
data_dir = '/content/drive/MyDrive/data/chunks'
real_dir = os.path.join(data_dir, 'real')
fake_dir = os.path.join(data_dir, 'fake')

train_dir = '/content/drive/MyDrive/data/split/train'
test_dir = '/content/drive/MyDrive/data/split/test'
val_dir = '/content/drive/MyDrive/data/split/val'

for split_dir in [train_dir, test_dir, val_dir]:
    os.makedirs(split_dir, exist_ok=True)
    os.makedirs(os.path.join(split_dir, 'real'), exist_ok=True)
    os.makedirs(os.path.join(split_dir, 'fake'), exist_ok=True)

def split_data(category_dir, category_name):
    files = [f for f in os.listdir(category_dir) if os.path.isfile(os.path.join(category_dir, f))]

    train_files, temp_files = train_test_split(files, test_size=0.3, random_state=42)  # 70% train, 30% for test+validation
    val_files, test_files = train_test_split(temp_files, test_size=0.5, random_state=42)  # Split the 30% into 15% test, 15% validation


    for file in train_files:
        shutil.move(os.path.join(category_dir, file), os.path.join(train_dir, category_name, file))
    for file in test_files:
        shutil.move(os.path.join(category_dir, file), os.path.join(test_dir, category_name, file))
    for file in val_files:
        shutil.move(os.path.join(category_dir, file), os.path.join(val_dir, category_name, file))

split_data(real_dir, 'real')
split_data(fake_dir, 'fake')

print("Data split completed!")


Data split completed!


# csv file of splited data

In [None]:
DATASET_DIR = '/content/drive/MyDrive/data/split'
Metadata_CSV = '/content/drive/MyDrive/data/metadata_split.csv'
processes = 4
sample_rate = 16000

csv_columns = [
    'filename', 'file_path', 'label', 'split', 'format', 'duration', 'sample_rate',
    'num_channels', 'bit_depth', 'timestamp', 'notes'
]

def get_properties(file_path):
    try:
        y, sr = librosa.load(file_path, sr=sample_rate, mono=False, duration=10)
        duration = librosa.get_duration(y=y, sr=sr)
        num_channels = 1 if y.ndim == 1 else y.shape[0]

        return {
            'duration': duration,
            'sample_rate': sr,
            'num_channels': num_channels,
            'notes': 'success'
        }
    except Exception as e:
        return {'notes': f'error: {str(e)}'}

def process_file(args):
    file_path, label, split = args
    try:
        metadata = {
            'filename': os.path.basename(file_path),
            'file_path': file_path,
            'label': label,
            'timestamp': datetime.datetime.now().isoformat(),
            'split': split
        }

        audio_props = get_properties(file_path)
        metadata.update(audio_props)

        mime_type, _ = mimetypes.guess_type(file_path)
        metadata['format'] = mime_type.split('/')[1] if mime_type else 'unknown'

        with open(file_path, 'rb') as f:
            f.seek(34, 0)
            metadata['bit_depth'] = int.from_bytes(f.read(2), byteorder="little")

        return metadata
    except Exception as e:
        return {
            'filename': os.path.basename(file_path),
            'file_path': file_path,
            'label': label,
            'notes': f'critical error: {str(e)}',
            'split': split
        }

def generate_metadata():
    files = []
    splits = ['train', 'test', 'val']

    for split in splits:
        for label, folder_name in [('0', 'real'), ('1', 'fake')]:
            directory = os.path.join(DATASET_DIR, split, folder_name)
            if os.path.exists(directory):
                for filename in os.listdir(directory):
                    file_path = os.path.join(directory, filename)
                    files.append((file_path, label, split))

    with open(Metadata_CSV, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=csv_columns)
        writer.writeheader()

    with Pool(processes) as pool:
        for batch in tqdm(
            [files[i::processes * 4] for i in range(processes * 4)],
            desc='Processing batches',
            total=processes * 4
        ):
            results = list(tqdm(
                pool.imap(process_file, batch),
                total=len(batch),
                desc='Current batch',
                leave=False
            ))

            with open(Metadata_CSV, 'a', newline='') as f:
                writer = csv.DictWriter(f, fieldnames=csv_columns)
                writer.writerows(results)

    print(f'Metadata generation complete. Saved to {Metadata_CSV}')

if __name__ == '__main__':
    generate_metadata()


Processing batches:   0%|          | 0/16 [00:00<?, ?it/s]
Current batch:   0%|          | 0/1125 [00:00<?, ?it/s][A
Current batch:   0%|          | 1/1125 [00:03<1:06:22,  3.54s/it][A
Current batch:   0%|          | 2/1125 [00:04<33:03,  1.77s/it]  [A
Current batch:   5%|▌         | 59/1125 [00:04<00:40, 26.34it/s][A
Current batch:   7%|▋         | 84/1125 [00:04<00:26, 39.22it/s][A
Current batch:  10%|▉         | 107/1125 [00:04<00:19, 52.79it/s][A
Current batch:  11%|█▏        | 128/1125 [00:04<00:15, 66.01it/s][A
Current batch:  13%|█▎        | 147/1125 [00:04<00:12, 80.33it/s][A
Current batch:  15%|█▍        | 166/1125 [00:04<00:10, 89.99it/s][A
Current batch:  16%|█▋        | 183/1125 [00:04<00:09, 102.18it/s][A
Current batch:  18%|█▊        | 200/1125 [00:05<00:08, 114.43it/s][A
Current batch:  19%|█▉        | 217/1125 [00:05<00:07, 123.76it/s][A
Current batch:  21%|██        | 236/1125 [00:05<00:06, 134.13it/s][A
Current batch:  22%|██▏       | 253/1125 [00:05<00:0

Metadata generation complete. Saved to /content/drive/MyDrive/data/metadata_split.csv





# raw

In [4]:
#rename
real = '/content/drive/MyDrive/project/raw/real'
fake = '/content/drive/MyDrive/project/raw/fake'

def rename(directory, prefix):
  files = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]
  num_files = len(files)
  padding = len(str(num_files))

  files.sort()

  for idx, filename in enumerate(files, start=1):
    ext = os.path.splitext(filename)[1]

    #base, ext=os.path.splitext(filename)

    newName = f'{prefix}_{str(idx).zfill(padding)}{ext}'
    oldPath = os.path.join(directory, filename)
    newPath = os.path.join(directory, newName)

    os.rename(oldPath, newPath)
    #print(f"renamed:{filename} - {newName}")

    if idx % 1000==0:
      print(f'renamed{idx} files in {directory}')

print("Staring renaming real fils...")
rename(real, 'real_audiofile')
print("completed!\n")

print("Staring renaming real fils...")
rename(fake, 'fake_auidofile')
print("completed!\n")

Staring renaming real fils...
completed!

Staring renaming real fils...
completed!



In [5]:
#split data
data_dir = '/content/drive/MyDrive/project/raw'
real_dir = os.path.join(data_dir, 'real')
fake_dir = os.path.join(data_dir, 'fake')

train_dir = '/content/drive/MyDrive/project/split_data/train'
test_dir = '/content/drive/MyDrive/project/split_data/test'
val_dir = '/content/drive/MyDrive/project/split_data/val'

for split_dir in [train_dir, test_dir, val_dir]:
    os.makedirs(split_dir, exist_ok=True)
    os.makedirs(os.path.join(split_dir, 'real'), exist_ok=True)
    os.makedirs(os.path.join(split_dir, 'fake'), exist_ok=True)

def split_data(category_dir, category_name):
    files = [f for f in os.listdir(category_dir) if os.path.isfile(os.path.join(category_dir, f))]

    train_files, temp_files = train_test_split(files, test_size=0.2, random_state=42)  # 70% train, 30% for test+validation
    val_files, test_files = train_test_split(temp_files, test_size=0.5, random_state=42)  # Split the 30% into 15% test, 15% validation


    for file in train_files:
        shutil.move(os.path.join(category_dir, file), os.path.join(train_dir, category_name, file))
    for file in test_files:
        shutil.move(os.path.join(category_dir, file), os.path.join(test_dir, category_name, file))
    for file in val_files:
        shutil.move(os.path.join(category_dir, file), os.path.join(val_dir, category_name, file))

split_data(real_dir, 'real')
split_data(fake_dir, 'fake')

print("Data split completed!")


Data split completed!


In [9]:
DATASET_DIR = '/content/drive/MyDrive/project/split'
Metadata_CSV = '/content/drive/MyDrive/project/metadata.csv'
processes = 4
sample_rate = 16000

csv_columns = [
    'filename', 'file_path', 'label', 'split', 'format', 'duration', 'sample_rate',
    'num_channels', 'bit_depth', 'timestamp', 'notes'
]

def get_properties(file_path):
    try:
        y, sr = librosa.load(file_path, sr=sample_rate, mono=False, duration=10)
        duration = librosa.get_duration(y=y, sr=sr)
        num_channels = 1 if y.ndim == 1 else y.shape[0]

        return {
            'duration': duration,
            'sample_rate': sr,
            'num_channels': num_channels,
            'notes': 'success'
        }
    except Exception as e:
        return {'notes': f'error: {str(e)}'}

def process_file(args):
    file_path, label, split = args
    try:
        metadata = {
            'filename': os.path.basename(file_path),
            'file_path': file_path,
            'label': label,
            'timestamp': datetime.datetime.now().isoformat(),
            'split': split
        }

        audio_props = get_properties(file_path)
        metadata.update(audio_props)

        mime_type, _ = mimetypes.guess_type(file_path)
        metadata['format'] = mime_type.split('/')[1] if mime_type else 'unknown'

        with open(file_path, 'rb') as f:
            f.seek(34, 0)
            metadata['bit_depth'] = int.from_bytes(f.read(2), byteorder="little")

        return metadata
    except Exception as e:
        return {
            'filename': os.path.basename(file_path),
            'file_path': file_path,
            'label': label,
            'notes': f'critical error: {str(e)}',
            'split': split
        }

def generate_metadata():
    files = []
    splits = ['train', 'test', 'val']

    for split in splits:
        for label, folder_name in [('0', 'real'), ('1', 'fake')]:
            directory = os.path.join(DATASET_DIR, split, folder_name)
            if os.path.exists(directory):
                for filename in os.listdir(directory):
                    file_path = os.path.join(directory, filename)
                    files.append((file_path, label, split))

    with open(Metadata_CSV, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=csv_columns)
        writer.writeheader()

    with Pool(processes) as pool:
        for batch in tqdm(
            [files[i::processes * 4] for i in range(processes * 4)],
            desc='Processing batches',
            total=processes * 4
        ):
            results = list(tqdm(
                pool.imap(process_file, batch),
                total=len(batch),
                desc='Current batch',
                leave=False
            ))

            with open(Metadata_CSV, 'a', newline='') as f:
                writer = csv.DictWriter(f, fieldnames=csv_columns)
                writer.writerows(results)

    print(f'Metadata generation complete. Saved to {Metadata_CSV}')

if __name__ == '__main__':
    generate_metadata()


Processing batches:   0%|          | 0/16 [00:00<?, ?it/s]
Current batch:   0%|          | 0/100 [00:00<?, ?it/s][A
Current batch:   1%|          | 1/100 [00:19<32:13, 19.53s/it][A
Current batch:   4%|▍         | 4/100 [00:21<06:46,  4.23s/it][A
Current batch:   5%|▌         | 5/100 [00:30<08:54,  5.62s/it][A
Current batch:  20%|██        | 20/100 [00:30<01:05,  1.23it/s][A
Current batch:  37%|███▋      | 37/100 [00:30<00:21,  2.90it/s][A
Current batch:  47%|████▋     | 47/100 [00:31<00:13,  3.89it/s][A
Current batch:  54%|█████▍    | 54/100 [00:32<00:10,  4.60it/s][A
Current batch:  59%|█████▉    | 59/100 [00:32<00:07,  5.36it/s][A
Current batch:  63%|██████▎   | 63/100 [00:33<00:06,  6.04it/s][A
Current batch:  66%|██████▌   | 66/100 [00:33<00:05,  6.15it/s][A
Current batch:  69%|██████▉   | 69/100 [00:33<00:04,  6.81it/s][A
Current batch:  72%|███████▏  | 72/100 [00:34<00:03,  7.17it/s][A
Current batch:  75%|███████▌  | 75/100 [00:34<00:02,  8.70it/s][A
Current batch: 

Metadata generation complete. Saved to /content/drive/MyDrive/project/metadata.csv





In [8]:
df = pd.read_csv('/content/drive/MyDrive/project/metadata_file.csv')
print(df)
print(df.describe())
print(df['label'].value_counts())
print(df['split'].value_counts())
print(df['notes'].value_counts())

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/project/metadata_file.csv'