# Cleaning

This workbook is dedicated to the cleaning completed on the files within our dataset.

Since we're working with audio files, much of the cleaning that will be done to the audio itself will be done within the Librosa library calls in the next section. This workbook provides the cleaning that was necessary for the filepaths, to make programmatic access and labelling of each file possible.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import pickle
import time
import os
import json

from sklearn.decomposition import PCA,TruncatedSVD
from tqdm import tqdm


import librosa
import librosa.display
from scipy.stats import skew
from PIL import Image

### Moving Files

Some of the files are deeply nested in sub-sub-sub folders. We'll programmatically walk through our dataset directory and pull all of the files into a general folder we can more easily access. In this, we'll also pull the metadata from each subfolder.

In [None]:
# This cell flattens the nested dataset tree: every file found anywhere under
# `folder_path` is copied into the single folder `new_folder_path`.
import datetime
import shutil

# Set the folder path
folder_path = '/Users/ryan/GA/Projects/Capstone/datasets/Audio/'

# Define the new folder path
new_folder_path = '/Users/ryan/GA/Projects/Capstone/datasets/AllFiles/'

# Create the destination folder if it is not there yet
os.makedirs(new_folder_path, exist_ok=True)

# Iterate over all files in the original folder and its subfolders
for root, dirs, files in os.walk(folder_path):
    for filename in files:
        # Get the full path of the source file
        source_file = os.path.join(root, filename)
        # Generate a unique file name by appending a microsecond timestamp, so
        # files that share a name in different subfolders do not overwrite
        # each other once they land in the same flat folder
        stem, extension = os.path.splitext(filename)
        timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')
        unique_filename = f'{stem}_{timestamp}{extension}'
        destination_file = os.path.join(new_folder_path, unique_filename)
        # Check if the file already exists in the new folder
        if os.path.exists(destination_file):
            print(f'File already exists: {unique_filename}')
            continue
        # Copy the file to the destination folder with the unique file name
        shutil.copy2(source_file, destination_file)
        print(f'Copied file: {filename} to {destination_file}')

### Duplicating Spectrogram Images

We'll need to randomly sample the audio data, so let's compile all the spectrograms from each class into their own folders, labelled 'Human' and 'Synthetic'.

In [None]:
import os
import shutil

src_dir = "datasets/Spectrograms/fake/"
dst_dir = "datasets/Spectrograms/SYNTHETIC/"

# Make sure the flat destination folder is present before anything is copied
if not os.path.exists(dst_dir):
    os.makedirs(dst_dir)

# Flatten the tree: each file found at any depth under src_dir is copied
# directly into dst_dir under its own base name. Two files that share a base
# name end up at the same destination path, so the later copy replaces the
# earlier one.
for folder, _subfolders, names in os.walk(src_dir):
    for name in names:
        shutil.copy2(os.path.join(folder, name), os.path.join(dst_dir, name))

### Removing Duplicates

Since we're working with many files, I'd like to remove duplicates by comparing the hashes of each file. Many of the files in our dataset have the same filename, which is the case when a dataset contains both real and synthetic audio for the same utterance.

To avoid removing a file of this nature, we'll need to compare the hashes for all the files and remove only duplicate audio.

In [None]:
import os
import hashlib

folder_path = '/Users/ryan/GA/Projects/Capstone/datasets/Audio/'


def _md5_of_file(path, chunk_size=1 << 20):
    """Return the hex MD5 digest of the file at *path*, read in chunks."""
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        # Stream the file so a large recording never has to fit in memory
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def remove_duplicate_files(root_dir):
    """Delete byte-identical duplicate files under *root_dir*.

    Files are compared by MD5 of their content, never by name, so two
    different recordings that share a filename are both kept. Within each
    directory the alphabetically first copy is kept and later copies are
    deleted.

    NOTE(review): duplicates are only detected among files in the SAME
    directory (the set of seen hashes restarts per directory, as in the
    original cell). If copies spread across different subfolders should also
    be removed, hoist `seen_hashes` above the `os.walk` loop - confirm intent.

    Returns the list of removed file paths.
    """
    removed = []
    # loop over all subdirectories in root_dir
    for subdir, _, files in os.walk(root_dir):
        # hashes already seen in this directory; a set gives O(1) lookups
        seen_hashes = set()

        # sorted() makes the choice of which copy survives reproducible
        for file in sorted(files):
            file_path = os.path.join(subdir, file)
            file_hash = _md5_of_file(file_path)

            if file_hash in seen_hashes:
                # same content already kept in this directory: delete this copy
                os.remove(file_path)
                removed.append(file_path)
                print(f'Removed duplicate file: {file_path}')
            else:
                seen_hashes.add(file_hash)
    return removed


removed_duplicates = remove_duplicate_files(folder_path)

### Removing Empty Audio Files

It's possible that an audio file in our dataset is empty, especially given we are working with unvalidated crowdsourced data from Mozilla. We'll iterate through the dataset and remove any files that are empty, using soundfile to load each file and check whether it contains any audio samples.

In [None]:
top_dir = 'datasets/Audio/'

# soundfile is used to decode each candidate file and inspect its samples
import soundfile as sf

audio_extensions = ['.wav', '.aiff', '.aif', '.flac', '.mp3']

# Visit every file at any depth under top_dir
for folder, _subfolders, names in os.walk(top_dir):
    for name in names:
        # Skip anything whose (case-insensitive) extension is not a known audio type
        suffix = os.path.splitext(name)[1].lower()
        if suffix not in audio_extensions:
            continue

        file_path = os.path.join(folder, name)
        try:
            samples, _rate = sf.read(file_path, always_2d=True)
            if samples.size == 0:
                # A decoded signal with zero samples counts as empty: delete it
                os.remove(file_path)
                print(f'Removed empty audio file: {file_path}')
        except Exception as e:
            # Best effort: report the problem file and carry on with the rest
            print(f'Error loading audio file {file_path}: {e}')

### Converting Mozilla to .wav

The Mozilla dataset contains files that are of type .mp3, which will cause problems when we call the Librosa Library in the next section. We'll use Pydub to iterate through the Mozilla data and convert each file to .wav.

In [None]:
from pydub import AudioSegment
import os

# Set the input and output directories
input_dir = 'datasets/Audio/Mozilla_delta/'
output_dir = 'datasets/Audio/Mozilla_delta(wav)//'

# Create the output directory up front; without it the export below would
# fail on a fresh checkout because the target folder does not exist
os.makedirs(output_dir, exist_ok=True)

# Loop through all .mp3 files in the input directory (top level only)
for filename in os.listdir(input_dir):
    if not filename.endswith('.mp3'):
        continue

    # Load the audio file using pydub
    audio = AudioSegment.from_file(os.path.join(input_dir, filename), format='mp3')

    # Same base name, with the '.mp3' extension swapped for '.wav'
    output_filename = os.path.splitext(filename)[0] + '.wav'
    output_path = os.path.join(output_dir, output_filename)

    # Export the audio file in .wav format
    audio.export(output_path, format='wav')

### Image Compression

To reduce the size of our images, we'll use Pillow (the PIL fork) to optimize them. By setting quality to 85% and optimize = True, we shouldn't lose a considerable amount of data and we should be able to improve our model run time significantly.

In [None]:
from PIL import Image

In [None]:
directory = "datasets/Spectrograms/HUMAN/"

# Compression quality
# NOTE(review): `quality` is a JPEG-style option; Pillow's PNG writer documents
# `optimize` / `compress_level` but no `quality`, so this value likely has no
# effect on the .png files saved below - confirm against the Pillow docs.
quality = 85

# List the directory once so the progress-bar total and the loop agree
filenames = os.listdir(directory)
num_files = len(filenames)

# Iterate through the files with a progress bar, re-saving each PNG in place
for filename in tqdm(filenames, total=num_files):
    if filename.endswith(".png"):
        image_path = os.path.join(directory, filename)
        # The context manager closes the image's file handle after saving,
        # so handles do not pile up over a large folder
        with Image.open(image_path) as img:
            img.save(image_path, optimize=True, quality=quality)