In [1]:
import os
import fnmatch
import soundfile as sf
from pydub import AudioSegment
import librosa
import numpy as np 
import os
import fnmatch
import numpy as np
import librosa
import torch # import PyTorch
import torch
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [3]:
import os
import torch
import fnmatch
import librosa
import numpy as np
from pydub import AudioSegment

def convert_m4a_to_wav(file_path, output_path):
    audio = AudioSegment.from_file(file_path, format="m4a")
    audio.export(output_path, format="wav")

def extract_melscale_features(file_name):
    y, sr = librosa.load(file_name)
    mels = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    mels_db = librosa.power_to_db(mels, ref=np.max)
    return mels_db.flatten() # Flatten the feature array

def extract_melspectrogram(file_name):
    y, sr = librosa.load(file_name)
    spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    return spectrogram.flatten() # Flatten the spectrogram

def extract_audio_features(file_name):
    y, sr = librosa.load(file_name)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y)[0]
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    # Change here: Find max pitch for each frame and then take their mean as overall pitch
    pitch = np.mean([pitch[magnitude.argmax()] for pitch, magnitude in zip(pitches, magnitudes)])
    fft = np.abs(np.fft.fft(y))[:len(y)//2]

    # Combine features
    combined_features = np.hstack([mfccs.mean(axis=1), spectral_centroids.mean(), spectral_rolloff.mean(), zero_crossing_rate.mean(), pitch, fft.mean()])

    return combined_features

In [4]:
def extract_mfcc(file_name):
    # Convert m4a file to wav            
    audio = AudioSegment.from_file(file_name)
    audio.export("temp.wav", format="wav")
    data, samplerate = sf.read("temp.wav")
    
    mfccs = librosa.feature.mfcc(y=data, sr=samplerate, n_mfcc=40)
    mfccs_processed = np.mean(mfccs.T,axis=0)
    
    return mfccs_processed

In [7]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

directory = r'F:\dataset_sri'

def save_melscale_features(file_paths, output_directory):
    for file in file_paths:
        # Convert m4a file to wav
        wav_file_path = file.replace('.m4a', '.wav')
        convert_m4a_to_wav(file, wav_file_path)

        # Extract melscale features
        melscale_features = extract_melscale_features(wav_file_path)

        # Save as .pt file
        output_file = os.path.join(output_directory, os.path.basename(file).replace('.m4a', '.pt'))
        torch.save(torch.tensor(melscale_features), output_file)

def save_melspectrogram_images(file_paths, output_directory):
    for file in file_paths:
        # Convert m4a file to wav
        wav_file_path = file.replace('.m4a', '.wav')
        convert_m4a_to_wav(file, wav_file_path)

        # Extract melspectrogram
        melspectrogram = extract_melspectrogram(wav_file_path)

        # Save as image
        plt.figure(figsize=(10, 4))
        librosa.display.specshow(librosa.power_to_db(melspectrogram, ref=np.max), y_axis='mel', fmax=8000, x_axis='time')
        plt.colorbar(format='%+2.0f dB')
        plt.title('Mel spectrogram')
        plt.tight_layout()
        plt.savefig(os.path.join(output_directory, os.path.basename(file).replace('.m4a', '.png')), dpi=300)
        plt.close()

def save_combined_mfcc_melscale_features(file_paths, output_directory, n_components=40):
    all_features = []
    for file in file_paths:
        # Convert m4a file to wav
        wav_file_path = file.replace('.m4a', '.wav')
        convert_m4a_to_wav(file, wav_file_path)

        # Extract mfccs and melscale features
        mfccs = extract_mfcc(wav_file_path)
        melscale_features = extract_melscale_features(wav_file_path)
        combined_features = np.concatenate([mfccs, melscale_features])

        all_features.append(combined_features)

    # Use PCA to reduce dimensionality
    pca = PCA(n_components=n_components)
    all_features = pca.fit_transform(np.vstack(all_features))

    for i, file in enumerate(file_paths):
        # Save as .pt file
        output_file = os.path.join(output_directory, os.path.basename(file).replace('.m4a', '.pt'))
        torch.save(torch.tensor(all_features[i]), output_file)



    
def save_spectral_features(file_paths, output_directory):
    for file in file_paths:
        # Convert m4a file to wav
        wav_file_path = file.replace('.m4a', '.wav')
        convert_m4a_to_wav(file, wav_file_path)

        # Extract spectral features
        y, sr = librosa.load(wav_file_path)
        spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
        spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
        zero_crossing_rate = librosa.feature.zero_crossing_rate(y)[0]
        pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
        pitch = np.mean([pitch[magnitude.argmax()] for pitch, magnitude in zip(pitches, magnitudes)])
        fft = np.abs(np.fft.fft(y))[:len(y)//2]
        combined_features = np.concatenate([spectral_centroids, spectral_rolloff, zero_crossing_rate, pitch, fft])

        # Save as .pt file
        output_file = os.path.join(output_directory, os.path.basename(file).replace('.m4a', '.pt'))
        torch.save(torch.tensor(combined_features), output_file)


wav_files = [os.path.join(root, file)
             for root, dirs, files in os.walk(directory)
             for file in fnmatch.filter(files, '*.m4a')]

# Create directories for each type of feature
os.makedirs(r'F:/melscale_features', exist_ok=True)
os.makedirs(r'F:/melspectrogram_images', exist_ok=True)
os.makedirs(r'F:/combined_mfcc_melscale_features', exist_ok=True)
os.makedirs(r'F:/spectral_features', exist_ok=True)

# Save each type of feature
save_melscale_features(wav_files, r'F:\melscale_features')
save_melspectrogram_images(wav_files, r'F:\melspectrogram_images')
save_combined_mfcc_melscale_features(wav_files, r'F:\combined_mfcc_melscale_features')
save_spectral_features(wav_files, r'F:\spectral_features')


IndexError: tuple index out of range

<Figure size 1000x400 with 0 Axes>

In [None]:
import os
import matplotlib.pyplot as plt
import librosa
import librosa.display
import fnmatch
import os
import torch
import fnmatch
import librosa
import numpy as np
from pydub import AudioSegment


def convert_m4a_to_wav(file_path, output_path):
    audio = AudioSegment.from_file(file_path, format="m4a")
    audio.export(output_path, format="wav")

def extract_melspectrogram(file_path, n_fft=2048, hop_length=512, n_mels=128):
    y, sr = librosa.load(file_path)
    melspectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    return melspectrogram

def save_melspectrogram_images(file_paths, output_directory):
    for file in file_paths:
        # Convert m4a file to wav
        wav_file_path = file.replace('.m4a', '.wav')
        convert_m4a_to_wav(file, wav_file_path)

        # Extract melspectrogram
        melspectrogram = extract_melspectrogram(wav_file_path)

        # Save as image
        plt.figure(figsize=(10, 4))
        librosa.display.specshow(librosa.power_to_db(melspectrogram, ref=np.max), y_axis='mel', fmax=8000, x_axis='time')
        plt.colorbar(format='%+2.0f dB')
        plt.title('Mel spectrogram')
        plt.tight_layout()
        plt.savefig(os.path.join(output_directory, os.path.basename(file).replace('.m4a', '.png')), dpi=300)
        plt.close()

directory = r'F:\dataset_sri'

wav_files = [os.path.join(root, file)
             for root, dirs, files in os.walk(directory)
             for file in fnmatch.filter(files, '*.m4a')]

os.makedirs(r'F:\melspectrogram_images', exist_ok=True)

save_melspectrogram_images(wav_files, r'F:\melspectrogram_images')


In [None]:
import os
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

def load_images(folder):
    images = []
    for filename in os.listdir(folder):
        img = Image.open(os.path.join(folder, filename))
        if img is not None:
            images.append(np.array(img).flatten())
    return images

# Load images
folder_path = 'F:/melspectrogram_images'  # replace with your folder path
data = load_images(folder_path)

# Assign labels: first 2311 files are female, next files are male
labels = [1 if i < 2311 else 0 for i in range(5993)]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# Logistic regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Predict and calculate accuracy
y_pred = lr.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Classification Report: \n', classification_report(y_test, y_pred))
