PCA for feature extraction, then logistic regression

Convolutional Neural Networks

Spectrograms → image processing (e.g. k-nearest neighbors)


In [None]:
from torch.utils.data import random_split
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio

import math
import random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio
import pandas as pd
import numpy as np
import os

In [None]:
class AudioUtil():
    """Stateless helpers for loading, normalising and augmenting audio clips.

    All helpers are static methods and are called on the class itself, e.g.
    ``AudioUtil.open(path)``. An "audio" argument (``aud``) is a
    ``(signal, sample_rate)`` tuple in which ``signal`` is a 2-D tensor laid
    out as ``(channels, samples)``.
    """

    # ----------------------------
    # Load an audio file. Return the signal as a tensor and the sample rate
    # ----------------------------
    @staticmethod
    def open(audio_file):
        """Load ``audio_file`` with torchaudio and return ``(signal, sample_rate)``."""
        sig, sr = torchaudio.load(audio_file)
        return (sig, sr)

    # ----------------------------
    # Convert the signal to the requested number of channels
    # ----------------------------
    @staticmethod
    def rechannel(aud, new_channel):
        """Return ``aud`` with exactly ``new_channel`` channels.

        The input tuple is returned unchanged when it already has the
        requested channel count. With too many channels only the leading
        ``new_channel`` are kept (stereo -> mono keeps the first channel);
        with too few, the first channel is duplicated ``new_channel`` times
        (mono -> stereo).
        """
        sig, sr = aud
        num_channels = sig.shape[0]

        if num_channels == new_channel:
            # Nothing to do
            return aud

        if num_channels > new_channel:
            # Drop the surplus channels rather than growing the signal
            resig = sig[:new_channel, :]
        else:
            # Fill every requested channel with a copy of the first one
            resig = sig[:1, :].repeat(new_channel, 1)

        return (resig, sr)

    # ----------------------------
    # Convert the signal to the requested sample rate
    # ----------------------------
    @staticmethod
    def resample(aud, newsr):
        """Return ``aud`` resampled to ``newsr``; unchanged if already at that rate."""
        sig, sr = aud

        if sr == newsr:
            # Nothing to do
            return aud

        # Resample works along the last (time) axis, so every channel is
        # converted in a single call.
        resig = torchaudio.transforms.Resample(sr, newsr)(sig)
        return (resig, newsr)

    # ----------------------------
    # Force the signal to a fixed duration
    # ----------------------------
    @staticmethod
    def pad_trunc(aud, max_ms):
        """Truncate or zero-pad the signal to exactly ``max_ms`` milliseconds.

        Longer signals are cut at the end. Shorter signals are padded with
        zeros, split at random between the beginning and the end.
        """
        sig, sr = aud
        num_rows, sig_len = sig.shape
        # Multiply before dividing: ``sr // 1000 * max_ms`` throws away the
        # sub-kHz part of the rate (44100 Hz would count as 44 samples/ms).
        max_len = int(sr * max_ms // 1000)

        if sig_len > max_len:
            # Truncate the signal to the given length
            sig = sig[:, :max_len]

        elif sig_len < max_len:
            # Length of padding to add at the beginning and end of the signal
            pad_begin_len = random.randint(0, max_len - sig_len)
            pad_end_len = max_len - sig_len - pad_begin_len

            # Pad with 0s that match the signal's dtype and device
            pad_begin = sig.new_zeros((num_rows, pad_begin_len))
            pad_end = sig.new_zeros((num_rows, pad_end_len))

            sig = torch.cat((pad_begin, sig, pad_end), 1)

        return (sig, sr)

    # ----------------------------
    # Augmentation: circularly shift the signal in time
    # ----------------------------
    @staticmethod
    def time_shift(aud, shift_limit):
        """Roll the signal by a random fraction (up to ``shift_limit``) of its length.

        Samples pushed past the end wrap around to the start of the same
        channel.
        """
        sig, sr = aud
        _, sig_len = sig.shape
        shift_amt = int(random.random() * shift_limit * sig_len)
        # Roll along the time axis only. Without ``dims`` the tensor is
        # flattened first, which would wrap samples from one channel into
        # the next.
        return (sig.roll(shift_amt, dims=1), sr)

    # ----------------------------
    # Convert the signal to a Mel spectrogram in decibels
    # ----------------------------
    @staticmethod
    def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
        """Return the Mel spectrogram of ``aud`` converted to decibels (``top_db=80``)."""
        sig, sr = aud
        top_db = 80

        # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
        spec = transforms.MelSpectrogram(
            sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)

        # Convert to decibels
        spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
        return spec

    # ----------------------------
    # Augmentation: mask out bands of the spectrogram
    # ----------------------------
    @staticmethod
    def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
        """Apply frequency and time masking to a ``[channel, n_mels, time]`` spectrogram.

        ``n_freq_masks`` frequency masks and ``n_time_masks`` time masks are
        applied in turn. Masked cells are filled with the mean of the input
        spectrogram, and each mask parameter is ``max_mask_pct`` of the
        corresponding axis length.
        """
        _, n_mels, n_steps = spec.shape
        mask_value = spec.mean()
        aug_spec = spec

        freq_mask_param = max_mask_pct * n_mels
        for _ in range(n_freq_masks):
            aug_spec = transforms.FrequencyMasking(
                freq_mask_param)(aug_spec, mask_value)

        time_mask_param = max_mask_pct * n_steps
        for _ in range(n_time_masks):
            aug_spec = transforms.TimeMasking(
                time_mask_param)(aug_spec, mask_value)

        return aug_spec

In [None]:
class SoundDS(Dataset):
    """Dataset yielding ``(augmented Mel spectrogram, class id)`` pairs.

    ``df`` must provide the columns ``relative_path`` and ``classID``;
    ``data_path`` is the directory prefix that each relative path is
    appended to.
    """

    def __init__(self, df, data_path):
        # Preprocessing targets applied to every clip in __getitem__
        self.duration, self.sr, self.channel = 4000, 44100, 2
        # Upper bound (fraction of the clip length) for the random time shift
        self.shift_pct = 0.4
        self.data_path = str(data_path)
        self.df = df

    def __len__(self):
        """Number of rows (clips) in the backing table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load row ``idx``, normalise and augment it, and return ``(spectrogram, class_id)``."""
        # The full file name is the audio directory followed by the row's
        # relative path (plain string concatenation).
        rel_path = self.df.loc[idx, 'relative_path']
        label = self.df.loc[idx, 'classID']
        clip = AudioUtil.open(self.data_path + rel_path)

        # Bring every clip to a common sample rate and channel count first;
        # otherwise clips of equal duration would still differ in length
        # after pad_trunc.
        clip = AudioUtil.resample(clip, self.sr)
        clip = AudioUtil.rechannel(clip, self.channel)

        # Fixed duration, then a random shift in time
        clip = AudioUtil.pad_trunc(clip, self.duration)
        clip = AudioUtil.time_shift(clip, self.shift_pct)

        # Mel spectrogram followed by frequency/time masking
        mel = AudioUtil.spectro_gram(
            clip, n_mels=64, n_fft=1024, hop_len=None)
        masked = AudioUtil.spectro_augment(
            mel, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)

        return masked, label

In [None]:
def _list_audio_files(directory):
    """Return a table with one row per regular file found in ``directory``.

    Columns:
      relative_path -- "/" followed by the file name, ready to be appended
                       to the directory path.
      classID       -- the file name with its last 7 characters removed.
                       NOTE(review): presumably this strips a 3-digit index
                       plus a 4-character extension such as ".wav" -- confirm
                       against the real file names.

    The table is built from a list, so it holds exactly as many rows as
    there are files: no fixed-size buffer, no empty rows, and nothing
    carried over from a previous call.
    """
    rows = [
        ("/" + filename, filename[:-7])
        for filename in os.listdir(directory)
        # Skip sub-directories and other non-file entries
        if os.path.isfile(os.path.join(directory, filename))
    ]
    return pd.DataFrame(rows, columns=["relative_path", "classID"])


# ----------------------------
# Training set
# ----------------------------
#directory = "/Users/ariellesanford/Desktop/ELEC378/data/data"
directory = "/Users/rch/Documents/RICE UNIVERSITY/JUNIOR SPRING/ELEC 378 Machine Learning/ELEC 378 Project/elec-378-sp2023-speech-emotion-classification/data/data"
df = _list_audio_files(directory)
myds = SoundDS(df, directory)

# The train fraction is 1, so every item ends up in train_ds and the second
# split is empty; the evaluation data comes from the separate test directory
# below. random_split still shuffles the item order.
num_items = len(myds)
num_train = round(num_items * 1)
num_val = num_items - num_train
train_ds, _ = random_split(myds, [num_train, num_val])

# ----------------------------
# Evaluation set (separate directory, listed from scratch)
# ----------------------------
directory = "/Users/rch/Documents/RICE UNIVERSITY/JUNIOR SPRING/ELEC 378 Machine Learning/ELEC 378 Project/elec-378-sp2023-speech-emotion-classification/test/test"
df = _list_audio_files(directory)
valds = SoundDS(df, directory)
# Later cells refer to this dataset as ``val_ds``; keep both names bound.
val_ds = valds


## Source: https://towardsdatascience.com/audio-deep-learning-made-simple-sound-classification-step-by-step-cebc936bbe5
## All of the code above is heavily based on the article linked above.

In [None]:
##Training data
# Materialise every training sample exactly once. Indexing the dataset runs
# the whole load/augment pipeline, so each item is fetched a single time and
# its spectrogram and label are kept together.
X = []
y = []

for i in range(len(train_ds)):
    spectrogram, label = train_ds[i]
    X.append(spectrogram)
    y.append(label)

# One row per sample: each spectrogram is flattened into a 1-D feature vector
X_train = np.concatenate([spectrogram.flatten().reshape(1, -1) for spectrogram in X])

# Number the labels in sorted order. Iterating a bare set of strings is not
# stable from one interpreter run to the next, which would silently change
# the class ids between runs.
label_to_number = {label: number for number, label in enumerate(sorted(set(y)))}

# Map each label to its corresponding number
y_train = [label_to_number[label] for label in y]

In [None]:
##Testing data
# ``valds`` is the dataset built from the test directory in the data-loading
# cell (the previous spelling ``val_ds`` was never defined there).
Xval = []
yval = []
for i in range(len(valds)):
    spectrogram, label = valds[i]
    Xval.append(spectrogram)
    yval.append(label)

# One row per sample: each spectrogram is flattened into a 1-D feature vector
X_test = np.concatenate([spectrogram.flatten().reshape(1, -1) for spectrogram in Xval])

# Reuse the label -> number mapping built for the training data so that the
# ids in y_test mean the same thing as the ids the classifier predicts.
# Rebuilding the mapping from the test labels alone would number the classes
# independently and make the accuracy comparison meaningless. Labels that
# were not seen during training get fresh ids appended after the existing
# ones.
for label in sorted(set(yval)):
    label_to_number.setdefault(label, len(label_to_number))

# Map each label to its corresponding number
y_test = [label_to_number[label] for label in yval]

In [None]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Training: standardise every feature column, then fit a support-vector
# classifier (gamma='auto') on the flattened spectrograms. fit() returns the
# pipeline itself, so ``clf`` is the fitted model.
clf = make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score
# Run the fitted pipeline on the evaluation features
y_pred = clf.predict(X_test)

# Fraction of evaluation samples whose predicted id equals the true id
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
#Convert to kaggle upload
# Invert the label -> number mapping so numeric ids can be turned back into
# label strings.
number_to_label = {number: label for label, number in label_to_number.items()}

# Map each number back to its corresponding label
# NOTE(review): this converts y_test (the true ids). A submission file would
# normally be built from the predictions in y_pred -- confirm which is meant.
y_kaggle = [number_to_label[number] for number in y_test]



In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): the data-loading cell above reads from local /Users/... paths,
# not from the mounted drive -- confirm whether this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')
