<a href="https://colab.research.google.com/github/roxyrong/emotion_detection/blob/main/Feature_Extraction_and_Combination_(New).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import modules for google cloud
from google.colab import auth
from google.cloud import storage
auth.authenticate_user()
client = storage.Client()

import math
import scipy as sp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa
import librosa.display
from IPython.display import Audio, display

In [2]:
# === connect to SAVEE dataset
bucket = client.get_bucket('savee')
blob = bucket.blob('augmented_dataset.npy')
blob.download_to_filename('augmented_dataset.npy')
blob = bucket.blob('augmented_dataset.csv')
blob.download_to_filename('augmented_dataset.csv')

In [3]:
# === Function: Convert npy to dataframe
def load_data(npy_data_path,csv_data_path):
    '''

    :param npy_data_path: NPY data Path
    :param csv_data_path:  CSV data Path
    :return: feature extraction dataframe
    '''
    audio_feature_file = np.load(file=npy_data_path,allow_pickle=True)
    data = pd.read_csv(csv_data_path)
    column_list = data.columns.to_list()
    augmented_data = pd.DataFrame(columns = column_list)
    for i in range(0,len(audio_feature_file)):
        row_list = []
        for item in range(0,len(column_list)):
            row_list.append(audio_feature_file[i][item])
        augmented_data.loc[len(augmented_data)+1] = row_list
    print(augmented_data.info())
    return augmented_data

In [4]:
df = load_data('augmented_dataset.npy', 'augmented_dataset.csv')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1920 entries, 1 to 1920
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   data       1920 non-null   object
 1   path       1920 non-null   object
 2   dataset    1920 non-null   object
 3   emotion    1920 non-null   object
 4   speaker    1920 non-null   object
 5   augmented  1920 non-null   bool  
dtypes: bool(1), object(5)
memory usage: 91.9+ KB
None


In [5]:
### ====Setup Constant

SAMPLE_RATE = 22050
FRAME_SIZE = 2048
HOP_SIZE = 512

SPLIT_FREQ = 2000
F_MAX = 8000

In [6]:
### === Features Util Function

# AE
def amplitude_envelope(signal, frame_size, hop_length):
    return np.array([max(signal[i:i+frame_size]) for i in range(0, len(signal), hop_length)])

# BER
def calculate_split_frequency_bin(split_frequency, sample_rate, num_frequency_bins):
    """Infer the frequency bin associated to a given split frequency."""
    
    frequency_range = sample_rate / 2
    frequency_delta_per_bin = frequency_range / num_frequency_bins
    split_frequency_bin = math.floor(split_frequency / frequency_delta_per_bin)
    return int(split_frequency_bin)

def band_energy_ratio(spectrogram, split_frequency, sample_rate):
    """Calculate band energy ratio with a given split frequency."""
    
    split_frequency_bin = calculate_split_frequency_bin(split_frequency, sample_rate, len(spectrogram[0]))
    band_energy_ratio = []
    

    # calculate power spectrogram
    power_spectrogram = np.abs(spectrogram) ** 2
    power_spectrogram = power_spectrogram.T
    
    # calculate BER value for each frame
    for frame in power_spectrogram:
        sum_power_low_frequencies = frame[:split_frequency_bin].sum()
        sum_power_high_frequencies = frame[split_frequency_bin:].sum()
        band_energy_ratio_current_frame = sum_power_low_frequencies / sum_power_high_frequencies
        band_energy_ratio.append(band_energy_ratio_current_frame)
    
    return np.array(band_energy_ratio)

# teager
def teager_energy_operator(y):
    """Compute the Teager Energy Operator (TEO) for a given signal."""
    teo = np.zeros_like(y)
    teo[1:-1] = y[1:-1]**2 - y[:-2] * y[2:]
    return teo


In [7]:
features_dict = {}
for id, row in df.iterrows():
  temp_data = row['data']
  temp_data = librosa.effects.trim(temp_data, top_db=10)[0]

  path = row['path']
  print('process ' + path)
  row_dict = {}


### === Prosodic features - frequency

  # fundamental frequency f0
  f0, _, _ = librosa.pyin(temp_data, sr=SAMPLE_RATE, 
                          frame_length=FRAME_SIZE, hop_length=HOP_SIZE, 
                          fmin=librosa.note_to_hz('C2'), 
                          fmax=librosa.note_to_hz('C7')
                          ,fill_na=librosa.note_to_hz('C2'))
  row_dict['f0'] = f0
  
  # pitch features
  row_dict['pitch_mean'] = np.mean(f0)
  row_dict['pitch_median'] = np.median(f0)
  row_dict['pitch_std'] = np.std(f0)
  row_dict['pitch_range'] = np.max(f0) - np.min(f0)
  row_dict['pitch_max'] = max(f0)


### === Prosodic features: amplitude 

  # loudness
  S, phase = librosa.magphase(librosa.stft(temp_data))
  rms = librosa.feature.rms(S=S)[0]

  # loudness features
  row_dict['energy_mean'] = np.mean(rms)
  row_dict['energy_median'] = np.median(rms)
  row_dict['energy_std']= np.std(rms)
  row_dict['energy_range'] = np.max(rms) - np.min(rms)
  row_dict['energy_max'] = np.max(rms)


### === Prosodic features: Jitter & Shimmer
  # Jitter
  row_dict['jitter'] = np.mean(np.abs(np.diff(f0)))

  # Shimmer
  row_dict['shimmer'] = np.mean(np.abs(np.diff(20 * np.log10(rms))))



### === Time Domain Features: 
  # Amplitude Envelope
  row_dict['amplitude_envelope'] = amplitude_envelope(temp_data, FRAME_SIZE, HOP_SIZE)
  # Root Mean Square
  row_dict['rms'] = rms
  # Zero Crossing Rate (ZCR)
  row_dict['zcr']= librosa.feature.zero_crossing_rate(temp_data, 
                                                      frame_length=FRAME_SIZE, 
                                                      hop_length=HOP_SIZE)[0]


### === Frequency Domain Features: 

  # Band Energy Ratio (BER) - split at 2000 Hz
  spec = librosa.stft(temp_data, n_fft=FRAME_SIZE, hop_length=HOP_SIZE) # spectrogram
  row_dict['ber'] = band_energy_ratio(spec, SPLIT_FREQ, SAMPLE_RATE)

  # Amplitude Spectrogram
  row_dict['amplitude_spectrogram'] = spec
  # Decible Spectrogram
  row_dict['dB_spectrogram'] = librosa.amplitude_to_db(abs(spec), ref=np.max)

  # Spectral Contrast (2D)
  row_dict['spec_contrast'] = librosa.feature.spectral_contrast(S=spec, 
                                                                sr=SAMPLE_RATE)

  # Spectral Flux (2D)
  row_dict['spec_flux'] = librosa.onset.onset_strength(y=temp_data, sr=SAMPLE_RATE, 
                                                       aggregate=np.median, 
                                                       fmax=F_MAX, n_mels=128)
  
  # Spectral Centroid (1D)
  row_dict['spec_centroid'] = librosa.feature.spectral_centroid(y=temp_data, 
                                                                sr=SAMPLE_RATE, 
                                                                n_fft=FRAME_SIZE, 
                                                                hop_length=HOP_SIZE)[0]

  # Spectral Bandwidth (1D)
  row_dict['spec_bandwidth']= librosa.feature.spectral_bandwidth(y=temp_data, 
                                                                 sr=SAMPLE_RATE, 
                                                                 n_fft=FRAME_SIZE, 
                                                                 hop_length=HOP_SIZE)[0]

  # Spectral Flatnexx (1D)
  row_dict['spec_flatness'] = librosa.feature.spectral_flatness(y=temp_data,
                                                                n_fft=FRAME_SIZE, 
                                                                hop_length=HOP_SIZE)[0]


### === Spectrograms

  # power_spetrogram
  stft = librosa.core.stft(temp_data, n_fft = FRAME_SIZE, hop_length = HOP_SIZE)
  row_dict['power_spetrogram'] = librosa.amplitude_to_db(abs(stft))

  # mel_spectrogram
  y_stft = librosa.power_to_db(np.abs(stft) ** 2)
  mel_spec = librosa.feature.melspectrogram(S=y_stft, sr=SAMPLE_RATE, 
                                            n_fft=FRAME_SIZE, 
                                            hop_length=HOP_SIZE, 
                                            n_mels=128)
  row_dict['mel_spetrogram'] = mel_spec
  
  # mel_db_spectrogram
  mel_spec_dB = librosa.power_to_db(mel_spec, ref=np.max)
  row_dict['mel_dB_spectrogram'] = mel_spec_dB


### === MFCCs
  mfcc = librosa.feature.mfcc(y=temp_data, n_mfcc=40, sr=SAMPLE_RATE)
  row_dict['mfcc'] = mfcc

  mfcc_delta = librosa.feature.delta(mfcc)
  row_dict['mfcc_delta'] = mfcc_delta

  mfcc_delta_2 = librosa.feature.delta(mfcc , order=2)
  row_dict['mfcc_delta'] = mfcc_delta_2


### === Teager Energy Operator
  teager_energy = teager_energy_operator(temp_data)
  row_dict['teo'] = teager_energy


  features_dict[id] = row_dict

process AudioData/DC/a01.wav


  valley[..., k, :] = np.mean(sortedr[..., :idx, :], axis=-2)
  peak[..., k, :] = np.mean(sortedr[..., -idx:, :], axis=-2)


process AudioData/DC/a02.wav
process AudioData/DC/a03.wav
process AudioData/DC/a04.wav
process AudioData/DC/a05.wav
process AudioData/DC/a06.wav
process AudioData/DC/a07.wav
process AudioData/DC/a08.wav
process AudioData/DC/a09.wav
process AudioData/DC/a10.wav
process AudioData/DC/a11.wav
process AudioData/DC/a12.wav
process AudioData/DC/a13.wav
process AudioData/DC/a14.wav
process AudioData/DC/a15.wav
process AudioData/DC/d01.wav
process AudioData/DC/d02.wav
process AudioData/DC/d03.wav
process AudioData/DC/d04.wav
process AudioData/DC/d05.wav
process AudioData/DC/d06.wav
process AudioData/DC/d07.wav
process AudioData/DC/d08.wav
process AudioData/DC/d09.wav
process AudioData/DC/d10.wav
process AudioData/DC/d11.wav
process AudioData/DC/d12.wav
process AudioData/DC/d13.wav
process AudioData/DC/d14.wav
process AudioData/DC/d15.wav
process AudioData/DC/f01.wav
process AudioData/DC/f02.wav
process AudioData/DC/f03.wav
process AudioData/DC/f04.wav
process AudioData/DC/f05.wav
process AudioD

In [8]:
df_features = pd.DataFrame.from_dict(features_dict, orient='index')

In [9]:
df_features = df.join(df_features)
print(df_features.head(5))

                                                data                  path  \
1  [0.035011273, 0.052110124, 0.0455472, 0.049692...  AudioData/DC/a01.wav   
2  [0.028584875, 0.043024283, 0.038623992, 0.0423...  AudioData/DC/a02.wav   
3  [0.029121982, 0.043259364, 0.037821554, 0.0412...  AudioData/DC/a03.wav   
4  [0.028819187, 0.042954646, 0.037683364, 0.0410...  AudioData/DC/a04.wav   
5  [0.01032823, 0.015278679, 0.013250433, 0.01440...  AudioData/DC/a05.wav   

  dataset emotion speaker  augmented  \
1   SAVEE       a      DC      False   
2   SAVEE       a      DC      False   
3   SAVEE       a      DC      False   
4   SAVEE       a      DC      False   
5   SAVEE       a      DC      False   

                                                  f0  pitch_mean  \
1  [65.40639132514966, 168.66611791365605, 152.89...  148.356286   
2  [65.40639132514966, 65.40639132514966, 221.274...  141.370603   
3  [65.40639132514966, 160.1218501126406, 178.695...  184.753624   
4  [65.40639132514

In [None]:
np.save(file="raw_audio_featrues_extraction_0324.npy", arr=df_features)
df_features.to_csv("raw_audio_featrues_extraction_0324.csv",index=False)

In [None]:
blob = bucket.blob('raw_audio_featrues_extraction_0324.npy')
blob.upload_from_filename('raw_audio_featrues_extraction_0324.npy')
blob = bucket.blob('raw_audio_featrues_extraction_0324.csv')
blob.upload_from_filename('raw_audio_featrues_extraction_0324.csv')