<a href="https://colab.research.google.com/github/rubymanderna/ML_ECGR5105/blob/main/Final_project/Data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import librosa
import numpy as np
from tqdm import tqdm

In [None]:
def extract_features(file_path, sampling_rate=48000):
    """Summarise one audio file as a flat list of averaged librosa features.

    The file is read with ``librosa.load`` at ``sampling_rate``; several
    frame-wise descriptors are then computed and each one is reduced to its
    mean, so a clip ends up as one number per descriptor row.

    Args:
        file_path: Path handed to ``librosa.load``.
        sampling_rate: Value passed as ``sr`` to the loader and to every
            extractor that accepts it (default 48000).

    Returns:
        A list laid out as: mean spectral centroid, mean spectral bandwidth,
        mean spectral rolloff, row-wise MFCC means, row-wise chroma means,
        row-wise spectral-contrast means, row-wise tonnetz means (computed on
        the harmonic component of the signal) and finally the mean RMS value.
        The number of rows per descriptor comes from librosa's defaults; the
        column names built later in this notebook assume 20/12/7/6.
        ``None`` is returned if any step raises -- the error is printed
        instead of propagated, and the caller checks for ``None``.
    """
    try:
        signal, _ = librosa.load(file_path, sr=sampling_rate)

        # Spectral-shape descriptors; [0] keeps the first row of each result.
        centroid_track = librosa.feature.spectral_centroid(y=signal, sr=sampling_rate)[0]
        bandwidth_track = librosa.feature.spectral_bandwidth(y=signal, sr=sampling_rate)[0]
        rolloff_track = librosa.feature.spectral_rolloff(y=signal, sr=sampling_rate)[0]

        # Multi-row descriptors, averaged row by row further down.
        mfcc_matrix = librosa.feature.mfcc(y=signal, sr=sampling_rate)
        chroma_matrix = librosa.feature.chroma_stft(y=signal, sr=sampling_rate)
        contrast_matrix = librosa.feature.spectral_contrast(y=signal, sr=sampling_rate)

        # Tonnetz is computed on the harmonic part of the signal only.
        harmonic_part = librosa.effects.harmonic(signal)
        tonnetz_matrix = librosa.feature.tonnetz(y=harmonic_part, sr=sampling_rate)

        rms_track = librosa.feature.rms(y=signal)[0]

        # Assemble the flat vector: three scalars, four blocks of row means, one scalar.
        feature_vector = [
            track.mean() for track in (centroid_track, bandwidth_track, rolloff_track)
        ]
        for matrix in (mfcc_matrix, chroma_matrix, contrast_matrix, tonnetz_matrix):
            feature_vector.extend(matrix.mean(axis=1))
        feature_vector.append(rms_track.mean())

    except Exception as err:
        print(f"Error processing {file_path}: {err}")
        return None

    return feature_vector

In [None]:
# Location of the metadata table; its 'filename' and 'age' columns are the ones
# consumed by the later cells of this notebook.
csv_path = '/content/drive/MyDrive/kaggle/cv-valid-test.csv'  # Replace with your actual CSV file path

# Read the whole table into memory with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=csv_path)

In [None]:
# Keep only the rows that carry an 'age' value and renumber them from 0.
# (Despite the name, no random sampling happens here -- only the null filter.)
df_sampled = df.dropna(subset=['age']).reset_index(drop=True)

# Show the last few rows as the cell output.
df_sampled.tail()

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
1537,sample-003976,we've got her located,1,0,thirties,male,england,
1538,sample-003979,she has all your features,1,0,thirties,male,,
1539,sample-003980,i've got to see nicole right away,1,0,fifties,male,australia,
1540,sample-003984,but there were certain of them who took a bit ...,2,0,thirties,male,,
1541,sample-003989,i'm playing for keeps,2,0,fifties,male,australia,


In [None]:
# Build one feature row per labelled clip and write the table to CSV.
#
# A row of df_sampled contributes to the output only if its .mp3 exists in
# audio_dir AND extract_features() returns a feature list for it. Rows that
# fail either condition are recorded in missing_files / failed_files and
# summarised at the end, so a large gap between input rows and saved rows is
# visible rather than silent.
audio_dir = '/content/drive/MyDrive/kaggle/cv-valid-test'  # Replace with your actual folder path

# Column names for the values returned by extract_features(); the order and the
# counts (20 MFCC, 12 chroma, 7 contrast, 6 tonnetz) must match what that
# function concatenates, otherwise the DataFrame constructor below raises.
feature_columns = (
    ['spectral_centroid', 'spectral_bandwidth', 'spectral_rolloff']
    + [f'mfcc_{i}' for i in range(1, 21)]
    + [f'chroma_{i}' for i in range(1, 13)]
    + [f'contrast_{i}' for i in range(1, 8)]
    + [f'tonnetz_{i}' for i in range(1, 7)]
    + ['rmse']
)

features_list = []
missing_files = []  # filenames with no matching .mp3 in audio_dir
failed_files = []   # filenames for which extract_features() returned None

# Only two columns are needed, so iterate them directly instead of
# materialising a full Series per row with iterrows().
labelled_clips = zip(df_sampled['filename'], df_sampled['age'])
for filename, age in tqdm(labelled_clips, total=len(df_sampled), desc="Processing files"):
    # Search for the corresponding .mp3 file in the folder
    mp3_path = f'{audio_dir}/{filename}.mp3'
    if not os.path.exists(mp3_path):
        missing_files.append(filename)
        continue

    # extract_features() reports its own errors and signals failure with None
    features = extract_features(mp3_path)
    if features is None:
        failed_files.append(filename)
        continue

    features_list.append([filename, age] + features)

features_df = pd.DataFrame(features_list, columns=['filename', 'age'] + feature_columns)

# Save the final DataFrame to a new CSV file
output_csv_path = '/content/drive/MyDrive/kaggle/test1_extracted_features.csv'  # Replace with your desired output path
features_df.to_csv(output_csv_path, index=False)

print(f"Processed {len(features_df)} files and saved features to {output_csv_path}")
if missing_files or failed_files:
    print(f"Skipped {len(missing_files)} rows with no .mp3 in {audio_dir} "
          f"and {len(failed_files)} files whose feature extraction failed")

Processing files: 100%|██████████| 1542/1542 [02:44<00:00,  9.36it/s] 

Processed 152 files and saved features to /content/drive/MyDrive/kaggle/test1_extracted_features.csv



