In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


**Exploratory Data Analysis (EDA)**

In [10]:
import os
import zipfile

# Unzipping the dataset
zip_path = "/content/gdrive/MyDrive/Data Preprocessing Datasets/Chord Dataset.zip"
extract_path = "/content/gdrive/MyDrive/Data Preprocessing Datasets/Chord Dataset"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

In [11]:
inner_directory = os.path.join(extract_path, "Chord Dataset")
os.listdir(inner_directory)

['Major', 'Minor']

In [12]:
major_directory = os.path.join(inner_directory, "Major")
minor_directory = os.path.join(inner_directory, "Minor")

In [15]:
# Counting the number of audio files in both "Major" and "Minor" directories
major_files = [f for f in os.listdir(major_directory) if f.endswith('.wav')]
minor_directory = os.path.join(inner_directory, "Minor")
minor_files = [f for f in os.listdir(minor_directory) if f.endswith('.wav')]

len(major_files), len(minor_files)

(10, 10)

In [16]:
import wave

def extract_audio_properties_wave(file_path):
    with wave.open(file_path, 'rb') as wf:
        n_frames = wf.getnframes()
        frame_rate = wf.getframerate()
        duration = n_frames / float(frame_rate)
        return {"sample_rate": frame_rate, "duration": duration}

In [17]:
# Redefining the file paths for the sample audio files
sample_major_file = os.path.join(major_directory, major_files[0])
sample_minor_file = os.path.join(minor_directory, minor_files[0])

# Extract properties from the sample files using the updated method
major_properties_wave = extract_audio_properties_wave(sample_major_file)
minor_properties_wave = extract_audio_properties_wave(sample_minor_file)

major_properties_wave, minor_properties_wave


({'sample_rate': 44100, 'duration': 2.25},
 {'sample_rate': 44100, 'duration': 2.25})

**Data Preprocessing**

In [18]:
from scipy.fft import fft
import numpy as np

# Function to extract power spectrum features from an audio file using the wave module
def extract_power_spectrum_features(file_path):
    # Load the audio file using wave module
    with wave.open(file_path, 'rb') as wf:
        signal = np.frombuffer(wf.readframes(wf.getnframes()), dtype=np.int16)

    # Calculate the Fourier Transform of the signal
    spectrum = fft(signal)

    # Calculate the power spectrum
    power_spectrum = np.abs(spectrum[:len(spectrum)//2])

    return power_spectrum

In [20]:
# Extract power spectrum features for all audio files
major_features = [extract_power_spectrum_features(os.path.join(major_directory, file)) for file in major_files]
minor_features = [extract_power_spectrum_features(os.path.join(minor_directory, file)) for file in minor_files]

# Convert lists to numpy arrays for easier manipulation
major_features = np.array(major_features)
minor_features = np.array(minor_features)

  major_features = np.array(major_features)
  minor_features = np.array(minor_features)


In [21]:
# Determine the maximum length among all feature arrays
max_length = max(max([len(feature) for feature in major_features]),
                max([len(feature) for feature in minor_features]))

# Function to truncate or zero-pad the feature arrays to ensure uniform length
def pad_features(features, max_length):
    padded_features = []
    for feature in features:
        if len(feature) < max_length:
            # Zero-pad
            padded_feature = np.pad(feature, (0, max_length - len(feature)))
        else:
            # Truncate
            padded_feature = feature[:max_length]
        padded_features.append(padded_feature)
    return np.array(padded_features)

In [22]:
# Pad the features for both major and minor categories
major_features_padded = pad_features(major_features, max_length)
minor_features_padded = pad_features(minor_features, max_length)

major_features_padded.shape, minor_features_padded.shape

((10, 50714), (10, 50714))

In [23]:
from sklearn.preprocessing import StandardScaler

# Normalize the features using StandardScaler
scaler = StandardScaler()
all_features = np.vstack((major_features_padded, minor_features_padded))
scaler.fit(all_features)

major_features_normalized = scaler.transform(major_features_padded)
minor_features_normalized = scaler.transform(minor_features_padded)

major_features_normalized.shape, minor_features_normalized.shape

((10, 50714), (10, 50714))

In [24]:
from sklearn.model_selection import train_test_split

# Combine the features and assign labels
X = np.vstack((major_features_normalized, minor_features_normalized))
y = np.array([0] * len(major_features_normalized) + [1] * len(minor_features_normalized))

# Split the dataset into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16, 50714), (4, 50714), (16,), (4,))

In [26]:
pip install h2o

Collecting h2o
  Downloading h2o-3.44.0.1.tar.gz (257.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.4/257.4 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.44.0.1-py2.py3-none-any.whl size=257484150 sha256=2de689443afbee18d3731d3727503c8523c86bddd0880ae1c026e093a8eaa4d7
  Stored in directory: /root/.cache/pip/wheels/d9/9b/ca/7345b72d17e1e17da37239d70631c3214ec9e541b0c9e700e2
Successfully built h2o
Installing collected packages: h2o
Successfully installed h2o-3.44.0.1


**Model Building using H2O AutoML**

In [27]:
import h2o
from h2o.automl import H2OAutoML

h2o.init(max_mem_size="4G")

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.20.1" 2023-08-24; OpenJDK Runtime Environment (build 11.0.20.1+1-post-Ubuntu-0ubuntu122.04); OpenJDK 64-Bit Server VM (build 11.0.20.1+1-post-Ubuntu-0ubuntu122.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.10/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpz756wmoj
  JVM stdout: /tmp/tmpz756wmoj/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpz756wmoj/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.1
H2O_cluster_version_age:,15 days
H2O_cluster_name:,H2O_from_python_unknownUser_u4a272
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [28]:
# Combine features and labels for ease of conversion
data = np.hstack((X, y.reshape(-1, 1)))
h2o_frame = h2o.H2OFrame(data)

# Specify the name of the target variable
target = h2o_frame.columns[-1]
predictors = h2o_frame.columns[:-1]

# Split the data into train and test sets (you can use the same split ratio as before)
train, test = h2o_frame.split_frame(ratios=[0.8], seed=42)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [29]:
# Set a maximum time for the AutoML run and other configuration parameters as needed
aml = H2OAutoML(max_runtime_secs=600, seed=42, project_name="Chord_Classification")
aml.train(x=predictors, y=target, training_frame=train)

AutoML progress: |
23:34:42.43: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.

████████████
23:36:28.765: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.

████████████████████████████
23:41:03.131: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
23:41:03.131: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 18.0.


23:41:03.873: _response param, We have detected that your response column has only 2 uni

Unnamed: 0,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
,gaussian,identity,Ridge ( lambda = 29.022 ),"nlambda = 30, lambda.max = 39.871, lambda.min = 29.022, lambda.1se = 29.022",50714,50714,2,AutoML_1_20231031_233440_training_py_2_sid_90d1

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,0.4089764,0.076825,0.333838,0.5142822,0.3330057,0.4267443,0.4370116
mean_residual_deviance,0.1943215,0.0561076,0.1580841,0.2757549,0.128066,0.2091026,0.2005997
mse,0.1943215,0.0561076,0.1580841,0.2757549,0.128066,0.2091026,0.2005997
null_deviance,0.9002449,0.1645403,1.0204082,1.0204082,1.0204082,0.72,0.72
r2,0.1817439,0.2405592,0.3676636,-0.1030195,0.4877362,0.0590383,0.0973013
residual_deviance,0.6953453,0.2329524,0.6323364,1.1030195,0.5122638,0.6273078,0.6017992
rmse,0.4371491,0.0634635,0.3975979,0.5251237,0.357863,0.4572774,0.4478836
rmsle,0.3125766,0.031794,0.2842964,0.3574383,0.2795313,0.3194708,0.3221461

Unnamed: 0,timestamp,duration,iteration,lambda,predictors,deviance_train,deviance_xval,deviance_se,alpha,iterations,training_rmse,training_deviance,training_mae,training_r2
,2023-10-31 23:40:50,0.000 sec,1,40.0,50715,0.4114308,0.3010779,0.0598974,0.0,,,,,
,2023-10-31 23:40:54,3.762 sec,2,29.0,50715,0.3652588,0.2100421,0.0200049,0.0,,,,,
,2023-10-31 23:40:58,7.887 sec,3,21.0,50715,0.3307755,0.2362859,0.0327927,0.0,3.0,0.4273516,0.1826294,0.414029,0.2603509

variable,relative_importance,scaled_importance,percentage
C937,0.0000391,1.0,0.0001125
C938,0.0000386,0.9870716,0.0001111
C936,0.0000380,0.9718338,0.0001094
C910,0.0000375,0.9608709,0.0001081
C3093,0.0000368,0.9406017,0.0001059
C911,0.0000366,0.9378572,0.0001055
C909,0.0000360,0.9217199,0.0001037
C939,0.0000359,0.9185360,0.0001034
C940,0.0000349,0.8923438,0.0001004
C935,0.0000348,0.8910706,0.0001003


In [30]:
# View the leaderboard
lb = aml.leaderboard
print(lb.head(rows=lb.nrows))

# Predict using the leader model
preds = aml.leader.predict(test)

# Calculate performance metrics
performance = aml.leader.model_performance(test)
print(performance)

model_id                                                     rmse       mse       mae     rmsle    mean_residual_deviance
GLM_1_AutoML_1_20231031_233440                           0.457203  0.209035  0.437108  0.325939                  0.209035
StackedEnsemble_BestOfFamily_1_AutoML_1_20231031_233440  0.486691  0.236868  0.472719  0.340408                  0.236868
XGBoost_1_AutoML_1_20231031_233440                       0.50007   0.25007   0.494518  0.349699                  0.25007
XGBoost_2_AutoML_1_20231031_233440                       0.501739  0.251742  0.501667  0.359593                  0.251742
StackedEnsemble_BestOfFamily_2_AutoML_1_20231031_233440  0.506154  0.256191  0.500443  0.356786                  0.256191
StackedEnsemble_AllModels_1_AutoML_1_20231031_233440     0.552379  0.305122  0.539159  0.390809                  0.305122
DRF_1_AutoML_1_20231031_233440                           0.808484  0.653646  0.659722  0.56175                   0.653646
[7 rows x 6 columns]

glm

In [32]:
h2o.cluster().shutdown()