In [None]:
# Import necessary libraries
from google.colab import drive  # Import the Google Drive connection module
import zipfile  # Module for handling zip files
import os  # Operating system module
import pandas as pd  # Library for data manipulation and analysis
import numpy as np  # Numerical computing library
import librosa  # Audio processing library
from sklearn.model_selection import train_test_split  # Splitting data into training and testing sets
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, LSTM, Dropout, Reshape
from tensorflow.keras.models import Model, Sequential  # Keras model classes
from tensorflow.keras.optimizers import Adam  # Adam optimizer
from tensorflow.keras.utils import to_categorical  # Utility function for one-hot encoding
from tensorflow.keras.callbacks import EarlyStopping  # Callback for early stopping during training
from tensorflow.image import resize  # Image resizing function

# Load and preprocess audio data
def load_and_preprocess_data(data_excel, target_shape=(128, 128)):
    """Turn a metadata table into resized mel spectrograms plus labels.

    Parameters
    ----------
    data_excel : pandas.DataFrame
        One row per clip. The 'Path' value is handed to ``librosa.load`` and
        the 'Pronunciation' value is copied into the label array unchanged.
    target_shape : tuple
        Size passed as the second argument to ``resize`` for every spectrogram.

    Returns
    -------
    tuple of numpy.ndarray
        ``(spectrograms, labels)``, both built with ``np.array`` from the
        per-row results, in row order.
    """
    spectrograms, targets = [], []

    for _, record in data_excel.iterrows():
        # sr=None is forwarded to librosa.load, so the rate it returns is
        # reused as-is for the mel filter bank below.
        waveform, native_rate = librosa.load(record['Path'], sr=None)

        mel = librosa.feature.melspectrogram(y=waveform, sr=native_rate)

        # resize is given a 3-D array, so a trailing channel axis is added first.
        mel_with_channel = np.expand_dims(mel, axis=-1)
        spectrograms.append(resize(mel_with_channel, target_shape))

        targets.append(record['Pronunciation'])

    return np.array(spectrograms), np.array(targets)

# Connect to Google Drive
drive.mount('/content/drive')

# Unzip the dataset into the Colab working directory
with zipfile.ZipFile('/content/drive/MyDrive/ASMDD.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/')

# Read metadata file
metadata_file = '/content/ASMDD.csv'
df = pd.read_csv(metadata_file)

# Adjust Path to add Colab path
df['Path'] = '/content/' + df['Path']

# Import Reshape layer
from tensorflow.keras.layers import Reshape

# Replace string labels with numeric values.
# Series.map turns every label that is not a key of `mapping` into NaN, which
# lets us fail loudly here instead of passing stray strings or NaN on to
# to_categorical. It also avoids the object-downcasting that Series.replace
# relies on and that newer pandas versions deprecate.
mapping = {'Right': 1, 'Wrong': 0}
encoded_labels = df['Pronunciation'].map(mapping)
unknown_labels = df.loc[encoded_labels.isna(), 'Pronunciation'].unique()
if len(unknown_labels) > 0:
    raise ValueError(f"Unexpected Pronunciation labels: {list(unknown_labels)}")
df['Pronunciation'] = encoded_labels.astype(int)

# Build the spectrogram array, one-hot encode the labels, and split data into
# training and testing sets (80/20, fixed seed for a reproducible split)
data, labels = load_and_preprocess_data(df)
labels = to_categorical(labels, num_classes=2)
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# Input dimensions consumed by the model cell that follows.
# NOTE(review): librosa mel spectrograms are laid out (mel band, frame), so
# axis 0 is presumably the frequency axis and axis 1 the time axis, i.e. the
# two names look swapped. Harmless while target_shape is square; confirm
# before switching to a non-square shape.
time_steps, frequency_bins = X_train[0].shape[0], X_train[0].shape[1]




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  audio_data, sample_rate = librosa.load(row['Path'], sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
# BatchNormalization is used below but is not imported by any earlier cell.
from tensorflow.keras.layers import BatchNormalization

# Early stopping is defined here so the cell runs on a fresh kernel; no
# earlier cell in this notebook creates `early_stopping`.
# NOTE(review): patience=10 is a starting value, not a recovered setting.
# Tune it together with the scheduler patience below.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Define learning rate scheduler with adjusted parameters:
# halve the learning rate after 5 epochs without val_loss improvement,
# never going below 1e-6
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

# Define model architecture with batch normalization
model = Sequential()

# Three Conv -> BatchNorm -> MaxPool stages with 64, 128 and 256 filters
model.add(Conv2D(64, (3, 3), activation='relu', input_shape=(time_steps, frequency_bins, 1)))
model.add(BatchNormalization())
model.add(MaxPooling2D((2, 2)))

model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D((2, 2)))

model.add(Conv2D(256, (3, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D((2, 2)))

model.add(Flatten())

model.add(Dropout(0.5))

# Regroup the flattened features into a sequence for the LSTM layers.
# This only works when the flattened length is a multiple of
# frequency_bins * 2; Keras raises at build time otherwise.
model.add(Reshape((-1, frequency_bins * 2)))

model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(256))

model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))

# Two-way softmax matching the one-hot labels
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model with early stopping and the learning rate scheduler.
# NOTE(review): the test split doubles as validation data, so early stopping
# and LR scheduling are tuned on the same samples that are scored below; the
# reported test accuracy is therefore optimistic. Consider a separate
# validation split.
history = model.fit(X_train, y_train, epochs=100, batch_size=16,
                    validation_data=(X_test, y_test),
                    callbacks=[early_stopping, lr_scheduler])

# Save the model
model.save('audio_model_optimized_v4.h5')

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)

# Print the test accuracy
print("Test Accuracy:", test_accuracy)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Test Accuracy: 0.8445883393287659


In [None]:
import librosa  # Library for audio processing
import pandas as pd  # Data manipulation and analysis
import numpy as np  # Numerical operations
from tensorflow.keras.models import Sequential  # For linear stacking of layers
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, Input, Bidirectional, LSTM, Reshape
from tensorflow.keras.optimizers import Adam  # Optimization method
from tensorflow.keras.utils import to_categorical  # Convert labels to one-hot vectors
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau  # Callbacks for training control
from sklearn.model_selection import train_test_split  # Splitting data into training and testing
from tensorflow.keras.regularizers import l2  # Regularization
from sklearn.preprocessing import StandardScaler  # Standardization of datasets
from google.colab import drive  # Import the Google Drive connection module
import zipfile  # Module for handling zip files
import os  # Operating system module

def _fit_time_axis(spectrogram, n_frames):
    """Crop or right-pad a 2-D (band, frame) array to exactly n_frames columns."""
    current_frames = spectrogram.shape[1]
    if current_frames >= n_frames:
        return spectrogram[:, :n_frames]
    # Pad with the clip's own minimum so the added frames sit at its quietest
    # level instead of introducing an artificial value.
    pad_value = spectrogram.min() if spectrogram.size else 0.0
    return np.pad(spectrogram, ((0, 0), (0, n_frames - current_frames)),
                  mode='constant', constant_values=pad_value)


def load_and_preprocess_data(data_excel, target_shape=(96, 64)):
    """Build fixed-size log-mel spectrograms and labels from a metadata table.

    Parameters
    ----------
    data_excel : pandas.DataFrame
        One row per clip. 'Path' is passed to ``librosa.load`` and
        'Pronunciation' is copied into the label array unchanged.
    target_shape : tuple of int
        ``(n_mels, n_frames)`` of every returned spectrogram. ``n_mels`` is
        used as the number of mel bands; ``n_frames`` is the number of time
        frames each clip is cropped or padded to.

    Returns
    -------
    tuple of numpy.ndarray
        ``(data, labels)`` in row order; each entry of ``data`` has shape
        ``target_shape``.

    Notes
    -----
    The previous implementation called ``np.resize``, which does not resample:
    it flattens the array and truncates or repeats it, so any clip whose frame
    count differed from ``n_frames`` had its mel bands mixed across rows. The
    time axis is now cropped or padded instead, keeping every row a single
    mel band.
    """
    n_mels, n_frames = target_shape
    data = []
    labels = []
    # Iterate through each row in the dataframe
    for _, row in data_excel.iterrows():
        # Load the audio file, requesting a fixed 44.1 kHz sampling rate
        audio_data, sample_rate = librosa.load(row['Path'], sr=44100)
        # Compute mel spectrogram with n_mels bands
        mel_spectrogram = librosa.feature.melspectrogram(y=audio_data, sr=sample_rate, n_fft=2048, hop_length=512, n_mels=n_mels)
        # Convert power spectrogram to decibel units
        mel_spectrogram = librosa.power_to_db(mel_spectrogram)
        # Bring the time axis to the fixed length expected by the model
        data.append(_fit_time_axis(mel_spectrogram, n_frames))
        labels.append(row['Pronunciation'])
    # Convert lists to numpy arrays for processing in machine learning models
    return np.array(data), np.array(labels)


# Connect to Google Drive
drive.mount('/content/drive')

# Unzip the dataset
with zipfile.ZipFile('/content/drive/MyDrive/ASMDD.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/')

# Read the metadata table and prefix clip paths with the Colab directory
metadata_file = '/content/ASMDD.csv'
df = pd.read_csv(metadata_file)
df['Path'] = '/content/' + df['Path']

# Encode labels as integers. Series.map yields NaN for any label missing from
# `mapping`, so unexpected values are rejected here rather than reaching
# to_categorical.
mapping = {'Right': 1, 'Wrong': 0}
encoded_labels = df['Pronunciation'].map(mapping)
unknown_labels = df.loc[encoded_labels.isna(), 'Pronunciation'].unique()
if len(unknown_labels) > 0:
    raise ValueError(f"Unexpected Pronunciation labels: {list(unknown_labels)}")
df['Pronunciation'] = encoded_labels.astype(int)

data, labels = load_and_preprocess_data(df)
# The model input is declared as (96, 64, 1); add the trailing channel axis
# explicitly instead of relying on Keras to adjust the rank implicitly.
if data.ndim == 3:
    data = data[..., np.newaxis]
labels = to_categorical(labels, num_classes=2)


# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test= train_test_split(data, labels, test_size=0.3, random_state=42)



model = Sequential([
    Input(shape=(96, 64, 1)),  # Input layer specifying input shape
    Conv2D(32, (3, 3), activation='relu'),  # 2D Convolutional layer
    BatchNormalization(),  # Normalize the activations of the previous layer
    MaxPooling2D((2, 2)),  # Max Pooling to reduce spatial dimensions
    Conv2D(64, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    # The (15, 512) target is hard-coded for a 96x64 input: with unpadded 3x3
    # convolutions and 2x2 pooling the stack above ends at 10 x 6 x 128 = 7680
    # values, which are regrouped as 15 steps of 512 features. Changing the
    # input shape requires updating this layer.
    Reshape((15, 512)),  # Reshape output for LSTM layers
    Bidirectional(LSTM(128, return_sequences=True)),  # Bidirectional LSTM
    Bidirectional(LSTM(128)),
    Dropout(0.5),  # Dropout for regularization
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax')  # Output layer with softmax activation
])




model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Early-stopping callback.
# NOTE(review): this callback is not passed to fit() below, and fit() receives
# no validation data, so 'val_loss' is never computed and the callback has no
# effect on this run. Either pass callbacks=[early_stopping] together with a
# validation split, or drop it.
early_stopping = EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)

# Train the model for a fixed 22 epochs
history = model.fit(X_train, y_train, epochs=22, batch_size=16)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", test_accuracy)




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  audio_data, sample_rate = librosa.load(row['Path'], sr=44100)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Epoch 1/22
Epoch 2/22
Epoch 3/22
Epoch 4/22
Epoch 5/22
Epoch 6/22
Epoch 7/22
Epoch 8/22
Epoch 9/22
Epoch 10/22
Epoch 11/22
Epoch 12/22
Epoch 13/22
Epoch 14/22
Epoch 15/22
Epoch 16/22
Epoch 17/22
Epoch 18/22
Epoch 19/22
Epoch 20/22
Epoch 21/22
Epoch 22/22
Test Accuracy: 0.8008631467819214
