# **For Audio Preprocessing**

In [1]:
import os
import glob

# Windows-style path
folder_path = r"C:\Users\sreem\OneDrive\Documents\Yazh\yazh1.0\audio"

# Get all .wav file paths
file_paths = glob.glob(os.path.join(folder_path, "*.wav"))

# Optional: print file names
print("Found", len(file_paths), "files")


Found 34 files


In [2]:
import os

# Use file name (e.g., song1.wav → "song1") as label
labels = [os.path.splitext(os.path.basename(path))[0] for path in file_paths]


In [3]:
import librosa
import numpy as np

def preprocess_audio_librosa(file_path, sr=16000, duration=5.0):
    y, _ = librosa.load(file_path, sr=sr, mono=True, duration=duration)
    y = y / (np.max(np.abs(y)) + 1e-9)
    desired_length = int(sr * duration)
    if len(y) < desired_length:
        y = np.pad(y, (0, desired_length - len(y)))
    else:
        y = y[:desired_length]
    return y

# Apply to all files
audio_clips = [preprocess_audio_librosa(fp) for fp in file_paths]


In [5]:
%pip install librosa


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

audio_clips = np.array(audio_clips, dtype=np.float32)
encoded_labels = np.array(encoded_labels, dtype=np.int32)

# Build TensorFlow dataset
dataset = tf.data.Dataset.from_tensor_slices((audio_clips, encoded_labels))
dataset = dataset.shuffle(buffer_size=len(audio_clips)).batch(16).prefetch(tf.data.AUTOTUNE)


In [8]:
%pip install tensorflow

Note: you may need to restart the kernel to use updated packages.
Collecting tensorflow
  Using cached tensorflow-2.19.0-cp310-cp310-win_amd64.whl (375.7 MB)
Collecting astunparse>=1.6.0
  Using cached astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting keras>=3.5.0
  Using cached keras-3.10.0-py3-none-any.whl (1.4 MB)
Collecting grpcio<2.0,>=1.24.3
  Using cached grpcio-1.72.1-cp310-cp310-win_amd64.whl (4.2 MB)
Collecting h5py>=3.11.0
  Using cached h5py-3.13.0-cp310-cp310-win_amd64.whl (3.0 MB)
Collecting google-pasta>=0.1.1
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1
  Using cached gast-0.6.0-py3-none-any.whl (21 kB)
Collecting tensorboard~=2.19.0
  Using cached tensorboard-2.19.0-py3-none-any.whl (5.5 MB)
Collecting ml-dtypes<1.0.0,>=0.5.1
  Using cached ml_dtypes-0.5.1-cp310-cp310-win_amd64.whl (209 kB)
Collecting absl-py>=1.0.0
  Using cached absl_py-2.3.0-py3-none-any.whl (135 kB)
Collecting rich
  Using cached r


[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
%pip uninstall tensorflow -y
%pip cache purge
%pip install tensorflow --upgrade



Note: you may need to restart the kernel to use updated packages.




Files removed: 2
Note: you may need to restart the kernel to use updated packages.
Collecting tensorflowNote: you may need to restart the kernel to use updated packages.

  Downloading tensorflow-2.19.0-cp310-cp310-win_amd64.whl (375.7 MB)
     -------------------------------------- 375.7/375.7 MB 1.4 MB/s eta 0:00:00
Installing collected packages: tensorflow


ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'c:\\Users\\sreem\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\tensorflow\\core\\framework\\cpp_shape_inference_pb2.py'
Consider using the `--user` option or check the permissions.


[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
for audio_batch, label_batch in dataset.take(1):
    print("Audio batch shape:", audio_batch.shape)
    print("Label batch shape:", label_batch.shape)
    print("First label batch:", label_batch.numpy())
    print("First audio sample (first few values):", audio_batch[0].numpy()[:10])


Audio batch shape: (16, 80000)
Label batch shape: (16,)
First label batch: [21  7 30 31  8 14 25 20  1 29 28 11 22 19 18 24]
First audio sample (first few values): [ 0.05426779  0.03880611  0.18904588  0.10134231  0.09906878  0.03747728
 -0.00796641  0.10105219  0.21702921  0.11483771]


In [11]:
audio_np = []
labels_np = []

for x, y in dataset:
    audio_np.append(x.numpy())
    labels_np.append(y.numpy())

audio_np = np.concatenate(audio_np, axis=0)
labels_np = np.concatenate(labels_np, axis=0)

print("Total samples:", len(audio_np))
print("Labels:", labels_np)


Total samples: 34
Labels: [23 25  1  7 12 19 24 13 28  6  2 26 22 11 31  4 21 18 14  8 29 27 33 32
  0  9 15 16 30  3 20 10 17  5]


In [12]:
# Unbatch first to see individual samples
for spec, label in dataset.unbatch().take(5):
    label_value = tf.squeeze(label).numpy()  # Ensure it's scalar
    song_name = label_encoder.inverse_transform([label_value])[0]
    print("Song name:", song_name)


Song name: Golden NeonNiteClub1
Song name: The Hush#1
Song name: keshi - less of you #2
Song name:  2Back Up Friend
Song name: Likhe Jo Khat #1


Mel Spectrogram Conversion for Ease of Use


In [13]:
import librosa

def extract_mel_spectrogram(waveform, sr=16000, n_mels=128):
    S = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=n_mels)
    
    S_dB = librosa.power_to_db(S, ref=np.max)
    return S_dB


In [14]:
mel_specs = [extract_mel_spectrogram(waveform) for waveform in audio_clips]


In [16]:
padded_specs = [extract_mel_spectrogram(spec) for spec in mel_specs]
padded_specs = tf.stack(padded_specs)  # Final shape: (num_samples, 128, 128, 1)




In [17]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
encoded_labels = tf.convert_to_tensor(encoded_labels, dtype=tf.int32)


In [18]:
dataset = tf.data.Dataset.from_tensor_slices((padded_specs, encoded_labels))
dataset = dataset.shuffle(buffer_size=len(padded_specs)).batch(16).prefetch(tf.data.AUTOTUNE)


In [19]:
import tensorflow as tf

def pad_spectrogram(spec, target_shape=(128, 128)):
    """
    Resize and pad Mel spectrogram to a fixed shape.
    
    Parameters:
    - spec: 2D Mel spectrogram (time x freq)
    - target_shape: Tuple (height, width) to pad/resize to
    
    Returns:
    - Resized and padded spectrogram as a 3D tensor (height, width, 1)
    """
    spec = tf.convert_to_tensor(spec, dtype=tf.float32)
    # Add channel dimension -> (height, width, 1)
    spec = spec[..., tf.newaxis]
    # Resize with padding
    spec = tf.image.resize_with_pad(spec, target_shape[0], target_shape[1])
    return spec  # Shape: (128, 128, 1)


In [21]:
# mel_specs: list or np.array of 2D spectrograms (time x freq)

padded_specs = [pad_spectrogram(spec) for spec in mel_specs]
padded_specs = tf.stack(padded_specs)  # Shape: (num_samples, 128, 128, 1)

# Convert labels
labels_tensor = tf.convert_to_tensor(encoded_labels, dtype=tf.int32)

# Build TensorFlow dataset
dataset = tf.data.Dataset.from_tensor_slices((padded_specs, labels_tensor))
dataset = dataset.shuffle(buffer_size=len(padded_specs)).batch(16).prefetch(tf.data.AUTOTUNE)


In [22]:
from tensorflow.keras import layers, models

def build_cnn_lstm_model(input_shape, num_classes):
    model = models.Sequential([
        layers.Input(shape=input_shape),  # (128, 128)

        # CNN layers
        layers.Reshape((input_shape[0], input_shape[1], 1)),
        layers.Conv2D(32, (3,3), activation='relu'),
        layers.MaxPooling2D((2,2)),
        layers.Conv2D(64, (3,3), activation='relu'),
        layers.MaxPooling2D((2,2)),
        layers.Reshape((-1, 64)),  # Flatten spatial dims, keep time

        # LSTM layers
        layers.LSTM(64),
        layers.Dense(64, activation='relu'),
        layers.Dense(num_classes, activation='softmax')
    ])
    return model


In [25]:
model = build_cnn_lstm_model((128, 128), num_classes=len(label_encoder.classes_))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(dataset, epochs=10)


Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 178ms/step - accuracy: 0.0381 - loss: 3.5285
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 194ms/step - accuracy: 0.0607 - loss: 3.5291
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 200ms/step - accuracy: 0.0000e+00 - loss: 3.5250
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 197ms/step - accuracy: 0.0225 - loss: 3.5257
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 191ms/step - accuracy: 0.0381 - loss: 3.5251
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 198ms/step - accuracy: 0.0225 - loss: 3.5284
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 197ms/step - accuracy: 0.0225 - loss: 3.5252
Epoch 8/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 191ms/step - accuracy: 0.0381 - loss: 3.5275
Epoch 9/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x2de056df5e0>

In [24]:
model.save("cnn_lstm_music_classifier.h5")


