<a href="https://colab.research.google.com/github/samuveljebakumar/fish/blob/main/okkkkkkkkkkkkkkk_using_the_cnn_and_traformtion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import librosa
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import (
    Input, Conv2D, MaxPooling2D, BatchNormalization,
    Dropout, Reshape, Dense, LayerNormalization,
    MultiHeadAttention, Add, GlobalAveragePooling1D
)
from tensorflow.keras.models import Model


In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset path configured
# in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Root folder of the dataset; it is walked as <category>/<species>/<audio file>.
DATASET_PATH = "/content/drive/MyDrive/all data set"

# Audio is resampled to SAMPLE_RATE (Hz) on load and cut into clips of
# DURATION seconds, i.e. SAMPLES samples per clip.
SAMPLE_RATE = 16000
DURATION = 3
SAMPLES = SAMPLE_RATE * DURATION

# Mel-spectrogram settings passed to librosa.feature.melspectrogram:
# number of mel bands, FFT window size and hop between frames (both in samples).
N_MELS = 128
N_FFT = 1024
HOP_LENGTH = 512


In [None]:
def load_audio(path):
    """Read an audio file as a mono waveform at SAMPLE_RATE and normalise it.

    Args:
        path: Location of the audio file to decode.

    Returns:
        The waveform returned by ``librosa.load`` after being passed through
        ``librosa.util.normalize`` (default settings).
    """
    waveform, _sample_rate = librosa.load(path, sr=SAMPLE_RATE, mono=True)
    return librosa.util.normalize(waveform)

def split_audio(audio, clip_len=None):
    """Cut a waveform into consecutive, non-overlapping clips of equal length.

    Only complete clips are returned; a trailing remainder shorter than
    ``clip_len`` is discarded, so input shorter than one clip yields ``[]``.

    Args:
        audio: Sliceable 1-D sequence of samples (e.g. a NumPy array).
        clip_len: Samples per clip. Defaults to the module-level ``SAMPLES``.

    Returns:
        A list of slices of ``audio``, each exactly ``clip_len`` samples long.

    Raises:
        ValueError: If ``clip_len`` is not a positive integer.
    """
    if clip_len is None:
        clip_len = SAMPLES
    if clip_len <= 0:
        raise ValueError(f"clip_len must be positive, got {clip_len}")

    # The "+ 1" keeps the final clip when len(audio) is an exact multiple of
    # clip_len; without it the last full clip was silently dropped.
    last_start_exclusive = len(audio) - clip_len + 1
    return [
        audio[start:start + clip_len]
        for start in range(0, last_start_exclusive, clip_len)
    ]

def extract_mel(clip):
    """Convert one audio clip into a mel spectrogram on a decibel scale.

    Args:
        clip: Waveform samples, interpreted at SAMPLE_RATE.

    Returns:
        The output of ``librosa.power_to_db`` applied to the clip's mel
        spectrogram, using the spectrogram's own maximum as the reference.
    """
    power_spec = librosa.feature.melspectrogram(
        y=clip,
        sr=SAMPLE_RATE,
        n_mels=N_MELS,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
    )
    log_spec = librosa.power_to_db(power_spec, ref=np.max)
    return log_spec


In [None]:
# Build the dataset: walk DATASET_PATH/<category>/<species>/<audio file>, cut
# every recording into fixed-length clips and keep one mel spectrogram per clip,
# labelled with the name of the species folder it came from.
X, y = [], []
skipped_files = []  # (path, exception) for each recording that failed to process

# sorted() makes the sample order deterministic (os.listdir() returns entries in
# arbitrary order), so the seeded train/test split below is reproducible.
for category in sorted(os.listdir(DATASET_PATH)):
    category_path = os.path.join(DATASET_PATH, category)
    if not os.path.isdir(category_path):
        continue

    for species in sorted(os.listdir(category_path)):
        species_path = os.path.join(category_path, species)
        if not os.path.isdir(species_path):
            continue

        for file in sorted(os.listdir(species_path)):
            file_path = os.path.join(species_path, file)
            if not os.path.isfile(file_path):
                continue

            try:
                audio = load_audio(file_path)
                file_mels = [extract_mel(clip) for clip in split_audio(audio)]
            except Exception as exc:
                # Best effort: one unreadable or corrupt file must not abort the
                # whole run, but it is reported instead of being hidden.
                # Catching Exception (not a bare except) keeps Ctrl-C working.
                skipped_files.append((file_path, exc))
                print(f"Skipping {file_path}: {exc!r}")
                continue

            # Features and labels are appended together only after the whole
            # file succeeded, so X and y can never get out of step.
            X.extend(file_mels)
            y.extend([species] * len(file_mels))

if not X:
    raise RuntimeError(
        f"No audio clips could be extracted from {DATASET_PATH!r} "
        f"({len(skipped_files)} file(s) failed to load)."
    )

print(f"Loaded {len(X)} clips; skipped {len(skipped_files)} file(s).")

# Add a trailing channel axis so each spectrogram is a single-channel image.
X = np.array(X)[..., np.newaxis]
encoder = LabelEncoder()
y_enc = encoder.fit_transform(y)
y_cat = to_categorical(y_enc)

# NOTE(review): the split is done per clip, so clips cut from the same recording
# can land in both the train and the test set, which may inflate test accuracy.
# Consider splitting per recording instead.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42
)


In [None]:
def transformer_block(x, heads=4, dim=128):
    """Apply one self-attention + feed-forward block with residual connections.

    Each of the two sub-layers is followed by a residual ``Add`` and a
    ``LayerNormalization``.

    Args:
        x: Keras tensor holding the input sequence. Its last dimension must be
            ``dim``, because the ``Dense(dim)`` output is added back onto it.
        heads: Number of attention heads.
        dim: Used both as ``key_dim`` of every attention head and as the number
            of units of the feed-forward ``Dense`` layer.

    Returns:
        Keras tensor produced by the final ``LayerNormalization``.
    """
    # Self-attention sub-layer: the sequence attends to itself.
    attended = MultiHeadAttention(num_heads=heads, key_dim=dim)(x, x)
    attn_residual = Add()([x, attended])
    attn_out = LayerNormalization()(attn_residual)

    # Feed-forward sub-layer: a single ReLU Dense layer with its own residual.
    hidden = Dense(dim, activation='relu')(attn_out)
    ff_residual = Add()([attn_out, hidden])
    return LayerNormalization()(ff_residual)


In [None]:
# Hybrid CNN + Transformer classifier; the input shape is taken from the dataset.
inputs = Input(shape=X.shape[1:])

# CNN feature extractor: two Conv -> BatchNorm -> MaxPool stages (32, then 64 filters).
x = inputs
for n_filters in (32, 64):
    x = Conv2D(n_filters, (3, 3), activation="relu")(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D((2, 2))(x)

# Turn the feature map into a token sequence: axis 1 is kept as the sequence
# axis and the remaining two axes are flattened into one feature vector per token.
# NOTE(review): axis 1 appears to derive from the mel-frequency axis of the
# spectrogram, so attention would run across frequency bands rather than time
# frames. Confirm this is intended.
seq_len = x.shape[1]
token_width = x.shape[2] * x.shape[3]
x = Reshape((seq_len, token_width))(x)

# Project every token to 128 features. This is required: transformer_block adds
# its Dense(dim=128) output back onto its input, so the widths must match.
x = Dense(128)(x)

# Two stacked Transformer blocks.
for _block_idx in range(2):
    x = transformer_block(x)

# Classification head: average over the sequence, then Dense -> Dropout -> softmax.
x = GlobalAveragePooling1D()(x)
x = Dense(128, activation="relu")(x)
x = Dropout(0.4)(x)

outputs = Dense(y_cat.shape[1], activation="softmax")(x)

model = Model(inputs=inputs, outputs=outputs)
model.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(1e-4),
    metrics=["accuracy"],
)

model.summary()

In [None]:
# Train the model. X_test / y_test are also passed as validation data, so the
# per-epoch validation metrics are computed on the same data evaluated afterwards.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    batch_size=16,
    epochs=50,
)


In [None]:
# Score the trained model on the held-out split and report accuracy in percent.
loss, acc = model.evaluate(x=X_test, y=y_test)
accuracy_pct = acc * 100
print("Test Accuracy:", accuracy_pct)


In [None]:
def predict_bird(audio_path):
    """Predict the species label for one audio recording.

    The recording is loaded, cut into fixed-length clips, and each clip is
    classified by the global ``model``. The per-clip probabilities are averaged
    and the most likely class is mapped back to its label with ``encoder``.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        The predicted species label, or ``None`` when the recording is too
        short to yield a single full clip.
    """
    audio = load_audio(audio_path)
    clips = split_audio(audio)

    # Without at least one full clip there is nothing to feed the model; an
    # empty batch would otherwise fail with an obscure shape error.
    if not clips:
        print(
            f"Cannot predict: {audio_path!r} is too short to yield a full "
            f"{DURATION}-second clip."
        )
        return None

    mels = [extract_mel(c) for c in clips]
    # Add a trailing channel axis so the batch matches the model's input layout.
    X_new = np.array(mels)[..., np.newaxis]

    preds = model.predict(X_new)
    # Average the class probabilities over all clips of the recording.
    avg_pred = np.mean(preds, axis=0)

    return encoder.inverse_transform([np.argmax(avg_pred)])[0]



In [None]:
# Example usage: classify a single recording uploaded to the Colab session
# (replace the path with your own audio file).
new_audio_file ='/content/XC1030637 - Asian Green Bee-eater - Merops orientalis.mp3'

predicted_bird = predict_bird(new_audio_file)

# Only report a result when predict_bird returned a non-empty label.
if predicted_bird:
    print(f"The predicted bird species is: {predicted_bird}")