 # <p style="color:blue"><center> Introduction</center></p>
**Birds are among the creatures most vulnerable to environmental risks, and they face many different kinds of danger. As data scientists, we should do our best to support the organizations that specialize in protecting birds by providing them with the best possible analyses.**

![Birds](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQM_tI5u29WHhcyBilT5vMfjmywWqtS-9Iscw&usqp=CAU)
![Birds](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT8-TegC4ENTy0ywDXXLGWZkitqi_7s32gEmA&usqp=CAU)

**Recent advances in machine listening have improved audio data collection. However, generating analysis outputs with high accuracy remains a challenge. The majority of the data has not been examined because of a lack of effective tools to efficiently and reliably extract signals of interest (for example, bird calls).
The Cornell Lab of Ornithology is dedicated to advancing the understanding and protection of birds and the natural world. The lab joins with people from all walks of life to make new scientific discoveries, share ideas, and stimulate conservation action. In this competition, they collaborate with Google Research, LifeCLEF, and Xeno-canto.**

# **Import some used Libraries**

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import soundfile as sf

# Import additional libraries, then load the dataset

In [None]:
import matplotlib.image as mpimg
from matplotlib.offsetbox import AnnotationBbox, OffsetImage

import plotly.graph_objects as go
import plotly.express as px
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon
import librosa
import librosa.display
import IPython.display as ipd
from sklearn.model_selection import train_test_split

from keras.utils import Sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPool1D, BatchNormalization
from keras.optimizers import RMSprop,Adam
from keras.applications import VGG19, VGG16, ResNet50

import sklearn
import warnings
# NOTE(review): with no category argument this silences every warning,
# including deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Metadata for the individual training recordings.
train_meta = pd.read_csv('../input/birdclef-2021/train_metadata.csv')
# Labels for the training soundscapes.
train_sound = pd.read_csv('../input/birdclef-2021/train_soundscape_labels.csv')
# Test rows and the submission template.
test_df = pd.read_csv('../input/birdclef-2021/test.csv')
sample_submission = pd.read_csv('../input/birdclef-2021/sample_submission.csv')

# Preview the dataset

In [None]:
# Preview the first 10 rows of the training metadata.
train_meta.head(10)

In [None]:
# Preview the first rows of the soundscape label table.
train_sound.head()

In [None]:
# Root folder used for all audio look-ups below; it ends with '/' because
# later cells build file paths by plain string concatenation.
# NOTE(review): presumably the same folder the CSVs were read from via
# '../input/birdclef-2021/' - confirm the working directory.
path = '/kaggle/input/birdclef-2021/'
os.listdir(path)

In [None]:
# Preview the first rows of the test table.
test_df.head()

In [None]:
# (number of rows, number of columns) of the training metadata.
train_meta.shape

# showing some of information about data

In [None]:
# Column names, non-null counts and dtypes of the training metadata.
train_meta.info()

# describe data

In [None]:
# Summary statistics of the training metadata (pandas describes the
# numeric columns by default).
train_meta.describe()

# Check whether the data contains missing values

In [None]:
# Count the missing values in each column. A bare isnull() would only display
# a truncated True/False table, which does not answer the question.
train_meta.isnull().sum()

# Now showing the tail of data

In [None]:
# Last rows of the training metadata.
train_meta.tail()

In [None]:
# Position of the example recording to inspect; `row` is read again further
# down when the matching audio file is loaded.
row = 0
# Show every metadata field of that recording.
train_meta.iloc[row]

# showing columns of data

In [None]:
# Column labels of the training metadata.
train_meta.columns 

# Plot the number of training samples per species


In [None]:
# Number of metadata rows per primary label; value_counts() orders the result
# from the most to the least frequent label.
species = train_meta['primary_label'].value_counts()

fig = go.Figure(
    data=[go.Bar(x=species.index, y=species.values)],
    # Keep the margins tight, but leave room at the top for the title.
    layout=go.Layout(margin=go.layout.Margin(l=0, r=1, b=5, t=40)),
)

# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title='Number of training samples per species',
    xaxis_title='primary_label',
    yaxis_title='Number of samples',
)
fig.show()

# Extract the date parts and load a sample recording

**Here we split the recording date into year, month, and day, and then load one audio file to inspect it.**

In [None]:
# Split the 'date' strings on '-' once and reuse the pieces, instead of
# re-splitting every value three times. The vectorized .str accessor also
# passes missing dates through as missing values instead of raising.
date_parts = train_meta['date'].str.split('-', expand=True)

# The pieces stay strings, exactly as produced by the split.
train_meta['year'] = date_parts[0]
train_meta['month'] = date_parts[1]
train_meta['day_of_month'] = date_parts[2]

In [None]:
# `row` is the example position chosen in an earlier cell.
label = train_meta.loc[row, 'primary_label']
filename = train_meta.loc[row, 'filename']

# The clip is looked up as train_short_audio/<primary_label>/<filename>.
audio_path = os.path.join(path, 'train_short_audio', label, filename)

# Check that the file is in the folder. The original membership test was a
# bare expression in the middle of the cell, so its result was never shown
# or used; fail early with a readable message instead.
if not os.path.isfile(audio_path):
    raise FileNotFoundError('Audio file not found: ' + audio_path)

data, samplerate = sf.read(audio_path)
print(data[:8])
print(samplerate)

# Helper functions to read an audio file and plot its waveform and spectrogram

In [None]:
def read_ogg_file(path, file):
    """Read an audio file and return its samples and sample rate.

    Args:
        path: Directory that contains the file, with or without a trailing
            path separator.
        file: Name of the file inside `path`.

    Returns:
        The tuple (data, samplerate) exactly as returned by `sf.read`.
    """
    # os.path.join adds the separator only when `path` lacks one, so both
    # 'dir/' and 'dir' work; plain `path + file` required the trailing slash.
    data, samplerate = sf.read(os.path.join(path, file))
    return data, samplerate


def plot_audio_file(data, samplerate):
    """Plot the waveform of an audio signal against time.

    Args:
        data: Sequence of audio samples.
        samplerate: Number of samples per second; used to convert sample
            positions to seconds on the x-axis.

    Returns:
        None. The curve is drawn on a new matplotlib figure.
    """
    # Express the x-axis in seconds rather than raw sample indices.
    times = np.arange(len(data)) / samplerate

    plt.figure(figsize=(8, 4))
    # Draw the curve once (it used to be plotted twice on top of itself) and
    # label it so that legend() has an entry to show.
    plt.plot(times, data, color='red', label='amplitude')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.legend(loc='upper center')
    plt.grid()
    
    
def plot_spectrogram(data, samplerate):
    """Plot a mel-scaled spectrogram of an audio signal in decibels.

    Args:
        data: Audio samples, passed to librosa as the signal `y`.
        samplerate: Sample rate of `data`, passed to librosa as `sr`.

    Returns:
        None. The spectrogram is drawn with librosa.display.specshow.
    """
    # The signal must be passed by keyword: newer librosa releases no longer
    # accept it as a positional argument.
    spectrogram = librosa.feature.melspectrogram(y=data, sr=samplerate)
    # Convert power to dB relative to the loudest bin of this clip.
    log_spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
    image = librosa.display.specshow(
        log_spectrogram, sr=samplerate, x_axis='time', y_axis='mel')
    # A colour scale makes the dB values readable.
    plt.colorbar(image, format='%+2.0f dB')

In [None]:
# Waveform of the clip currently held in `data` / `samplerate`.
plot_audio_file(data, samplerate)

# Plot the spectrogram of the recording

In [None]:
# Mel spectrogram of the clip currently held in `data` / `samplerate`.
plot_spectrogram(data, samplerate)

# Analyze the soundscape labels

In [None]:
# Distinct audio_id values present in the soundscape label table.
train_sound['audio_id'].unique()

In [None]:
# Number of non-null 'birds' entries per audio_id, first four groups only.
train_sound.groupby('audio_id')['birds'].count().head(4)

In [None]:
# Each 'birds' entry is a space-separated list of labels. Collect the distinct
# labels and sort them: a plain list(set(...)) has no stable order, so the
# column order derived from `labels` could change from run to run.
# The comprehension also avoids rebinding the notebook-level `row` variable.
labels = sorted(
    {bird for birds in train_sound['birds'] for bird in birds.split(' ')}
)

print('Number of unique bird labels:', len(labels))

In [None]:
def _encode_bird_labels(birds, labels):
    """Build a 0/1 indicator table with one column per entry of `labels`.

    Args:
        birds: Series of space-separated label strings.
        labels: Column names of the result, in the desired order.

    Returns:
        Integer DataFrame indexed like `birds`; a cell is 1 when that row's
        string contains the column's label and 0 otherwise.
    """
    # One dict per row: {label: 1} for every label named in that row.
    rows = [dict.fromkeys(entry.split(' '), 1) for entry in birds]
    # NOTE(review): `columns=labels` drops any token that is not in `labels`,
    # whereas the former cell-by-cell assignment would have added a column.
    return (
        pd.DataFrame(rows, index=birds.index, columns=labels)
        .fillna(0)
        .astype(int)
    )


df_labels_train = _encode_bird_labels(train_sound['birds'], labels)

# We set a dummy value for the target label in the test data because we will need it for the Data Generator
test_df['birds'] = 'nocall'
df_labels_test = _encode_bird_labels(test_df['birds'], labels)

# The 20 labels that occur in the most training rows.
df_labels_train.sum().sort_values(ascending=False)[:20]

# Now we append the label indicator columns to train_sound and to test_df

In [None]:
# Drop indicator columns left over from an earlier run of this cell before
# appending, so re-running it replaces them instead of adding duplicates.
# NOTE(review): assumes no original column shares its name with a label.
train_sound = pd.concat(
    [train_sound.drop(columns=df_labels_train.columns, errors='ignore'),
     df_labels_train],
    axis=1,
)
test_df = pd.concat(
    [test_df.drop(columns=df_labels_test.columns, errors='ignore'),
     df_labels_test],
    axis=1,
)

# Show a rich preview rather than printing the whole frame as text.
train_sound.head()

In [None]:
# Rich preview of the test table instead of a plain-text print.
test_df.head()

# Now we plot the waveform of a 5-second segment of a training soundscape

In [None]:
# os.listdir returns entries in arbitrary order; sort so the same soundscape
# is picked on every run.
file = sorted(os.listdir(path + 'train_soundscapes'))[0]
data, samplerate = read_ogg_file(path + 'train_soundscapes/', file)

# Cut out the window from 455 s to 460 s. The offsets are derived from the
# file's own sample rate; the former hard-coded 160000 samples per 5 s was
# only correct for 32 kHz audio.
start_s, end_s = 455, 460
sub_data = data[start_s * samplerate:end_s * samplerate]

# librosa 0.9 introduced waveshow and later releases removed waveplot;
# use whichever the installed version provides.
plot_wave = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot

plt.figure(figsize=(14, 5))
plot_wave(sub_data, sr=samplerate)
plt.grid()
plt.show()

# Now we can listen to a recording

In [None]:
import librosa
# One example recording, addressed by a fixed relative path.
audio_data = '../input/birdclef-2021/train_short_audio/acafly/XC109605.ogg'
# No `sr` argument is given, so librosa applies its default sample rate.
# NOTE(review): presumably this resamples the clip - confirm whether the
# native rate (sr=None) is wanted.
x , sr = librosa.load(audio_data)
print(type(x), type(sr))
print(x.shape, sr)

In [None]:
import IPython.display as ipd
# Build an audio widget from the file path in `audio_data`; as the last
# expression of the cell it is displayed as the cell output.
ipd.Audio(audio_data)

# Waveform of the recording loaded above

In [None]:
# librosa 0.9 introduced waveshow and later releases removed waveplot;
# use whichever the installed version provides.
plot_wave = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot

plt.figure(figsize=(14, 5))
plot_wave(x, sr=sr)
# Render the figure explicitly so the cell shows no stray object repr.
plt.show()

In [None]:
# `df` is just another name for sample_submission (no copy is made); none of
# the cells shown here modifies it before it is written out.
df = sample_submission
# Write the submission file without the index column.
df.to_csv('submission.csv', index=False)

In [None]:
# Display the frame that was written to submission.csv.
df