In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every .ogg recording found under the Kaggle input directory.
sound_filenames = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # The extension is a property of the file name alone, so test it
        # directly instead of re-joining and re-splitting the full path.
        if os.path.splitext(filename)[1] == '.ogg':
            sound_filenames.append(os.path.join(dirname, filename))

# os.walk yields entries in arbitrary order; sort so that slices such as
# sound_filenames[:5] select the same recordings on every run.
sound_filenames.sort()

print("Total Sound Files : ",len(sound_filenames))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Data Preprocessing**

**Plotting the sound waves of 5 audio files.**

In [None]:
import librosa
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

def plot_wave(SOUND_DIR):
    """Show an audio player for one recording and plot its waveform.

    SOUND_DIR is handed unchanged to ``ipd.Audio`` and ``librosa.load``
    (the caller in this notebook passes the path of a single audio file).
    At most the first 10 seconds are decoded for the plot; the decoded
    samples and the sampling rate are also printed.
    """
    # Embed a player so the recording can be listened to.
    player = ipd.Audio(SOUND_DIR)
    ipd.display(player)

    # Decode the audio file; `rate` is the sampling rate returned by librosa.
    samples, rate = librosa.load(SOUND_DIR, duration=10)
    print(samples,rate)

    # Draw the sample values in order.
    plt.figure(figsize=(10, 4))
    plt.plot(samples)
    plt.title("Sound Wave")
    plt.show()

    # Blank lines keep consecutive recordings apart in the cell output.
    print('\n\n')
    
# Preview the first five collected recordings: audio player, printed samples and waveform plot for each.
for path in sound_filenames[:5]:
    plot_wave(path)

**Plotting spectrograms of 5 audio files**

In [None]:
import librosa.display
# Plot spectrogram
def plot_spectogram(SOUND_DIR):
    """Plot a dB-scaled spectrogram with a linear frequency axis for one recording.

    SOUND_DIR is handed unchanged to ``librosa.load`` (the caller in this
    notebook passes the path of a single audio file). At most the first
    10 seconds are decoded; the samples and the sampling rate are printed.
    """
    samples, rate = librosa.load(SOUND_DIR, duration=10)
    print(samples,rate)

    # Magnitude of the short-time Fourier transform, expressed in dB
    # relative to the largest magnitude (ref=np.max).
    magnitude = np.abs(librosa.stft(samples))
    magnitude_db = librosa.amplitude_to_db(magnitude, ref=np.max)

    plt.figure(figsize=(10, 4))
    librosa.display.specshow(magnitude_db, y_axis="linear")
    plt.colorbar(format="%+2.0f dB")
    plt.title("Linear-frequency power spectrogram")
    plt.show()

    # Blank lines keep consecutive recordings apart in the cell output.
    print('\n\n')

# Plot the spectrogram of each of the first five collected recordings.
for path in sound_filenames[:5]:
    plot_spectogram(path)

**Classes of Birds**

In [None]:
# NOTE(review): presumably a (height, width, channels) image shape for a later model input;
# it is not referenced anywhere in this section — confirm.
IM_SIZE = (224, 224, 3)

# The entry names of the training-audio directory serve as the list of bird classes.
BIRDS = dirList = os.listdir('/kaggle/input/birdclef-2022/train_audio/')

print('Classes of Birds : \n',BIRDS)
print('\nTotal Classes of Birds : ',len(BIRDS))

# Stratified K-Fold Cross Validation of train_metadata

**KFold is a cross-validator that divides the dataset into k folds. Stratification ensures that each fold contains approximately the same proportion of observations for each label as the full dataset.**

In [None]:
from sklearn.model_selection import StratifiedKFold

# Random seed passed to the StratifiedKFold shuffle further down in this cell.
SEED = 42
# Locations of the competition data, relative to the notebook's working directory.
DATA_PATH = "../input/birdclef-2022/"
AUDIO_PATH = '../input/birdclef-2022/train_audio'
# NOTE(review): these values match the widely used ImageNet per-channel mean/std;
# they are not referenced in this section — confirm where they are applied.
MEAN = np.array([0.485, 0.456, 0.406])
STD = np.array([0.229, 0.224, 0.225])
# NOTE(review): presumably the worker count for a data loader defined elsewhere — confirm.
NUM_WORKERS = 4
# Sorted entry names of the training-audio directory, used as the class list.
CLASSES = sorted(os.listdir(AUDIO_PATH))
NUM_CLASSES = len(CLASSES)
class AudioParams:
    """Audio-processing settings, kept together as class attributes."""

    # Waveform settings. `sr` is the sampling rate; units are presumably
    # Hz for `sr` and seconds for `duration` — TODO confirm against the consumer.
    sr: int = 32000
    duration: int = 5

    # Mel-spectrogram settings — presumably the number of mel bands and the
    # lower/upper frequency bounds; note fmax equals sr / 2.
    n_mels: int = 224
    fmin: int = 20
    fmax: int = 16000


# Load the training metadata and derive the path of every recording from its file name.
train = pd.read_csv('../input/birdclef-2022/train_metadata.csv')
train["file_path"] = AUDIO_PATH + '/' + train['filename']
paths = train["file_path"].values

# Give each row the number of the fold in which it is held out for validation.
# The split is stratified on the primary label and shuffled with a fixed seed.
Fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
fold_of_row = np.empty(len(train), dtype=int)
for n, (trn_index, val_index) in enumerate(Fold.split(train, train['primary_label'])):
    # val_index holds row positions; every row lands in exactly one validation fold.
    fold_of_row[val_index] = n
train['kfold'] = fold_of_row

# Persist the metadata together with its fold assignment.
train.to_csv('train_folds.csv', index=False)

print(train.shape)
train.head()

# Test Data

In [None]:
# Load the competition's test.csv and display the resulting frame.
test = pd.read_csv('/kaggle/input/birdclef-2022/test.csv')
test


In [None]:
# Summarise the columns, dtypes and non-null counts of the test frame.
test.info()

# scored_birds.json data

In [None]:
import json
# Read scored_birds.json (presumably the species that count towards the score — confirm).
with open('/kaggle/input/birdclef-2022/scored_birds.json') as f:
    scored_birds = json.load(f)

print('scored_birds is ', len(scored_birds), ' species')
# Print the entries in three rows: items 0-6, 7-13 and 14-21.
for start, stop in ((0, 7), (7, 14), (14, 22)):
    print(scored_birds[start:stop])

# eBird_Taxonomy_v2021.csv data

In [None]:
# Load the eBird taxonomy table shipped with the competition data and display it.
ebird = pd.read_csv('/kaggle/input/birdclef-2022/eBird_Taxonomy_v2021.csv')
ebird

In [None]:
# Summarise the columns, dtypes and non-null counts of the taxonomy table.
ebird.info()

# Birds Singing Time

In [None]:
from datetime import timedelta
from datetime import datetime
import seaborn as sns

def round_date(date, delta = 30, th = 10):
    """Snap a timestamp to a multiple of `delta` minutes and format it as "HH:MM".

    Parameters
    ----------
    date : object exposing ``to_pydatetime()`` (e.g. a pandas Timestamp)
        The moment to round.
    delta : int, default 30
        Bucket width in minutes; multiples are counted from ``datetime.min``.
    th : int, default 10
        Half-width, in minutes, of the band around `delta` used to pick the direction.

    Returns
    -------
    str
        The rounded time of day formatted with ``"%H:%M"``.

    The minute of the hour decides the direction:
    * ``delta - th <= minute < delta`` or ``minute > delta + th``: round up;
    * ``delta < minute <= delta + th`` or ``minute < delta - th``: round down;
    * ``minute == delta``: left untouched.
    With the defaults, minutes 0-19 map to HH:00, 20-40 to HH:30 and
    41-59 to the next hour's :00.
    """
    moment = date.to_pydatetime()
    minute = moment.minute
    step = timedelta(minutes = delta)
    lower, upper = delta - th, delta + th

    rounds_up = (lower <= minute < delta) or (minute > upper)
    rounds_down = (delta < minute <= upper) or (minute < lower)

    if rounds_up:
        # Distance to the next multiple of `step`.
        moment = moment + (datetime.min - moment) % step
    elif rounds_down:
        # Distance back to the previous multiple of `step`.
        moment = moment - (moment - datetime.min) % step

    return moment.time().strftime("%H:%M")

# Parse the recording times, discard the values that fail to parse, and bucket the rest with round_date.
parsed_times = pd.to_datetime(train['time'], errors = 'coerce').dropna()
train['time_tf'] = parsed_times.apply(round_date)
# Rows whose time could not be parsed received no bucket; remove them from `train` in place.
train.dropna(subset=['time_tf'], inplace = True)
print('success')

# Count the recordings in each time bucket; sorting first puts the buckets in ascending order.
plt.figure(figsize = (15,8))
train_by_time = train.sort_values(by = 'time_tf')
sns.countplot(x = 'time_tf', data = train_by_time)
plt.xticks(rotation=45)
plt.xlabel('Time', fontsize = 18)
plt.ylabel('Frequency', fontsize = 18)
plt.title('Birds Singing Time', fontsize = 18)
plt.tight_layout()

# train['secondary_labels']

In [None]:
# Inspect the raw secondary_labels column (presumably each value is a string holding a list literal — confirm).
train['secondary_labels']

In [None]:
import ast
# Parse every row's secondary_labels string as a Python literal and keep the distinct entries.
labels = list({
    label
    for raw_labels in train['secondary_labels']
    for label in ast.literal_eval(raw_labels)
})

print('Number of unique bird labels:', len(labels))

In [None]:
import re
# Extract every single-quoted word from each row's secondary_labels string.
quoted_word = re.compile(r"'(\w+)'")
newTrain_SecondaryLabels = train['secondary_labels'].apply(quoted_word.findall)

**Top 50 Birds Found on Background as Noise**

In [None]:
values = 50

# Count how often each label occurs across all secondary-label lists and keep
# the `values` most frequent ones. The two columns are named explicitly:
# the names produced by a bare value_counts().reset_index() differ between
# pandas 1.x ('index' / series name) and pandas 2.x (series name / 'count').
top_background = (
    newTrain_SecondaryLabels
    .explode()
    .value_counts()
    .head(values)
    .rename_axis('bird')
    .reset_index(name = 'frequency')
)

fig, ax = plt.subplots( figsize = (10,8))
sns.barplot(y = 'bird', x = 'frequency', data = top_background, ax = ax)
ax.set_title(f'Top {values} Birds Found on Background as Noise', fontdict = {'fontsize':20})
ax.set_xlabel('Frequency', fontdict = {'fontsize':16})
ax.set_ylabel('Birds Common Name', fontdict = {'fontsize':16})

plt.tight_layout()

# Birds Distribution Map

In [None]:
import geopandas as gpd
fig, ax = plt.subplots(figsize=(26,20))
# plot map on axis
# NOTE(review): gpd.datasets is deprecated in recent geopandas releases — confirm
# the installed version still ships "naturalearth_lowres".
countries = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
countries.plot(color="lightgrey", ax=ax)

# plot points
# plt.get_cmap is available across Matplotlib versions, unlike plt.cm.get_cmap.
cmap = plt.get_cmap('jet')
birds = len(train["primary_label"].unique())
print("Total Classes of Birds : ",birds)

for i, (bird, dfg) in enumerate(train.groupby("primary_label")):
    # Round coordinates to one decimal and count the recordings at each
    # rounded location. assign() builds a new frame, so the group slice is
    # never written to (no SettingWithCopyWarning).
    dfgg = (
        dfg.assign(longitude=np.around(dfg.longitude, 1),
                   latitude=np.around(dfg.latitude, 1))
        .groupby(["longitude", "latitude"])
        .size()
        .reset_index(name="counts")
    )
    # One colour per species. The RGBA tuple is wrapped in a list (a single-row
    # 2D colour array) so it cannot be mistaken for four per-point values when
    # a species happens to have exactly four locations.
    dfgg.plot(x="longitude", y="latitude", kind="scatter",
              c=[cmap(i / birds)], s=dfgg["counts"] * 5,
              ax=ax, label=bird, alpha=0.5)

ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.25), ncol=15, fancybox=True, shadow=True)

# get axes limits
x_lo, x_up = ax.get_xlim()
y_lo, y_up = ax.get_ylim()
# add minor ticks with a specified spacing (deg)
deg = 5
# add grid
ax.set_xticks(np.arange(np.ceil(x_lo), np.ceil(x_up), deg), minor=True)
ax.set_yticks(np.arange(np.ceil(y_lo), np.ceil(y_up), deg), minor=True)
# Pass the on/off flag positionally: its keyword was renamed from `b` to
# `visible` in Matplotlib, so the positional form works on old and new versions.
ax.grid(True, which="minor", alpha=0.25)

In [None]:
# Start from the sample submission, set the target of every row to True, and write it out.
submission = (
    pd.read_csv('/kaggle/input/birdclef-2022/sample_submission.csv')
    .assign(target=True)
)
submission.to_csv('submission.csv', index=False)
submission.head()