In [1]:
import ast
import gc
import os

import librosa
import numpy as np
import pandas as pd
import scipy.signal as signal
from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tqdm import tqdm

In [2]:
# Load the training metadata and the eBird taxonomy table from the Kaggle input directory.
metadata_df = pd.read_csv('/kaggle/input/birdclef-2022/train_metadata.csv')
taxonomy_df = pd.read_csv('/kaggle/input/birdclef-2022/eBird_Taxonomy_v2021.csv')

# Keep only the recordings whose rating is at least 3.
rating_mask = metadata_df['rating'].ge(3)
filtered_metadata_df = metadata_df.loc[rating_mask]
print("\nTotal number of after filtering ratings of >= 3:")
len(filtered_metadata_df)


Total number of after filtering ratings of >= 3:


12646

In [3]:
from collections import Counter
import re

# Metadata columns whose values are stringified lists of labels.
columns_to_analyze = [
    'secondary_labels',
    'type',
]

# Function to analyze each column with individual string components, case-insensitive and space-insensitive
def analyze_column_individual_case_insensitive(df, column_name):
    """Count the individual items stored in a column of stringified lists.

    Every non-null entry of ``df[column_name]`` is parsed as a Python literal
    (for example ``"['call', 'song']"``). Each item is lower-cased and has all
    whitespace removed before it is counted, so ``'Flight call'`` and
    ``'flightcall'`` are tallied under the same key.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column_name``.
    column_name : str
        Name of the column holding the stringified lists.

    Returns
    -------
    collections.Counter
        Mapping of normalized item -> number of occurrences across all entries.

    Raises
    ------
    ValueError, SyntaxError
        If an entry is not a valid Python literal.
    """
    print(f"\nAnalyzing individual values within the column (case-insensitive and space-insensitive): {column_name}")

    item_counter = Counter()

    # Null entries are skipped; everything else is expected to be a list literal.
    for entry in df[column_name].dropna():
        # ast.literal_eval only accepts literals, so text read from a CSV file
        # cannot execute arbitrary code the way eval() would.
        parsed_items = ast.literal_eval(entry)
        # Normalize: lowercase and strip every whitespace character.
        item_counter.update(re.sub(r'\s+', '', item.lower()) for item in parsed_items)

    return item_counter

# Tally every normalized sound type that occurs in the rating-filtered metadata.
type_counts = analyze_column_individual_case_insensitive(filtered_metadata_df, 'type')

# Substrings that mark a type description as non-vocal / environmental noise.
noisy_terms = [
    'wing', 'wings', 'water', 'splash', 'rain', 'ground', 'background',
    'noise', 'anthropogenic', 'traffic', 'street',
]


def _mentions_noisy_term(type_name):
    """Return True when any entry of ``noisy_terms`` is a substring of ``type_name``."""
    return any(term in type_name for term in noisy_terms)


# NOTE(review): this is plain substring matching, so descriptions such as
# 'callsonwater' or 'callsfromground' are flagged as noise too -- confirm that
# this is intended.
noisy_types = [type_name for type_name in type_counts if _mentions_noisy_term(type_name)]

print("\nTypes identified as containing noise (including specified terms):")
print(noisy_types)

# Function to filter DataFrame by removing rows where all 'type' values match noisy types
def filter_noisy_rows(df, noisy_types):
    """Drop the rows whose ``'type'`` entry consists solely of noisy types.

    Each ``'type'`` value is parsed as a Python list literal; its items are
    lower-cased and stripped of whitespace before being compared against
    ``noisy_types``. A row is removed only when it has at least one item and
    every item is noisy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'type'`` column of stringified lists.
    noisy_types : iterable of str
        Normalized (lowercase, no whitespace) type names regarded as noise.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that are not exclusively noisy.

    Raises
    ------
    ValueError, SyntaxError
        If a string entry of ``'type'`` is not a valid Python literal.
    """
    # A set gives constant-time membership tests instead of scanning a list.
    noisy_lookup = set(noisy_types)

    def has_only_noisy_types(type_list):
        # Missing values (None / NaN) carry no type information: keep the row.
        if not isinstance(type_list, str):
            return False
        # ast.literal_eval parses literals only, so CSV text is never executed.
        items = [re.sub(r'\s+', '', item.lower()) for item in ast.literal_eval(type_list)]
        # all() of an empty sequence is True; an empty list is not "only noisy".
        return bool(items) and all(item in noisy_lookup for item in items)

    # astype(bool) keeps the mask boolean even when df has no rows.
    only_noisy_mask = df['type'].apply(has_only_noisy_types).astype(bool)

    return df[~only_noisy_mask]

# Remove the recordings whose every listed type was identified as noise.
filtered_metadata_df_cleaned = filter_noisy_rows(
    df=filtered_metadata_df,
    noisy_types=noisy_types,
)

# Report how many recordings remain after the noise filter.
print("\nTotal number of samples without noisy types:")
len(filtered_metadata_df_cleaned)


Analyzing individual values within the column (case-insensitive and space-insensitive): type

Types identified as containing noise (including specified terms):
['traffic', 'wingflutters', 'wingsounds', 'wingsbeats', 'insectsoundsinbackground', 'ducksflushingoffwater', 'anthropogenic', 'rain', 'wings', 'trafficonfreeway', 'anthropogenic:', 'wingbeats', 'wingflapping', 'wingnoise', 'wingsinflight', 'wingsoundsascoveyflushed', 'wingwhir', 'wingflaps', 'wingbeatasflyingover', 'splash', 'waterdroplet', 'callsonwater', 'communicatingwhilefeedingontheground', 'anthropogenicbackgroundnoise', 'whistlesofduckwingsinflight', 'callfromthewater', 'noiseofwings', 'wingsplashes', 'noiseofnestingcolony', 'streettraffic', 'callsfromground', 'watersplashing', 'alarmfromwater', 'inclwingbeats', 'wingbeat', 'somdobaterdasasas.-soundofbeatingofwings.', 'landingonwater', 'birdisflyingoutofthewater(wingssound)', 'audiblewingbeats', 'flapwings', 'greylaggoosewingflapping', 'landingonthewater', 'wingwhirr', '

12608

In [4]:
# Left-join the cleaned metadata with the taxonomy table on the scientific
# name, then discard the columns that are not needed afterwards.
# NOTE(review): a left merge repeats metadata rows if taxonomy_df contains the
# same SCI_NAME more than once -- consider validate='many_to_one' after
# confirming the taxonomy keys are unique.
merged_df = (
    filtered_metadata_df_cleaned
    .merge(taxonomy_df, how='left', left_on='scientific_name', right_on='SCI_NAME')
    .drop(columns=[
        'PRIMARY_COM_NAME', 'secondary_labels', 'author', 'license', 'rating',
        'REPORT_AS', 'SCI_NAME', 'time', 'url', 'SPECIES_GROUP',
    ])
)

# Preview the merged frame.
print("DataFrame after filtering and merging:")
merged_df.head()

DataFrame after filtering and merging:


Unnamed: 0,primary_label,type,latitude,longitude,scientific_name,common_name,filename,TAXON_ORDER,CATEGORY,SPECIES_CODE,ORDER1,FAMILY
0,afrsil1,['call'],19.8801,-155.7254,Euodice cantans,African Silverbill,afrsil1/XC175522.ogg,30031,species,afrsil1,Passeriformes,Estrildidae (Waxbills and Allies)
1,afrsil1,"['call', 'song']",16.2901,-16.0321,Euodice cantans,African Silverbill,afrsil1/XC177993.ogg,30031,species,afrsil1,Passeriformes,Estrildidae (Waxbills and Allies)
2,afrsil1,"['alarm call', 'call']",17.0922,54.2958,Euodice cantans,African Silverbill,afrsil1/XC205893.ogg,30031,species,afrsil1,Passeriformes,Estrildidae (Waxbills and Allies)
3,afrsil1,['flight call'],21.4581,-157.7252,Euodice cantans,African Silverbill,afrsil1/XC207431.ogg,30031,species,afrsil1,Passeriformes,Estrildidae (Waxbills and Allies)
4,afrsil1,['flight call'],21.4581,-157.7252,Euodice cantans,African Silverbill,afrsil1/XC207432.ogg,30031,species,afrsil1,Passeriformes,Estrildidae (Waxbills and Allies)


In [5]:
# Drop the remaining descriptive columns that are not kept in the exported table.
final_df = merged_df.drop(
    columns=[
        'type', 'latitude', 'longitude', 'scientific_name',
        'TAXON_ORDER', 'CATEGORY', 'SPECIES_CODE', 'FAMILY',
    ]
)

# Persist the reduced table to the working directory (without the index column).
final_df.to_csv('final_df_3.csv', index=False)

# Show the first 50 rows as the cell output.
final_df.head(50)

Unnamed: 0,primary_label,common_name,filename,ORDER1
0,afrsil1,African Silverbill,afrsil1/XC175522.ogg,Passeriformes
1,afrsil1,African Silverbill,afrsil1/XC177993.ogg,Passeriformes
2,afrsil1,African Silverbill,afrsil1/XC205893.ogg,Passeriformes
3,afrsil1,African Silverbill,afrsil1/XC207431.ogg,Passeriformes
4,afrsil1,African Silverbill,afrsil1/XC207432.ogg,Passeriformes
5,afrsil1,African Silverbill,afrsil1/XC209513.ogg,Passeriformes
6,afrsil1,African Silverbill,afrsil1/XC234994.ogg,Passeriformes
7,afrsil1,African Silverbill,afrsil1/XC317039.ogg,Passeriformes
8,afrsil1,African Silverbill,afrsil1/XC322742.ogg,Passeriformes
9,afrsil1,African Silverbill,afrsil1/XC344134.ogg,Passeriformes
