In [1]:
import pandas as pd

# Read the BirdCLEF 2022 training metadata CSV into a DataFrame
metadata_path = '/kaggle/input/birdclef-2022/train_metadata.csv'
metadata_df = pd.read_csv(metadata_path)

# One row per sample, so the row count is the sample count
total_samples = metadata_df.shape[0]
print(f"Total number of samples: {total_samples}")

# Tally rows per rating value; sort_index() orders the table by rating
# instead of by frequency (the value_counts() default)
ratings_count = metadata_df['rating'].value_counts().sort_index()
print("\nNumber of samples per each rating:", ratings_count, sep="\n")


Total number of samples: 14852

Number of samples per each rating:
rating
0.0     570
0.5      30
1.0     155
1.5     152
2.0     575
2.5     724
3.0    1957
3.5    1765
4.0    3974
4.5    1632
5.0    3318
Name: count, dtype: int64


In [1]:
import pandas as pd

# Read the eBird taxonomy CSV shipped with the competition data
taxonomy_path = '/kaggle/input/birdclef-2022/eBird_Taxonomy_v2021.csv'
taxonomy_df = pd.read_csv(taxonomy_path)

# Taxonomy columns that get a per-column summary below
columns_to_analyze = [
    'CATEGORY',
    'SPECIES_CODE',
    'PRIMARY_COM_NAME',
    'SCI_NAME',
    'ORDER1',
    'FAMILY',
    'SPECIES_GROUP',
]

def analyze_column(df, column_name):
    """Print a short summary of one DataFrame column.

    The report consists of, in order: the column name, the number of
    distinct values (``Series.nunique()``), the number of null entries
    (``Series.isnull().sum()``) and the frequency table produced by
    ``Series.value_counts()``.

    Args:
        df: DataFrame that holds the column.
        column_name: Label of the column to summarize.

    Returns:
        None. The summary is written to stdout.
    """
    # Header goes out first so it is visible even if the lookup below fails
    print(f"\nColumn: {column_name}")

    series = df[column_name]

    # Distinct values
    print(f"Total number of unique values: {series.nunique()}")

    # Null entries
    print(f"Number of missing values: {series.isnull().sum()}")

    # Frequency of every distinct value
    print(f"\nCount of occurrences for each unique value:\n{series.value_counts()}")

# Emit one summary per selected taxonomy column
for column in columns_to_analyze:
    analyze_column(df=taxonomy_df, column_name=column)



Column: CATEGORY
Total number of unique values: 8
Number of missing values: 0

Count of occurrences for each unique value:
CATEGORY
species       10824
issf           3761
slash           788
spuh            666
hybrid          545
form            120
intergrade       34
domestic         15
Name: count, dtype: int64

Column: SPECIES_CODE
Total number of unique values: 16753
Number of missing values: 0

Count of occurrences for each unique value:
SPECIES_CODE
ostric2    1
rolcis2    1
rolcis4    1
rolcis3    1
borcis1    1
          ..
grebar3    1
linbar1    1
y00399     1
brhbar1    1
bird1      1
Name: count, Length: 16753, dtype: int64

Column: PRIMARY_COM_NAME
Total number of unique values: 16753
Number of missing values: 0

Count of occurrences for each unique value:
PRIMARY_COM_NAME
Common Ostrich                    1
Rock-loving Cisticola             1
Rock-loving Cisticola (Huambo)    1
Rock-loving Cisticola (Lazy)      1
Boran Cisticola                   1
                   

In [1]:
import pandas as pd

# Read the BirdCLEF 2022 training metadata CSV into a DataFrame
metadata_path = '/kaggle/input/birdclef-2022/train_metadata.csv'
metadata_df = pd.read_csv(metadata_path)

# Keep only the rows whose rating is 3 or higher
filtered_metadata_df = metadata_df.loc[metadata_df['rating'] >= 3]

# Report how many rows survive the filter
filtered_samples = filtered_metadata_df.shape[0]
print(f"Total number of samples with rating >= 3: {filtered_samples}")

# Tally the surviving rows per rating value, ordered by rating
filtered_ratings_count = filtered_metadata_df['rating'].value_counts().sort_index()
print("\nNumber of samples per each rating (rating >= 3):", filtered_ratings_count, sep="\n")


Total number of samples with rating >= 3: 12646

Number of samples per each rating (rating >= 3):
rating
3.0    1957
3.5    1765
4.0    3974
4.5    1632
5.0    3318
Name: count, dtype: int64


In [2]:
# Metadata columns that get a per-column summary
columns_to_analyze = [
    'primary_label',
    'secondary_labels',
    'type',
]

def analyze_column(df, column_name):
    """Write a brief profile of a single DataFrame column to stdout.

    Printed in this order: the column name, how many distinct values it has
    (``Series.nunique()``), how many entries are null
    (``Series.isnull().sum()``), and the ``Series.value_counts()`` table.

    Args:
        df: DataFrame containing the column.
        column_name: Label of the column to profile.

    Returns:
        None.
    """
    # Print the header before touching the data, so it appears even if the
    # column lookup raises
    print(f"\nColumn: {column_name}")

    column_values = df[column_name]

    distinct_total = column_values.nunique()
    print(f"Total number of unique values: {distinct_total}")

    null_total = column_values.isnull().sum()
    print(f"Number of missing values: {null_total}")

    frequency_table = column_values.value_counts()
    print(f"\nCount of occurrences for each unique value:\n{frequency_table}")

# Emit one summary per selected column of the rating-filtered metadata
for column in columns_to_analyze:
    analyze_column(df=filtered_metadata_df, column_name=column)



Column: primary_label
Total number of unique values: 151
Number of missing values: 0

Count of occurrences for each unique value:
primary_label
mallar3    441
norcar     441
skylar     427
brnowl     421
bcnher     400
          ... 
akikik       2
layalb       2
hawpet1      2
hawhaw       2
maupar       1
Name: count, Length: 151, dtype: int64

Column: secondary_labels
Total number of unique values: 249
Number of missing values: 0

Count of occurrences for each unique value:
secondary_labels
[]                                                 11451
['houspa']                                            81
['mallar3']                                           59
['moudov']                                            58
['houfin']                                            43
                                                   ...  
['comgal1', 'lesyel']                                  1
['cacgoo1']                                            1
['redjun', 'wesmea']                        

In [4]:
from collections import Counter

# Columns whose cells hold a list written out as a string
columns_to_analyze = [
    'secondary_labels',
    'type',
]

def analyze_column_individual(df, column_name):
    """Count individual items across a column of stringified Python lists.

    Every non-null entry of ``df[column_name]`` is parsed as a Python literal
    (expected form: ``"['item1', 'item2', ...]"``) and its items are tallied
    in a single ``Counter``. The number of distinct items and the full
    counter are printed.

    Args:
        df: DataFrame containing the column.
        column_name: Label of a column whose entries are list literals stored
            as strings.

    Returns:
        None. The report is written to stdout.

    Raises:
        ValueError, SyntaxError: If an entry is not a valid Python literal.
    """
    # Imported here so the cell does not depend on an import made elsewhere
    import ast

    print(f"\nAnalyzing individual values within the column: {column_name}")

    # Running tally of every item seen in any entry
    item_counter = Counter()

    # Null entries are skipped; they carry no items
    for entry in df[column_name].dropna():
        # SECURITY: this used to be eval(entry), which would execute any
        # expression found in the CSV. literal_eval accepts only literals and
        # raises on anything else.
        items = ast.literal_eval(entry)
        item_counter.update(items)

    # Report the number of distinct items and the per-item counts
    unique_items = len(item_counter)
    print(f"Total number of unique items: {unique_items}")
    print(f"\nCount of occurrences for each unique item:\n{item_counter}")

# Emit the per-item tally for every list-valued column of the filtered metadata
for column in columns_to_analyze:
    analyze_column_individual(df=filtered_metadata_df, column_name=column)



Analyzing individual values within the column: secondary_labels
Total number of unique items: 100

Count of occurrences for each unique item:
Counter({'moudov': 127, 'houspa': 106, 'houfin': 100, 'mallar3': 82, 'gamqua': 75, 'apapan': 67, 'normoc': 62, 'norcar': 61, 'wesmea': 61, 'cangoo': 56, 'iiwi': 42, 'skylar': 41, 'warwhe1': 40, 'bknsti': 34, 'hawama': 32, 'gnwtea': 30, 'zebdov': 28, 'comgal1': 26, 'commyn': 24, 'dunlin': 22, 'arcter': 19, 'rinphe': 18, 'eurwig': 16, 'lesyel': 15, 'omao': 14, 'laugul': 14, 'redjun': 13, 'belkin1': 13, 'amewig': 13, 'rorpar': 13, 'leasan': 12, 'ribgul': 12, 'gadwal': 12, 'calqua': 12, 'norsho': 11, 'pibgre': 11, 'bkbplo': 11, 'lobdow': 11, 'sora': 9, 'spodov': 9, 'jabwar': 9, 'glwgul': 9, 'osprey': 8, 'rudtur': 8, 'norpin': 8, 'wiltur': 8, 'grbher3': 7, 'rocpig': 6, 'leater1': 5, 'snogoo': 5, 'caster1': 5, 'comwax': 5, 'bongul': 5, 'bcnher': 4, 'categr': 4, 'semplo': 4, 'cacgoo1': 4, 'fragul': 4, 'pecsan': 4, 'kauama': 3, 'aniani': 3, 'brant': 3, 

In [5]:
from collections import Counter

# Columns whose cells hold a list written out as a string
columns_to_analyze = [
    'secondary_labels',
    'type',
]

def analyze_column_individual_case_insensitive(df, column_name):
    """Count individual items across a column of stringified lists, ignoring case.

    Every non-null entry of ``df[column_name]`` is parsed as a Python literal
    (expected form: ``"['Item1', 'item2', ...]"``). Each item is lowercased
    before being tallied in a single ``Counter``, so items differing only in
    case are counted together. The number of distinct items and the full
    counter are printed.

    Args:
        df: DataFrame containing the column.
        column_name: Label of a column whose entries are lists of strings
            stored as text.

    Returns:
        None. The report is written to stdout.

    Raises:
        ValueError, SyntaxError: If an entry is not a valid Python literal.
    """
    # Imported here so the cell does not depend on an import made elsewhere
    import ast

    print(f"\nAnalyzing individual values within the column (case-insensitive): {column_name}")

    # Running tally of every lowercased item seen in any entry
    item_counter = Counter()

    # Null entries are skipped; they carry no items
    for entry in df[column_name].dropna():
        # SECURITY: this used to be eval(entry), which would execute any
        # expression found in the CSV. literal_eval accepts only literals and
        # raises on anything else.
        items = [item.lower() for item in ast.literal_eval(entry)]
        item_counter.update(items)

    # Report the number of distinct items and the per-item counts
    unique_items = len(item_counter)
    print(f"Total number of unique items: {unique_items}")
    print(f"\nCount of occurrences for each unique item (case-insensitive):\n{item_counter}")

# Emit the case-insensitive per-item tally for every list-valued column
for column in columns_to_analyze:
    analyze_column_individual_case_insensitive(df=filtered_metadata_df, column_name=column)



Analyzing individual values within the column (case-insensitive): secondary_labels
Total number of unique items: 100

Count of occurrences for each unique item (case-insensitive):
Counter({'moudov': 127, 'houspa': 106, 'houfin': 100, 'mallar3': 82, 'gamqua': 75, 'apapan': 67, 'normoc': 62, 'norcar': 61, 'wesmea': 61, 'cangoo': 56, 'iiwi': 42, 'skylar': 41, 'warwhe1': 40, 'bknsti': 34, 'hawama': 32, 'gnwtea': 30, 'zebdov': 28, 'comgal1': 26, 'commyn': 24, 'dunlin': 22, 'arcter': 19, 'rinphe': 18, 'eurwig': 16, 'lesyel': 15, 'omao': 14, 'laugul': 14, 'redjun': 13, 'belkin1': 13, 'amewig': 13, 'rorpar': 13, 'leasan': 12, 'ribgul': 12, 'gadwal': 12, 'calqua': 12, 'norsho': 11, 'pibgre': 11, 'bkbplo': 11, 'lobdow': 11, 'sora': 9, 'spodov': 9, 'jabwar': 9, 'glwgul': 9, 'osprey': 8, 'rudtur': 8, 'norpin': 8, 'wiltur': 8, 'grbher3': 7, 'rocpig': 6, 'leater1': 5, 'snogoo': 5, 'caster1': 5, 'comwax': 5, 'bongul': 5, 'bcnher': 4, 'categr': 4, 'semplo': 4, 'cacgoo1': 4, 'fragul': 4, 'pecsan': 4, 