In [None]:
import pandas as pd
import numpy as np
import os

# Pick the soundscape folder: use the test folder when it actually contains
# audio (private test set), otherwise fall back to the train soundscapes.
testdir = os.listdir('../input/birdclef-2021/test_soundscapes/')
has_test_audio = any(".ogg" in entry for entry in testdir)
path = (
    '../input/birdclef-2021/test_soundscapes/'
    if has_test_audio
    else '../input/birdclef-2021/train_soundscapes/'
)

# The private test set also holds non-sound files; keep only names containing ".ogg"
files = os.listdir(path)
files2 = [entry for entry in files if ".ogg" in entry]

# Wrap-around distance for time (360-"day" year) or longitude (360 degrees)
def long_dist(a, b):
    """Return the shortest distance between a and b on a circle of length 360.

    The raw difference is first reduced modulo 360, so the result is always
    in [0, 180] even when the inputs are more than one full turn apart
    (a plain ``360 - |a - b|`` would go negative there).

    Scalars and numpy arrays are both accepted; arrays are handled
    element-wise with normal broadcasting.
    """
    diff = np.abs(a - b) % 360
    # Going the other way around the circle may be shorter.
    return np.minimum(diff, 360 - diff)

# Reading train metadata
df = pd.read_csv("../input/birdclef-2021/train_metadata.csv")
# Take an explicit copy: writing converted columns into a bare column
# selection of `df` triggers pandas' SettingWithCopyWarning.
df1 = df[['primary_label', 'latitude', 'longitude', 'date']].copy()
df1[['latitude', 'longitude']] = df1[['latitude', 'longitude']].apply(pd.to_numeric)

# primary_label -> list of [latitude, longitude, day_of_year], one entry per
# metadata row, in file order.
geotime_dict = {}
# Walk the four columns in lockstep rather than `df1.iloc[i]` + `row[0]`:
# integer keys on a label-indexed Series are a deprecated positional lookup,
# and building a Series per row is slow.
for name, lat, lon, date in zip(
    df1['primary_label'], df1['latitude'], df1['longitude'], df1['date']
):
    # Fields 1 and 2 of the "-"-separated date are used as month and day
    # (presumably YYYY-MM-DD - confirm against the CSV).
    date_parts = date.split("-")
    # Same 30-day-month approximation that get_lat_long_time uses, so both
    # sides of the time distance live on the same scale.
    day_of_year = int(date_parts[1]) * 30 + int(date_parts[2])

    geotime_dict.setdefault(name, []).append([lat, lon, day_of_year])

# Soft lock - no hard threshold, just declining probabilities
def soft_lock(geotime, lock_begin, lock_end):
    """Turn a distance into a weight between 0 and 1.

    Below ``lock_begin`` the weight is 1, from ``lock_end`` upwards it is 0,
    and in between it falls linearly from 1 to 0. The result is rounded to
    three decimals.
    """
    if geotime < lock_begin:
        return np.round(1, 3)
    if geotime < lock_end:
        ramp_width = lock_end - lock_begin
        return np.round((lock_end - geotime) / ramp_width, 3)
    return np.round(0, 3)

def get_birds_possibility(day, latitude, longitude, percent_cases, day_to_degree, geotime_lock_begin, geotime_lock_end):
    """Return one soft-lock weight per species for a recording site and day.

    For every entry of the module-level ``geotime_dict`` the distance from
    (latitude, longitude, day) to each stored [lat, long, day_of_year] record
    is computed, the distance of the ``percent_cases``-percent closest record
    is taken, and that distance is mapped to a weight with ``soft_lock``.

    Args:
        day: day-of-year value of the soundscape, on the same scale as the
            third element of the ``geotime_dict`` records.
        latitude: site latitude in degrees.
        longitude: site longitude in degrees.
        percent_cases: percentage of a species' records (closest first) whose
            farthest member defines the species' distance. Values that would
            select no record are clamped so the nearest record is used.
        day_to_degree: factor converting a day difference into degrees
            (e.g. 0.06 is 1.8 degrees per 30 days).
        geotime_lock_begin: distance below which the weight is 1.
        geotime_lock_end: distance from which the weight is 0.

    Returns:
        A list of weights in ``geotime_dict`` iteration order.
    """
    geotime_array = []
    for records in geotime_dict.values():
        # Distance from the query point to every stored record of this species
        distances = []
        for rec_lat, rec_long, rec_day in records:
            lat_diff = np.abs(rec_lat - latitude)
            long_diff = long_dist(rec_long, longitude)
            time_diff = long_dist(rec_day, day) * day_to_degree

            # 3d distance with scaled time as the third dimension
            distances.append(
                np.sqrt(lat_diff * lat_diff + long_diff * long_diff + time_diff * time_diff)
            )

        distances.sort()

        # Rank of the n% closest record. Clamp to [1, len] so that a zero or
        # negative percentage cannot index an empty slice (IndexError) or
        # silently slice from the wrong end.
        rank = int(np.ceil(len(distances) * percent_cases / 100))
        rank = min(max(rank, 1), len(distances))
        reference_distance = distances[rank - 1]

        geotime_array.append(
            soft_lock(reference_distance, geotime_lock_begin, geotime_lock_end)
        )

    return geotime_array

# Get recording site and date
def get_lat_long_time(f):
    fdate = f.split("_")[2]
    fdate = fdate.split(".")[0]
    month = int(fdate[4:6])
    day = int(fdate[6:8])
    day_of_year = month * 30 + day

    point_loc = {'COL': [5.57, -75.85], 'COR': [10.12, -84.51], 'SNE': [38.49, -119.95], 'SSW': [42.47, -76.45]}
    scape_lat = 0
    scape_long = 0
    if "COL" in f:
        scape_lat = point_loc['COL'][0]
        scape_long = point_loc['COL'][1]
    if "COR" in f:
        scape_lat = point_loc['COR'][0]
        scape_long = point_loc['COR'][1]
    if "SNE" in f:
        scape_lat = point_loc['SNE'][0]
        scape_long = point_loc['SNE'][1]
    if "SSW" in f:
        scape_lat = point_loc['SSW'][0]
        scape_long = point_loc['SSW'][1]

    return day_of_year, scape_lat, scape_long

# Soundscape file name -> list of per-species geotime lock weights
geotimelock_perfile = {}
for f in files2:
    day, scape_lat, scape_long = get_lat_long_time(f)

    # Parameters were tuned for my models by validating on train soundscapes
    geotime_lock_array = get_birds_possibility(
        day,
        scape_lat,
        scape_long,
        percent_cases=4,
        day_to_degree=0.06,
        geotime_lock_begin=5,
        geotime_lock_end=7,
    )

    # Optional adjustment that might depend on your model:
    # Another post-processing step that I performed was to check whether any bird had a lot of false positives, by validating on short audio clips.
    # The majority were okay (~30% false positive/true positive ratio on average), but grhowl (and only grhowl) had more false positives than true positives.
    # Any sufficiently noisy recording was predicted as grhowl. Deleting grhowl improved CV by 0.016.
    # grhowl is evil. Woo-Hoo. Hoo. Hoo.

    # geotime_lock_array[164] = 0

    geotimelock_perfile[f] = geotime_lock_array
    print(f"For soundscape {f} geotime lock is: {geotime_lock_array}")

# Usage instructions

# ...Somewhere later in actual predicting code of your submission
# Answers are expected to be sigmoid outputs in the 0..1 range
# answers is a 397-long array holding your model/ensemble prediction for a 5-second soundscape slice or equivalent

# answers = answers * geotimelock_perfile[current_file]

# Warning: Using such post-processing might drastically change optimal threshold.
# Peak validation F1 on train soundscapes with my models (F1/threshold):
#   No geotime lock and no grhowl deletion: 0.465/0.3, 0.625/0.5, 0.668/0.7, peak 0.67/0.74
#   No geotime lock:                        0.605/0.3, 0.686/0.5, 0.671/0.7, peak 0.686/0.5
#   With geotime lock:                      0.738/0.3, 0.713/0.5, 0.682/0.7, peak 0.741/0.32
# Make sure that you adjust your threshold based on train soundscapes validation before submitting!