# Train and validation split

This kernel takes train_metadata.csv and splits it into train and validation sets. Each sample in the sets is a 5-second segment of an audio file.

This kernel works like this:
1. Each bird has some audio files. Split those audio files to train and validation.
2. For each audio file, split it to segments of 5 seconds.
3. The data is extremely unbalanced, so I repeatedly re-sample segments in order to balance it (I trust the augmentation to ensure that the network will *not* see the exact same example over and over).

In [None]:
import pandas as pd
import numpy as np
import torchaudio
import json
from tqdm import tqdm

In [None]:
# Root directory of the BirdCLEF 2022 input data.
BASE_DIR = '../input/birdclef-2022'
# Share of the data assigned to training: used as the fraction of files per
# bird, or as the fraction of frames when a bird has a single file.
TRAIN_RATIO = 2.0/3.0

# One row per audio file; later cells read the primary_label,
# secondary_labels, filename and rating columns.
metadata = pd.read_csv(f'{BASE_DIR}/train_metadata.csv')
# Labels loaded from scored_birds.json, stored as a set because they are only
# used for membership tests and iteration against primary_label.
with open(f'{BASE_DIR}/scored_birds.json') as json_file:
    scored_birds = set(json.load(json_file))

### Phase 1 - split audio files to train and validation

In [None]:
def sample(df: pd.DataFrame):
    """Split the rows of ``df`` (one row per audio file) into train and validation.

    Both returned frames carry two extra columns, ``start`` and ``end``:

    * Two or more files: whole files are assigned to one side or the other
      (``TRAIN_RATIO`` of them go to train, or exactly one of two), and
      ``start``/``end`` are NaN, meaning "use the whole file".
    * Exactly one file: the file itself is cut at ``TRAIN_RATIO`` of its
      length. Train gets frames ``[0, cut)`` and validation ``[cut, n_frames)``.
    * No files: two empty frames (with the ``start``/``end`` columns) are
      returned instead of failing on ``df.iloc[0]``.

    The file-level draw uses a fixed ``random_state`` so the split is
    reproducible.

    Args:
        df: Metadata rows to split. Must have a ``filename`` column when it
            holds exactly one row.

    Returns:
        A ``(train, val)`` tuple of DataFrames.
    """
    if df.empty:
        # Nothing to split: keep the output schema stable for pd.concat callers.
        train = df.copy()
        val = df.copy()
        train['start'] = train['end'] = val['start'] = val['end'] = np.nan
        return train, val

    if len(df) > 1:
        # With exactly two files, force a 1/1 split so validation is never empty.
        frac = 0.5 if len(df) == 2 else TRAIN_RATIO
        train = df.sample(frac=frac, random_state=123)
        val = df.drop(train.index)
        # NaN bounds mean the whole file belongs to this side of the split.
        train['start'] = train['end'] = val['start'] = val['end'] = np.nan
    else:
        # In case a bird has only one file, split it in the middle of the file
        frames, _ = torchaudio.load('../input/birdclef-2022/train_audio/' + df.iloc[0].filename)
        # The second dimension of the loaded tensor is used as the frame count.
        n_frames = frames.shape[1]
        train = df.copy()
        val = df.copy()
        train['start'] = 0
        train['end'] = val['start'] = int(TRAIN_RATIO*n_frames)
        val['end'] = n_frames

    return train, val


# Split the metadata dataframe into 2 dataframes - train and validation
def train_val_split(metadata_df, scored_birds):
    """Split the file-level metadata into train and validation frames.

    Every scored bird is split on its own via ``sample`` so that each one is
    represented on both sides. All remaining rows (birds that are not scored)
    are then split together as a single group.

    Args:
        metadata_df: One row per audio file, with a ``primary_label`` column.
        scored_birds: Iterable of labels that must be split individually.

    Returns:
        A ``(train, val)`` tuple of DataFrames, each with the columns that
        ``sample`` adds (``start`` and ``end``).
    """
    train_df_arr = []
    val_df_arr = []

    # First split all the scored birds one by one.
    # Iterating in sorted order keeps the row order of the result identical
    # between runs (iteration order of a set of strings is not stable).
    for bird in sorted(scored_birds):
        curr_metadata = metadata_df[metadata_df.primary_label == bird]
        if curr_metadata.empty:
            # A scored bird without any recording has nothing to split.
            continue
        curr_train, curr_val = sample(curr_metadata)
        train_df_arr.append(curr_train)
        val_df_arr.append(curr_val)
        # Remove the handled rows so that only non-scored birds remain at the end.
        metadata_df = metadata_df.drop(curr_metadata.index)

    # Now split all the other birds metadata.
    if not metadata_df.empty:
        curr_train, curr_val = sample(metadata_df)
        train_df_arr.append(curr_train)
        val_df_arr.append(curr_val)

    if not train_df_arr:
        # No rows at all: pd.concat would raise on an empty list, so return
        # two empty frames that still expose the start/end columns.
        empty = metadata_df.iloc[:0].assign(start=np.nan, end=np.nan)
        return empty, empty.copy()

    return pd.concat(train_df_arr), pd.concat(val_df_arr)

In [None]:
# Run phase 1 on the full metadata table: each result has one row per audio
# file plus the 'start'/'end' columns added by the split.
train_metadata, val_metadata = train_val_split(metadata, scored_birds)

### Phase 2 - split audio files to 5-second segments
At this point, I assume that most of the segments contain the sound of the relevant bird. This may not hold for every segment, but it is a reasonable starting point.

In [None]:
def split_row_to_segments(row, segment_seconds=5, min_last_seconds=1):
    """Cut the audio range described by ``row`` into fixed-length segments.

    The usable range is ``[row['start'], row['end'])`` in frames. A missing
    (None/NaN) bound falls back to the beginning or the end of the file.
    Segments never extend past the end of that range, so a file that was
    divided between train and validation does not leak frames from one side
    into the other.

    Args:
        row: Metadata row with ``filename``, ``primary_label``,
            ``secondary_labels``, ``rating``, ``start`` and ``end``.
        segment_seconds: Length of each segment, in seconds.
        min_last_seconds: A trailing segment shorter than this many seconds
            is dropped.

    Returns:
        DataFrame with one row per segment and the columns ``primary_label``,
        ``secondary_labels``, ``filename``, ``start``, ``end``, ``rating``
        (``start``/``end`` in frames). Empty, with the same columns, when the
        range holds no usable segment.

    Raises:
        ValueError: If the segment length is shorter than one frame.
    """
    frames, frame_rate = torchaudio.load('../input/birdclef-2022/train_audio/' + row.filename)
    # The second dimension of the loaded tensor is used as the frame count.
    n_frames = frames.shape[1]

    start = 0
    end = n_frames
    if pd.notna(row['start']):
        start = int(row['start'])
    if pd.notna(row['end']):
        # Never address frames beyond the end of the file.
        end = min(int(row['end']), n_frames)

    segment_len = int(segment_seconds * frame_rate)
    if segment_len <= 0:
        raise ValueError('segment_seconds must cover at least one frame')

    segments = list(range(start, end, segment_len))
    # If the last segment is shorter than min_last_seconds, remove it.
    # The emptiness check covers start >= end (e.g. a zero-length range).
    if segments and end - segments[-1] < min_last_seconds * frame_rate:
        del segments[-1]

    columns = ['primary_label', 'secondary_labels', 'filename', 'start', 'end', 'rating']
    res = [
        {
            'primary_label': row.primary_label,
            'secondary_labels': row.secondary_labels,
            'filename': row.filename,
            'start': s,
            # Clamp to the row's own range, not to the whole file.
            'end': min(s + segment_len, end),
            'rating': row.rating,
        }
        for s in segments
    ]

    return pd.DataFrame(res, columns=columns)


def split_to_segments(metadata_df):
    """Expand every file-level row of ``metadata_df`` into its segment rows.

    Each row is passed to ``split_row_to_segments`` while a progress bar is
    shown; the per-file results are stacked into one DataFrame with a fresh
    0..n-1 index.
    """
    progress = tqdm(metadata_df.iterrows(), total=len(metadata_df))
    per_file_segments = [split_row_to_segments(record) for _, record in progress]
    return pd.concat(per_file_segments, ignore_index=True)

In [None]:
# Run phase 2: expand each split from one row per audio file to one row per
# segment (every audio file is loaded while doing so).
train_samples = split_to_segments(train_metadata)
val_samples = split_to_segments(val_metadata)

### Phase 3 - repeatedly sample segments in order to balance the data.

In [None]:
def count_num_primary(samples, scored_birds):
    """Return ``{bird: number of rows}`` for every bird in ``scored_birds``.

    Rows are counted by their ``primary_label``; a scored bird that never
    appears in ``samples`` is reported with a count of 0.
    """
    rows_per_label = samples.groupby('primary_label').size().to_dict()
    return {bird: int(rows_per_label.get(bird, 0)) for bird in scored_birds}

In [None]:
def over_sample(samples, scored_birds, random_state=123):
    """Repeat the segments of scored birds until every scored bird is equally frequent.

    The target count is the size of the largest scored bird. Each scored
    bird's rows are repeated as many whole times as fit into the target, and
    the remainder is filled with a random draw (without replacement) from the
    same rows. Rows of birds that are not scored are passed through once.

    Args:
        samples: Segment-level DataFrame with a ``primary_label`` column.
        scored_birds: Collection of labels to balance.
        random_state: Seed for the remainder draw, so that repeated runs
            produce the same table.

    Returns:
        A new DataFrame, grouped by ``primary_label``, with a fresh 0..n-1
        index. An empty input is returned as an empty frame.
    """
    grouped = samples.groupby('primary_label')
    # Size of the largest scored bird; 0 when no scored bird is present, in
    # which case every group simply passes through unchanged.
    mx = max(
        (int(n) for bird, n in grouped.size().items() if bird in scored_birds),
        default=0,
    )

    df_arr = []
    for bird, group in grouped:
        if bird not in scored_birds:
            df_arr.append(group)
            continue
        n_multiply, mod = divmod(mx, len(group))
        df_arr.extend([group] * n_multiply)
        if mod:
            df_arr.append(group.sample(n=mod, random_state=random_state))

    if not df_arr:
        # pd.concat raises on an empty list; nothing to balance here.
        return samples.reset_index(drop=True)

    return pd.concat(df_arr, ignore_index=True)

In [None]:
# Print the imbalance of the data before oversampling (segments per scored bird)
print(count_num_primary(train_samples, scored_birds))

In [None]:
# Balance the scored birds. The training table is replaced by its oversampled
# version, while the validation table is kept as-is in val_samples and its
# oversampled version is stored under a separate name.
train_samples = over_sample(train_samples, scored_birds)
val_samples_oversampled = over_sample(val_samples, scored_birds)

In [None]:
# Print the balance of the data after oversampling (segments per scored bird)
print(count_num_primary(train_samples, scored_birds))

In [None]:
# Write the three segment tables as CSV files into the current working
# directory, without the DataFrame index column.
train_samples.to_csv('train_samples.csv', index=False)
val_samples.to_csv('val_samples.csv', index=False)
val_samples_oversampled.to_csv('val_samples_oversampled.csv', index=False)

### Further work
This is a starting point for training, but there are still improvements that should be made:

- In phase 1: split the train and validation audio files so that the training set contains at least one file per bird voice type (the "type" column in train_metadata.csv).
- In phase 2, try to filter out segments containing only background.