This kernel uses multiprocessing across all the cores of the CPU to cut the audio recordings into 5-second clips. I used it to create the audio clip dataset on my local system, but I am not able to upload the data. 

I have used W&B artifacts to upload the resulting CSV file that contains the metadata. You can use the code snippet below to download it. 

```
import wandb
run = wandb.init()
artifact = run.use_artifact('ayush-thakur/birdclef/audio_clips_5sec:v0', type='dataset')
artifact_dir = artifact.download()
```

# Imports and Setups

In [None]:
import os
os.environ['WANDB_SILENT'] = "true"
import re
import gc
import glob
import wandb
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
%matplotlib inline

# Multiprocessing 
from multiprocessing import Pool
from multiprocessing import cpu_count

# Audio specific imports
import librosa as lb
import librosa.display
import soundfile as sf

# W&B login
wandb.login()

In [None]:
# Collect the path of every .ogg recording located exactly one directory
# level below train_short_audio/ (clip_audio_dataset later reads that
# directory name as the label).
TRAIN_FILES = glob.glob("../input/birdclef-2021/train_short_audio/*/*.ogg")
print(len(TRAIN_FILES))

In [None]:
# Root output directory for the generated clips. The trailing slash matters:
# later cells build paths by plain string concatenation with SAVE_DIR.
SAVE_DIR = 'birdclef_5second/'
os.makedirs(SAVE_DIR, exist_ok=True)
# List whatever is already in the output directory.
!ls {SAVE_DIR}

# Create Audio Clips using Multiprocessing

In [None]:
def chunk(l, n):
    """Lazily split ``l`` into consecutive slices of at most ``n`` items.

    Every yielded slice holds exactly ``n`` items except possibly the last
    one, which holds whatever remains.
    """
    # Start offsets of the successive slices: 0, n, 2n, ...
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)

In [None]:
# Use one worker process per available CPU core.
procs = cpu_count()
print(procs)
procIDs = list(range(procs))
# Number of audio files each process will handle: ceiling division, so the
# per-process share is rounded up and no file is left without a chunk.
numImagesPerProc = -(-len(TRAIN_FILES) // procs)
# Split the audio paths into (approximately) equal sets, one set for each
# individual process.
chunkedPaths = list(chunk(TRAIN_FILES, numImagesPerProc))

In [None]:
def clip_audio_dataset(audio_paths):
    """Cut every recording in ``audio_paths`` into consecutive 5-second clips.

    Each clip is written to ``SAVE_DIR/<label>/<file_name>_<i>.ogg`` where
    ``<label>`` is the name of the recording's parent directory and ``<i>`` is
    the zero-based clip index. Audio left over after the last complete clip
    is not written.
    """
    for audio_path in tqdm(audio_paths):
        path_parts = audio_path.split('/')
        # The parent directory name is the label.
        label = path_parts[-2]
        # File name up to its first dot.
        file_name = path_parts[-1].split('.')[0]

        # One output directory per label.
        os.makedirs(SAVE_DIR+label, exist_ok=True)

        # Decode the recording; sr is the number of samples per second.
        audio, sr = lb.load(audio_path)
        samples_per_clip = sr*5
        # Whole seconds of audio, then the number of complete 5-second clips.
        n_clips = (len(audio)//sr)//5

        for i in range(n_clips):
            clip_start = i*samples_per_clip
            audio_clip = audio[clip_start:clip_start+samples_per_clip]
            # Write as .ogg file
            sf.write(f'{SAVE_DIR+label}/{file_name}_{i}.ogg', audio_clip, sr, format='ogg', subtype='vorbis')

In [None]:
print("[INFO] launching pool using {} processes...".format(procs))
pool = Pool(processes=procs)
# map() blocks until every chunk has been processed and re-raises a worker
# exception here. An imap() iterator that is never consumed would drop worker
# errors silently and leave an incomplete clip set unnoticed.
# chunksize=1 hands each path chunk to the pool as its own task.
pool.map(clip_audio_dataset, chunkedPaths, chunksize=1);

In [None]:
# close the pool and wait for all processes to finish
print("[INFO] waiting for processes to finish...")
# close() stops the pool from accepting further tasks; it has to be called
# before join().
pool.close()
# join() blocks until the worker processes have exited.
pool.join()
print("[INFO] multiprocessing complete")

In [None]:
# Count every clip written under SAVE_DIR/<label>/.
generated_clips = glob.glob(f"{SAVE_DIR}*/*.ogg")
print('The number of audio clip files generated: ', len(generated_clips))

> This is almost 10 times the original number of recordings.

# Create `train.csv` file

In [None]:
# Load the metadata table; its `filename` column is what prepare_train_df
# matches the clips against.
# NOTE(review): this is read from the working directory, whereas the audio is
# globbed from ../input/birdclef-2021/ — confirm the file is present here.
meta_df = pd.read_csv('train_metadata.csv')
meta_df.head()

In [None]:
# Gather the path of every generated clip (SAVE_DIR/<label>/<clip>.ogg).
clip_pattern = f"{SAVE_DIR}*/*.ogg"
AUDIO_CLIPS = glob.glob(clip_pattern)
print(f'Number of audio clips: {len(AUDIO_CLIPS)}')
# Show one path as a sanity check of the naming scheme.
AUDIO_CLIPS[0]

In [None]:
# Fixed number of worker processes for building the metadata table.
procs = 12
print(procs)
procIDs = list(range(procs))
# Number of audio clips each process will handle: ceiling division, so the
# per-process share is rounded up and no clip is left without a chunk.
numImagesPerProc = -(-len(AUDIO_CLIPS) // procs)
# Split the clip paths into (approximately) equal sets, one set for each
# individual process.
chunkedPaths = list(chunk(AUDIO_CLIPS, numImagesPerProc))

In [None]:
# Directory that receives the per-process CSV files written by prepare_train_df.
os.makedirs('train_clips/', exist_ok=True)

In [None]:
def prepare_train_df(audio_paths):
    """Build the metadata rows for one chunk of clips and save them as a CSV.

    For every clip path, the row(s) of the global ``meta_df`` whose
    ``filename`` equals the clip's source recording are copied with
    ``filename`` set to the clip's own file name. Clips without a matching
    metadata row contribute nothing. The result is written, index included,
    to ``train_clips/train_<pid>_<first clip name>.csv``.

    Args:
        audio_paths: list of clip paths named ``<recording>_<index>.ogg``.

    Returns:
        None. The only output is the CSV file.
    """
    rows = []

    # Iterate over individual audio paths.
    for audio_path in tqdm(audio_paths):
        audio_clip_name = audio_path.split('/')[-1].split('.')[0]
        # Strip only the trailing "_<index>" so that a recording name that
        # itself contains underscores is still matched.
        filename = audio_clip_name.rsplit('_', 1)[0]
        row = meta_df.loc[meta_df['filename'] == f'{filename}.ogg']
        if not row.empty:
            # Only the filename column is rewritten; all other metadata is
            # carried over from the source recording unchanged.
            rows.append(row.assign(filename=f'{audio_clip_name}.ogg'))

    # Concatenate once: DataFrame.append was removed in pandas 2.0, and
    # growing a frame row by row is quadratic in the number of clips.
    if rows:
        train_df = pd.concat(rows, ignore_index=True)
    else:
        train_df = pd.DataFrame(columns=meta_df.columns)

    # A pool worker can be handed more than one chunk, so the pid alone does
    # not identify a chunk. Adding the chunk's first clip name keeps a second
    # chunk from overwriting the first one's CSV.
    if audio_paths:
        chunk_tag = audio_paths[0].split('/')[-1].split('.')[0]
    else:
        chunk_tag = 'empty'
    pid = os.getpid()
    train_df.to_csv(f'train_clips/train_{pid}_{chunk_tag}.csv')

In [None]:
print("[INFO] launching pool using {} processes...".format(procs))
# Block until every chunk's CSV has been written: the next cell globs
# train_clips/*.csv immediately, so a non-blocking imap() without
# close()/join() would let it run while workers are still writing.
# map() also re-raises worker exceptions here instead of dropping them.
# chunksize=1 hands each path chunk to the pool as its own task.
with Pool(processes=procs) as pool:
    pool.map(prepare_train_df, chunkedPaths, chunksize=1)

In [None]:
# glob returns matches in arbitrary order; sorting makes the order in which
# the chunk files are concatenated independent of the directory listing order.
chunked_files = sorted(glob.glob("train_clips/*.csv"))
print(f'Number of chunks: {len(chunked_files)}')

In [None]:
# Read each per-chunk CSV. Its first column is the index that to_csv wrote
# out, so load it as the index instead of keeping it as a data column.
df_arr = [pd.read_csv(chunked_file, index_col=0) for chunked_file in chunked_files]

# ignore_index gives the combined table one continuous index rather than
# index labels that repeat for every chunk.
train_df = pd.concat(df_arr, ignore_index=True)

In [None]:
# Reference: https://www.kaggle.com/shahules/bird-watch-complete-eda-fe
# Number of clips per primary label (eBird code), most frequent first.
species = train_df['primary_label'].value_counts()

# Make bar chart
fig = go.Figure(data=[go.Bar(y=species.values, x=species.index)],
                layout=go.Layout(margin=go.layout.Margin(l=0, r=0, b=10, t=50)))

# Show chart (title typo "traning" fixed; axis titles added so the figure
# can be read on its own).
fig.update_layout(title='Number of training samples per species',
                  xaxis_title='primary_label',
                  yaxis_title='Number of clips')
fig.show()

# Save as W&B Artifact

In [None]:
# Write the combined clip metadata to a single CSV, without the index column.
train_df.to_csv('train_clips.csv', index=False)

# Start a W&B run in the 'birdclef' project to hold the upload.
run = wandb.init(project='birdclef', group='Dataset Creation')

# The returned handle is not used afterwards.
# NOTE(review): presumably declared only so that W&B records the source
# metadata artifact as an input of this run — confirm.
raw_dataset = run.use_artifact('ayush-thakur/birdclef/train-metadata:v0', type='dataset')

# Package the CSV as a dataset artifact, log it, then end the run.
artifact = wandb.Artifact('audio_clips_5sec', type='dataset')
artifact.add_file('train_clips.csv')
run.log_artifact(artifact)
run.finish()

In [None]:
# Record the environment this notebook ran in: Python version, platform
# string, and CPU core count.
!python --version
import platform
print(platform.platform())
print("cpu cores: {0}".format(cpu_count()))

> This notebook was run on a GCP instance with 16 CPU cores. I hope you find this kernel useful for your own experiments.