In [7]:
import pandas as pd
import re

In [22]:
#Function to filter through the "Comments" column since the StartDateTime column is unreliable
def extract_time(text):
    pattern = r'\d{2}:\d{2}:\d{2}'
    match = re.search(pattern, text)
    
    if match:
        return match.group()
    else:
        return None


def random_stratified_sample(path):
    # Import csv into a pd dataframe 
    data = pd.read_csv(path, low_memory = False)
    
    # Filter out all invalid clips
    data = data[data['Duration'] >= 60] 
    
    # Add a time of day column, representing only the hour of the day, filtering out of the "Comments" column
    data['StartDateTime'] = data['Comment'].apply(extract_time).astype(str)
    data['StartDateTime'] = pd.to_datetime(data['StartDateTime'], format = '%H:%M:%S')
    data['Time of Day'] = data['StartDateTime'].dt.strftime('%H')
    
    grouped_data = data.groupby(['Time of Day', 'AudioMothCode'])
    stratified_sample = pd.DataFrame()
    
    # Random stratified selection
    for name, group in grouped_data:
        selected_sample = group.sample(n=1)
        stratified_sample = pd.concat([stratified_sample, selected_sample])
    grouped_stratified_sample = stratified_sample.groupby('AudioMothCode')
    random_stratified_sample = pd.DataFrame(columns=stratified_sample.columns)
    
    # Make sure each sample has all 24 hours of the day
    for name, group in grouped_stratified_sample:
        if(len(group) == 24):
            random_stratified_sample = pd.concat([random_stratified_sample, group], ignore_index = True)
    
    # Export to csv
    random_stratified_sample.to_csv("stratified_random_sample.csv", index=False)

In [23]:
# Test
random_stratified_sample('Peru_2019_AudioMoth_Data_Full.csv')

  random_stratified_sample = pd.concat([random_stratified_sample, group], ignore_index = True)
