In [1]:
#Russell's code for extracting the data from dataset
# Path of the decoded-song CSV; it is passed to load_single_bird_syllable_csv
# at the end of this cell.
file_path = 'USA5288_decoded.csv'

import pprint
import re
from datetime import datetime
import csv
import json
import numpy as np
#import copy

def get_ordered_syllable_for_song(song_syllable_onsets_offsets_ms):
    """Flatten a per-syllable timing dictionary into one list ordered by onset.

    song_syllable_onsets_offsets_ms (dict)
        key: syllable_label
        value: list of (onset time, offset time) pairs for that label

    Returns a list of (syllable_label, onset, offset) tuples sorted by onset.
    The sort is stable, so entries sharing an onset keep the order in which
    the dictionary yielded them.
    """
    flattened = [
        (syllable_label, onset, offset)
        for syllable_label, intervals in song_syllable_onsets_offsets_ms.items()
        for onset, offset in intervals
    ]
    # Order on the onset only (position 1 of each tuple).
    flattened.sort(key=lambda entry: entry[1])
    return flattened



def get_recording_time_from_filename(recording_file_path_name, year=2024):
    """Extract the animal id and the recording timestamp from a .wav file name.

    The name must end with
    ``<animal_id>_<digits>.<digits>_<month>_<day>_<hour>_<minute>_<second>.wav``;
    anything before that (e.g. a directory prefix) is ignored by the search.

    Args:
        recording_file_path_name: file name or path of the recording.
        year: calendar year to attach to the timestamp. The file name carries
            no year, so it has to be supplied; defaults to 2024, the value
            that used to be hard-coded.

    Returns:
        (animal_id, datetime) on success, or (None, None) when the name does
        not match the pattern or the extracted fields do not form a valid
        date/time for ``year`` (in which case the error is printed).
    """
    # Named groups for animal_id, month, day, hour, minute and second.
    pattern = r'(?P<animal_id>[\w\d]+)_\d+\.\d+_(?P<month>\d+)_(?P<day>\d+)_(?P<hour>\d+)_(?P<minute>\d+)_(?P<second>\d+)\.wav$'

    try:
        match = re.search(pattern, recording_file_path_name)
        if not match:
            return None, None  # name does not follow the expected layout

        # Zero-pad every field to two digits before handing it to strptime.
        month, day, hour, minute, second = (
            match.group(field).zfill(2)
            for field in ('month', 'day', 'hour', 'minute', 'second')
        )

        date_time_str = f"{int(year):04d}-{month}-{day} {hour}:{minute}:{second}"
        date_time_obj = datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S')

        return match.group('animal_id'), date_time_obj
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller
        # decide what to do with a (None, None) result.
        print(f"Error: {e}")
        return None, None


def load_single_bird_syllable_csv(file_path):
    """Load one bird's decoded-song CSV into a list of per-row dictionaries.

    The CSV must provide the columns 'file_name', 'song_present' and
    'syllable_onsets_offsets_ms'. The last one holds the text of a Python
    dictionary mapping a syllable label to a list of (onset, offset) pairs.

    Args:
        file_path: path of the CSV file to read.

    Returns:
        A list with one dictionary per CSV row, with the keys:
            'file_name'                    -- copied from the row
            'song_present'                 -- copied from the row (raw CSV text)
            'animal_id'                    -- parsed from file_name (or None)
            'recording_time'               -- datetime parsed from file_name (or None)
            'ordered_and_timed_syllables'  -- (label, onset, offset) tuples by onset
    """
    def unescape_and_eval(v):
        """Strip the quote escaping from a cell and evaluate it as Python.

        If the text starts with two single quotes, every doubled single quote
        is removed; if the text (then) starts with a single quote, every
        single quote is removed. The remaining text is evaluated.
        """
        if v.startswith("''"):
            v = v.replace("''", "")
        if v.startswith("'"):
            v = v.replace("'", "")

        # SECURITY NOTE(review): eval() executes whatever expression the CSV
        # cell contains. Only load files from a trusted source; if the cells
        # are always plain literals, ast.literal_eval would be a safer parser
        # -- confirm against the data before switching.
        return eval(v)

    results = []
    # newline='' is what the csv module asks for, so that newlines embedded
    # in quoted fields are handled by the csv parser itself.
    with open(file_path, 'r', newline='') as f:
        reader = csv.DictReader(f)

        for row in reader:
            # Only the millisecond timings are used; the
            # 'syllable_onsets_offsets_timebins' column is not parsed.
            syllable_onsets_offsets_ms = unescape_and_eval(row['syllable_onsets_offsets_ms'])

            ordered_and_timed_syllables = get_ordered_syllable_for_song(syllable_onsets_offsets_ms)
            animal_id, recording_time = get_recording_time_from_filename(row['file_name'])

            results.append({
                "file_name": row['file_name'],
                "song_present": row['song_present'],

                'animal_id': animal_id,
                'recording_time': recording_time,

                'ordered_and_timed_syllables': ordered_and_timed_syllables
            })

    return results

def split_dataset_by_surgery_date(results, surgery_date):
    """Partition recordings into those made before and after the surgery.

    Args:
        results: iterable of dictionaries, each with a 'recording_time' entry
            that can be compared with surgery_date.
        surgery_date: the moment separating the two groups.

    Returns:
        (results_pre_surgery, results_post_surgery), each preserving the
        input order.

    Raises:
        ValueError: a recording_time is exactly equal to surgery_date.
        TypeError: a recording_time cannot be ordered against surgery_date
            (for instance when it is None).
    """
    before, after = [], []

    for entry in results:
        recorded_at = entry['recording_time']

        # An exact tie cannot be assigned to either side.
        if recorded_at == surgery_date:
            raise ValueError("Recording date is the same as the surgery date")

        bucket = before if recorded_at < surgery_date else after
        bucket.append(entry)

    return before, after

# Parse the whole CSV once; `results` holds one dictionary per CSV row.
results = load_single_bird_syllable_csv(file_path)
# Number of rows loaded (shown as the cell output).
print(len(results))
#pprint.pprint(results[1])

52151


In [2]:
#Russell's code for splitting the dataset in pre and post-lesion data
import json
from datetime import datetime
json_file_path = 'USA5288_creation_data.json'

# Read the JSON metadata; only its 'treatment_date' entry is used here.
with open(json_file_path, 'r') as f:
    data = json.load(f)

# 'treatment_date' is a plain YYYY-MM-DD string, so the parsed value
# sits at midnight of that day.
surgery_treatment_date = datetime.strptime(data['treatment_date'], '%Y-%m-%d')

# Recordings strictly before that moment are "pre", strictly after are "post".
(results_pre_surgery,
 results_post_surgery) = split_dataset_by_surgery_date(results, surgery_treatment_date)

# Cell output: the surgery date and the size of each group.
surgery_treatment_date, len(results_pre_surgery), len(results_post_surgery)

(datetime.datetime(2024, 4, 9, 0, 0), 33374, 18777)

In [3]:
#Builds the reference list of symbols seen before surgery.
#  Symbols are stored as strings, each exactly once, in the order in which
#  they are first encountered; other cells rely on that order when they
#  look a symbol up with presurgery_syllables.index(...).
presurgery_syllables = []
for result in results_pre_surgery:
    ordered_syllables = [str(syl_data[0]) for syl_data in result['ordered_and_timed_syllables']]

    #a song without syllables contributes nothing
    for i in ordered_syllables:
        if i in presurgery_syllables:
            continue
        presurgery_syllables.append(i)


#print(len(presurgery_syllables))
##These are all of the symbols in the dataset
#print(presurgery_syllables)

In [4]:
#Collects the duration (offset minus onset) of every syllable instance.
#  diffslist[j] gathers the pre-surgery durations of presurgery_syllables[j];
#  those durations are later summarised into the condition array (comparisonlist).
diffslist = [[] for _ in presurgery_syllables]
##############################################################################PRE
#one flat list per pre-surgery song, alternating symbol, duration, symbol, duration, ...
#  (the 1D symbols are later replaced by their 2D form)
symboldiffs_PRE_surgery = [[] for _ in results_pre_surgery]
#tracker is the index of the song list currently being filled
tracker = 0
for result in results_pre_surgery:
    current_song = symboldiffs_PRE_surgery[tracker]

    for syl_data in result['ordered_and_timed_syllables']:
        symbol = str(syl_data[0])
        diff = syl_data[2] - syl_data[1]

        #only pre-surgery durations feed the per-symbol conditions
        diffslist[presurgery_syllables.index(symbol)].append(diff)

        current_song.extend((symbol, diff))
    tracker += 1
##############################################################################POST
#same flat symbol/duration layout for the post-surgery songs;
#  these durations are NOT added to diffslist
symboldiffs_POST_surgery = [[] for _ in results_post_surgery]
tracker = 0
for result in results_post_surgery:
    current_song = symboldiffs_POST_surgery[tracker]

    for syl_data in result['ordered_and_timed_syllables']:
        diff = syl_data[2] - syl_data[1]
        current_song.extend((str(syl_data[0]), diff))
    tracker += 1

In [5]:

#generates the conditions array: one row of summary statistics per symbol
comparisonlist = [[] for _ in presurgery_syllables]

#everything here is derived from the pre-surgery durations exclusively
for i, durations in enumerate(diffslist):
    stuff = np.array(durations)

    #shortest and longest pre-surgery duration
    getmin = np.min(stuff)
    getmax = np.max(stuff)
    #mean and standard deviation of the pre-surgery durations
    getavg = np.average(stuff)
    getstddev = np.std(stuff)
    #1st and 3rd quartile of the pre-surgery durations
    get1stqtl = np.percentile(stuff, 25)
    get3rdqtl = np.percentile(stuff, 75)

    #row layout: [min, max, avg, std dev, 1st quartile, 3rd quartile]
    comparisonlist[i].extend(
        [getmin, getmax, getavg, getstddev, get1stqtl, get3rdqtl]
    )

#Prints, for each symbol, its row of [min, max, avg, standard deviation,
#  1st quartile, 3rd quartile].  Rows follow the order of presurgery_syllables:
#  comparisonlist[0] belongs to presurgery_syllables[0],
#  comparisonlist[1] to presurgery_syllables[1], and so on.
for i, stats in enumerate(comparisonlist):
    print(presurgery_syllables[i], stats)

8 [2.69841269841163, 3605.0793650793657, 1673.3337047767627, 689.6540927839395, 1400.4761904761906, 2174.920634920635]
21 [2.6984126984125396, 4665.555555555556, 1616.4274536105524, 495.92642771812484, 1373.4920634920636, 1905.0793650793655]
22 [2.6984126984125396, 2377.301587301588, 1238.8155907188668, 474.5012116349894, 1073.968253968254, 1513.8095238095239]
26 [2.69841269841163, 396.6666666666667, 57.93656095259302, 35.15571391670408, 29.682539682539698, 80.95238095238165]
23 [2.6984126984125396, 949.8412698412703, 238.80135151839795, 127.77362766917703, 121.42857142857133, 277.93650793650795]
2 [2.6984126984125396, 1616.3492063492063, 954.4631866351984, 178.6455341734946, 879.6825396825398, 1052.3809523809525]
3 [2.69841269841163, 4322.857142857143, 1675.8125309158943, 697.8454331567955, 1462.539682539682, 2120.9523809523807]
5 [2.69841269841163, 2058.8888888888887, 804.6181820446238, 262.83586483711434, 666.5079365079364, 960.634920634921]
11 [2.69841269841163, 2973.6507936507933,

In [6]:
#GENERATES CSV files containing 2D syllables appropriate for each dataset
#
#Every song in symboldiffs_PRE_surgery / symboldiffs_POST_surgery is a flat
#  list of alternating entries: [symbol, duration, symbol, duration, ...].
#  A 2D syllable is the string '(symbol,tag)', where tag compares the duration
#  with that symbol's PRE-surgery statistics held in comparisonlist:
#    es : duration <  minimum                        (extra short)
#    s  : minimum <= duration < 1st quartile         (short)
#    n  : 1st quartile <= duration <= 3rd quartile   (normal)
#    l  : 3rd quartile < duration <= maximum         (long)
#    el : duration >  maximum                        (extra long)
#
#NOTE: the symbols are rewritten IN PLACE inside the symboldiffs_* lists, so
#  this cell cannot be run twice in a row; re-run the cell that builds
#  symboldiffs_PRE_surgery / symboldiffs_POST_surgery first.


def _duration_tag(duration, symbolmin, symbolmax, symbol1st, symbol3rd):
    """Return 'es', 's', 'l', 'el' or 'n' for one syllable duration.

    The branches are mutually exclusive. Previously the 'es' test was a
    separate `if`, so its result was always overwritten with 'n' by the
    following if/elif/else chain.
    """
    if duration < symbolmin:
        return 'es'
    if symbolmin <= duration < symbol1st:
        return 's'
    if symbol3rd < duration <= symbolmax:
        return 'l'
    if duration > symbolmax:
        return 'el'
    return 'n'


def _label_songs_in_place(songs):
    """Replace each symbol of every song by its '(symbol,tag)' 2D form.

    Walks the flat lists two entries at a time (symbol at an even position,
    its duration right after it) instead of guessing the role of an entry
    from its type. Raises ValueError when a symbol never occurred before
    surgery, because no statistics exist for it.
    """
    for song in songs:
        for position in range(0, len(song), 2):
            thesymbol = str(song[position])
            stats = comparisonlist[presurgery_syllables.index(thesymbol)]
            #stats layout: [min, max, avg, std dev, 1st quartile, 3rd quartile]
            tag = _duration_tag(song[position + 1], stats[0], stats[1], stats[4], stats[5])
            song[position] = '({},{})'.format(thesymbol, tag)


def _symbols_only(songs):
    """Return the symbols (durations dropped) of every non-empty song."""
    return [song[0::2] for song in songs if len(song) != 0]


def _write_songs_csv(filename, songs):
    """Write one CSV row per song to '<filename>.csv'."""
    with open('{}.csv'.format(filename), 'w', encoding='UTF8', newline='') as f:
        csv.writer(f).writerows(songs)


#######################################################################################
#PRE-SURGERY
#  NOTE THAT THERE SHOULD NOT BE ANY es OR el SYMBOLS PRE-SURGERY, because the
#  minimum and maximum were measured on these very durations
_label_songs_in_place(symboldiffs_PRE_surgery)
presurgerysymbols_outputprep = _symbols_only(symboldiffs_PRE_surgery)

#songnumber is the size of the first half: half of the songs, rounded up.
#  (The former `songnumber % 2 > 0` test checked whether the half was odd, not
#  whether it was fractional, so e.g. 6 songs were split 4/2 instead of 3/3.)
songnumber = (len(presurgerysymbols_outputprep) + 1) // 2

#outputs the first-half pre-surgery
fhpsfilename = 'Presurgery-songs_first-half_2D-syllables'
_write_songs_csv(fhpsfilename, presurgerysymbols_outputprep[:songnumber])

#outputs the second-half pre-surgery
shpsfilename = 'Presurgery-songs_second-half_2D-syllables'
_write_songs_csv(shpsfilename, presurgerysymbols_outputprep[songnumber:])
#######################################################################################
#######################################################################################
#POST-SURGERY
#  es / el capture the abnormally short / long symbol lengths post-lesion
_label_songs_in_place(symboldiffs_POST_surgery)
postsurgerysymbols_outputprep = _symbols_only(symboldiffs_POST_surgery)

#outputs the post-surgery 2D songs
psfilename = 'Postsurgery-songs_2D-syllables'
_write_songs_csv(psfilename, postsurgerysymbols_outputprep)