# Final merging

In [None]:
import os  # local import: the notebook's import cell is not part of this section

# Folder holding the per-trial annotation tables (*ELAN_tiers.csv).
# os.path.join replaces the former '\MT_annotated\\' literal, whose '\M' is an
# invalid escape sequence; the trailing '' keeps the closing path separator so
# the value of annofolder still ends with one.
annofolder = os.path.join(curfolder, 'MT_annotated', '')
annofiles = glob.glob(annofolder + '*ELAN_tiers.csv')

mergedfiles = glob.glob(TSmerged + '/merged*.csv')
# Skip files that already carry annotations (merged_anno_*.csv). Only the file
# name is tested, so an 'anno' somewhere in the directory path cannot filter
# out every file.
mergedfiles = [x for x in mergedfiles if 'anno' not in os.path.basename(x)]

In [None]:
# Function to parse annotations from an ELAN (.eaf) file

## add to it also vocalization tier

import xml.etree.ElementTree as ET

def parse_eaf_file(eaf_file, rel_tiers):
    """Extract the time-aligned annotations of selected tiers from an ELAN file.

    Parameters
    ----------
    eaf_file : str or file object
        Path to (or open handle on) the ``.eaf`` XML document; handed to
        ``ET.parse`` unchanged.
    rel_tiers : str or iterable of str
        ``TIER_ID`` of the tier to extract, or a collection of such ids.

    Returns
    -------
    list of dict
        One entry per ``ALIGNABLE_ANNOTATION`` that carries both time-slot
        references, in document order, with the keys ``tier_id``,
        ``annotation_id`` (``None`` when absent), ``start_time``,
        ``end_time`` and ``annotation_value``. The times are the raw
        ``TIME_VALUE`` attribute strings of the referenced slots (``None``
        for a slot without ``TIME_VALUE``); the value is the stripped
        annotation text, ``''`` when the annotation is empty.

    Raises
    ------
    KeyError
        If an annotation references a time slot that is not declared in
        ``TIME_ORDER``.
    """
    root = ET.parse(eaf_file).getroot()

    # Slot id -> time value. .get() keeps slots without a TIME_VALUE from
    # aborting the whole parse; iterfind() yields nothing when TIME_ORDER is
    # missing instead of failing on None.
    time_slots = {
        slot.attrib['TIME_SLOT_ID']: slot.attrib.get('TIME_VALUE')
        for slot in root.iterfind('TIME_ORDER/TIME_SLOT')
    }

    # A bare string is one tier id, not an iterable of characters.
    if isinstance(rel_tiers, str):
        relevant_tiers = {rel_tiers}
    else:
        relevant_tiers = set(rel_tiers)

    annotations = []
    for tier in root.findall('TIER'):
        tier_id = tier.attrib['TIER_ID']
        if tier_id not in relevant_tiers:
            continue
        for annotation in tier.findall('ANNOTATION/ALIGNABLE_ANNOTATION'):
            attrs = annotation.attrib
            # Ensure required attributes are present
            if 'TIME_SLOT_REF1' not in attrs or 'TIME_SLOT_REF2' not in attrs:
                continue
            # An empty <ANNOTATION_VALUE/> has text None; treat it as ''.
            value_node = annotation.find('ANNOTATION_VALUE')
            text = value_node.text if value_node is not None else None
            annotations.append({
                'tier_id': tier_id,
                'annotation_id': attrs.get('ANNOTATION_ID'),
                'start_time': time_slots[attrs['TIME_SLOT_REF1']],
                'end_time': time_slots[attrs['TIME_SLOT_REF2']],
                'annotation_value': (text or '').strip(),
            })

    return annotations

# Vocalizations from ELAN

In [None]:
import os  # local import: the notebook's import cell is not part of this section

manualannofiles = glob.glob(curfolder + '/ManualAnno/R1/*ELAN_tiers.eaf') # Ola's

# txt file to write the annotations
vocal_anno = curfolder + '/ManualAnno/vocalization_annotations.txt'

# One tab-separated line per annotation: start, end, value, trial name.
# Written as UTF-8 explicitly: pandas reads the file back as UTF-8 by default,
# so relying on the platform's locale encoding could corrupt or reject
# non-ASCII annotation values.
with open(vocal_anno, 'w', encoding='utf-8') as f:
    for file in manualannofiles:
        print('working on ' + file)
        # file name without its folder (works with either path separator),
        # then without the '_ELAN_tiers.eaf' suffix
        filename = os.path.basename(file).replace('_ELAN_tiers.eaf', '')
        # parse the file and write its vocalization annotations
        for annotation in parse_eaf_file(file, 'vocalization'):
            f.write(f"{annotation['start_time']}\t{annotation['end_time']}\t{annotation['annotation_value']}\t{filename}\n")

# Merge

In [None]:

import os  # local import: the notebook's import cell is not part of this section

voc_anno = os.path.join(curfolder, 'ManualAnno', 'vocalization_annotations.txt')

# Vocalization table, no header: 0 = start, 1 = end, 2 = annotation value,
# 3 = trial id. Read once because it does not change between trials. Column 3
# is forced to str so that purely numeric trial ids still compare equal to the
# string trialid below.
try:
    voc_df = pd.read_csv(voc_anno, sep='\t', header=None, dtype={3: str})
except pd.errors.EmptyDataError:
    # No vocalization was annotated at all: behave as "none for any trial".
    voc_df = pd.DataFrame(columns=range(4))

for file in mergedfiles:
    print('working on ' + file)

    # trial id: file name up to the first '.', with 'merged_' removed
    trialid = os.path.basename(file).split('.')[0].replace('merged_', '')

    # find in annofiles the one with the same trialid
    # NOTE(review): this is a substring match on the full path, so an id that
    # is a prefix of another id (or part of a folder name) can select the wrong
    # file -- confirm the naming scheme makes this impossible.
    annofile = next((x for x in annofiles if trialid in x), None)
    if annofile is None:
        print('IndexError: ' + trialid + ' not found')
        continue

    # load the merged file and the annotation file (Time renamed to time so
    # that both frames share the merge key)
    merged = pd.read_csv(file)
    anno = pd.read_csv(annofile).rename(columns={'Time': 'time'})

    # vocalization annotations of this trial
    voc_anno_trial = voc_df[voc_df[3] == trialid]

    if voc_anno_trial.empty:
        print('no vocalization annotations for ' + trialid)
    else:
        # 0 everywhere, overwritten by the annotation value inside each
        # [start, end] interval. object dtype lets the integer default and the
        # annotation labels share one column without a dtype-mismatch
        # assignment.
        merged['vocalization'] = pd.Series(0, index=merged.index, dtype=object)
        intervals = voc_anno_trial[[0, 1, 2]].itertuples(index=False, name=None)
        for start, end, label in intervals:
            in_interval = (merged['time'] >= start) & (merged['time'] <= end)
            merged.loc[in_interval, 'vocalization'] = label

    # merge the two dataframes
    merged_anno = pd.merge(merged, anno, on=['time', 'TrialID'], how='outer')

    # annotation columns = every anno column whose name contains neither
    # 'time' nor 'TrialID'
    colstoint = [x for x in anno.columns if 'time' not in x and 'TrialID' not in x]

    # carry the last annotation forward onto rows that came only from merged
    # NOTE(review): this relies on the merged rows being in time order --
    # confirm for the pandas version in use.
    for col in colstoint:
        merged_anno[col] = merged_anno[col].ffill()

    # get rid of missing values in COPc
    merged_anno = merged_anno[merged_anno['COPc'].notna()]

    # write to csv
    merged_anno.to_csv(TSmerged + '/merged_anno_' + trialid + '.csv', index=False)
