Prototype of merging an Excel file plus audio into a dataframe

In [1]:
import os
import re
import pandas as p
from pydub import AudioSegment

excelfile = 'data/91132_R1_With time.xlsx'
audios_dir = '91132_R1/'
audio_files = os.listdir(audios_dir)

df = p.read_excel(excelfile,index_col=None,header=0)




In [2]:
def find_file(filenames, begin, end):
    """
    filenames: list of filenames
    begin: first number to look for in filename
    end: second number to look for in filename
    return: the first filename that contains both begin and end
    """
    filenames = sorted(filenames)
    pattern = re.compile(r'.(\d+)-.(\d+)') # TODO only works with current filename formatting
    b_str = str(begin)
    e_str = str(end)
    for name in filenames:
        m = pattern.match(name)
        if m is None:
            continue
        if b_str == m.group(1) and e_str == m.group(2):
            return name
    raise ValueError('file not found')

def get_begin_end(timepoint):
    """
    timepoint: string from timepoint column
    return: pair of ints (first timepoint in seconds, second timepoint in seconds)
    """
    times = timepoint.split('-')
    assert len(times) == 2
    begintime = times[0]
    endtime = times[1]
    
    begintime_split = begintime.split(':')
    assert len(begintime_split) == 2
    begin = 60 * int(begintime_split[0]) + int(begintime_split[1])
    
    endtime_split = endtime.split(':')
    assert len(endtime_split) == 2
    end = 60 * int(endtime_split[0]) + int(endtime_split[1])
    
    return begin, end

In [3]:
# Load file for each row, create new column, add that column
audios = []
debug = True
for tp in df['Timepoint']:
    b, e = get_begin_end(tp)
    fname = find_file(audio_files, b, e)
    if debug:
        print('tp:', tp, '\tfname:', fname, '\tb:', b, '\te:', e)
    audio = AudioSegment.from_wav(os.path.join(audios_dir, fname))
    # NOTE: if file fails to load, pydub handles it by truncating audio data
    print('raw bytes loaded:', audio.raw_data)
    audios.append(audio)
df['Audio Clip'] = audios
print('done')

tp: 2:00-2:20 	fname: 120-140 	b: 120 	e: 140
raw bytes loaded: b''
tp: 2:20-2:40 	fname: 140-160 	b: 140 	e: 160
raw bytes loaded: b''
tp: 2:40-3:00 	fname: 160-180 	b: 160 	e: 180
raw bytes loaded: b''
tp: 3:00 - 3:20 	fname: 180-200 	b: 180 	e: 200
raw bytes loaded: b''
tp: 3:20-3:40 	fname: 200-220 	b: 200 	e: 220
raw bytes loaded: b''
tp: 3:40-4:00 	fname: 220-240 	b: 220 	e: 240
raw bytes loaded: b''
tp: 4:00 - 4:20 	fname: 240-260 	b: 240 	e: 260
raw bytes loaded: b''
tp: 4:20-4:40 	fname: 260-280 	b: 260 	e: 280
raw bytes loaded: b''
tp: 4:40-5:00 	fname: 280-300 	b: 280 	e: 300
raw bytes loaded: b''
tp: 5:00 - 5:20 	fname: 300-320 	b: 300 	e: 320
raw bytes loaded: b''
tp: 5:20-5:40 	fname: 320-340 	b: 320 	e: 340
raw bytes loaded: b''
tp: 5:40-6:00 	fname: 340-360 	b: 340 	e: 360
raw bytes loaded: b''
tp: 6:00 - 6:20 	fname: 360-380 	b: 360 	e: 380
raw bytes loaded: b''
tp: 6:20-6:40 	fname: 380-400 	b: 380 	e: 400
raw bytes loaded: b''
tp: 6:40-7:0