In [1]:
import uwacan
import numpy as np
import xarray as xr
import pendulum

In [4]:
recorder = uwacan.recordings.SoundTrap.read_folder(r"D:\Tern Island Vinga\Autonomous Recorders\IVL SoundTrap 1")
# recorder = recorder.subwindow(center="2022-05-17 15:59:47", duration=120)

In [11]:
self = recorder
ref_offset = 25
ref_time = self.time_window.start.add(seconds=ref_offset)
start_offset = -4
samples_to_read = int(self.samplerate * 20)
print(f"First sample should be {ref_offset + start_offset / self.samplerate}")
print(f"Last sample should be {ref_offset + (start_offset + samples_to_read - 1) / self.samplerate}")

# ref_time = self.time_window.start.subtract(seconds=10)
# start_offset = int(self.samplerate * 10)
# samples_to_read = int(self.samplerate * self.time_window.duration)
# def raw_data(self, ref_time=None, start_offset=None, samples_to_read=None):

samplerate = self.samplerate
# The files typically read the start time from the file name, but the stop time
# is only known after opening the file. This means we want to use the start time
# for the initial check, not the stop time.
# As such, we are looking for the last file with a start time before the ref time.
print(f"Looking for ref time {ref_time}")
for file in reversed(self.files):
    if file.start_time <= ref_time:
        break
else:
    raise ValueError(f"Cannot refer samples from {ref_time}, earliest file start is {file.start_time}")

print(f"Found ref time in file covering {file.start_time} - {file.stop_time}")

start_idx = int(np.floor((ref_time - file.start_time).total_seconds() * samplerate))
# The index in this file where the ref time is.
file_idx = self.files.index(file)

# Looking for the correct place to start reading.
# After this loop, reading will start at self.files[file_idx][start_idx]
# start_offset is how many samples earlier or later than the current (file_idx, start_idx) we should start.
if start_offset > 0:
    # We should look forwards to find the first file
    while start_offset != 0:
        if self.files[file_idx].num_samples <= start_offset + start_idx:
            # Current start index + the remaining offset takes us outside this file -> skip
            print(f"Skipping file covering {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")
            # Remove the remainder of the file from the start offset, point to the beginning of the next file.
            start_offset -= self.files[file_idx].num_samples - start_idx
            file_idx += 1
            start_idx = 0
        else:
            print(f"First file to read is {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")
            start_idx += start_offset
            start_offset = 0
        if file_idx == len(self.files):
            raise ValueError(f"Cannot read from this late")
elif start_offset < 0:
    # We should look backwards to find the first file
    while start_offset != 0:
        if start_idx + start_offset < 0:
            # Current start index + remaining offset (which is negative) takes us outside this file -> skip
            print(f"Skipping file covering {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")
            # We want to point to the last sample in the previous file, which is start_idx + 1 earlier
            start_offset += start_idx + 1
            file_idx -= 1
            start_idx = self.files[file_idx].num_samples - 1
        else:
            print(f"First file to read is {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")
            start_idx += start_offset
            start_offset = 0
        if file_idx == -1:
            raise ValueError("Cannot read from this early")


# Step ahead by start_offset samples
# start_idx = int(np.floor((ref_time - file.start_time).total_seconds() * samplerate))
# files = iter(self.files[self.files.index(file) :])
# file = next(files)
# while start_offset != 0:
# # for file in files:
#     if file.num_samples - start_idx <= start_offset:
#         # We should skip this file
#         print(f"Skipping file covering {file.start_time} - {file.stop_time}")
#         start_offset -= file.num_samples - start_idx
#         file = next(files)
#         start_idx = 0
#     else:
#         # We are reading from this file
#         print(f"First file to read is {file.start_time} - {file.stop_time}")
#         start_idx += start_offset
#         start_offset = 0
        # break
# else:
    # raise ValueError("Couldn't find first file to read")

files = iter(self.files[file_idx:])
file = next(files)
chunks = []
previous_end = file.start_time
while samples_to_read != 0:
# for file in files
    # We will read from `file`, so it should be reasonably consecutive with the previous file
    interrupt = (file.start_time - previous_end).total_seconds()
    if interrupt > self.allowable_interrupt:
        raise ValueError(
            f"Data is not continuous, missing {interrupt} seconds before file {file.filepath}, "
            f"starting at {file.start_time} and the previous ending at {previous_end}"
        )
    previous_end = file.stop_time
    if file.num_samples - start_idx < samples_to_read:
        # This is not the last file to read
        print(f"Reading file covering {file.start_time} - {file.stop_time}, starting with sample {start_idx}")
        chunk = file.read_data(start_idx=start_idx)
        chunks.append(chunk)
        samples_to_read -= chunk.shape[-1]
        start_idx = 0
        file = next(files)
    else:
        # This is the last file to read
        print(f"Reading file covering {file.start_time} - {file.stop_time}, from {start_idx} to {start_idx + samples_to_read}")
        chunk = file.read_data(start_idx=start_idx, stop_idx=start_idx + samples_to_read)
        chunks.append(chunk)
        samples_to_read -= chunk.shape[-1]
        # break
# else:
    # raise ValueError("Could not read enough data")

data = np.concatenate(chunks, axis=-1)
display(data)
display(data.shape)
# file = next(files)
# file_index = self.files.index(file)


First sample should be 23.0
Last sample should be 42.5
Looking for ref time 2024-01-01 12:00:25+00:00
Found ref time in file covering 2024-01-01 12:00:22+00:00 - 2024-01-01 12:00:32+00:00
First file to read is 2024-01-01 12:00:22+00:00 - 2024-01-01 12:00:32+00:00
Reading file covering 2024-01-01 12:00:22+00:00 - 2024-01-01 12:00:32+00:00, starting with sample 2
Reading file covering 2024-01-01 12:00:33+00:00 - 2024-01-01 12:00:43+00:00, starting with sample 0
Reading file covering 2024-01-01 12:00:44+00:00 - 2024-01-01 12:00:54+00:00, from 0 to 2


array([23. , 23.5, 24. , 24.5, 25. , 25.5, 26. , 26.5, 27. , 27.5, 28. ,
       28.5, 29. , 29.5, 30. , 30.5, 31. , 31.5, 33. , 33.5, 34. , 34.5,
       35. , 35.5, 36. , 36.5, 37. , 37.5, 38. , 38.5, 39. , 39.5, 40. ,
       40.5, 41. , 41.5, 42. , 42.5, 44. , 44.5])

(40,)

Overcomplicated to first collect files, then check continuity, then load data.

In [199]:
self = recorder
ref_offset = 2
ref_time = self.time_window.start.add(seconds=ref_offset)
start_offset = 5
samples_to_read = int(self.samplerate * 30)
print(f"First sample should be {ref_offset + start_offset / self.samplerate}")
print(f"Last sample should be {ref_offset + (start_offset + samples_to_read - 1) / self.samplerate}")

# ref_time = self.time_window.start.subtract(seconds=10)
# start_offset = int(self.samplerate * 10)
# samples_to_read = int(self.samplerate * self.time_window.duration)
# def raw_data(self, ref_time=None, start_offset=None, samples_to_read=None):

samplerate = self.samplerate
# The files typically read the start time from the file name, but the stop time
# is only known after opening the file. This means we want to use the start time
# for the initial check, not the stop time.
# As such, we are looking for the last file with a start time before the ref time.
# print(f"Looking for ref time {ref_time}")
# for file in reversed(self.files):
#     if file.start_time <= ref_time:
#         break
# else:
#     raise ValueError(f"Cannot refer samples to {ref_time}, earliest file start is {file.start_time}")

for file_idx in reversed(range(len(self.files))):
    if self.files[file_idx].start_time <= ref_time:
        if self.files[file_idx].stop_time < ref_time:
            raise ValueError(
                f"Cannot refer samples to {ref_time}, latest file starting before ends before\n"
                f"{self.files[file_idx].start_time} -> {self.files[file_idx].stop_time}\n"
            )
        break
else:
    raise ValueError(f"Cannot refer samples to {ref_time}, earliest file start is {self.files[file_idx].start_time}")


print(f"Found ref time in file covering {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")

def check_file_continuity(early, late):
    interrupt = (late.start_time - early.stop_time).total_seconds()
    if interrupt > self.allowable_interrupt:
        raise ValueError(
            f"Data is not continuous, missing {interrupt} seconds between files\n "
            f"{early.filepath} ending at {early.stop_time}\n"
            f"{late.filepath} starting at {late.start_time}"
        )


start_idx = int(np.floor((ref_time - self.files[file_idx].start_time).total_seconds() * samplerate))
# The index in this file where the ref time is.
# file_idx = self.files.index(file)

# Looking for the correct place to start reading.
# After this loop, reading will start at self.files[file_idx][start_idx]
# start_offset is how many samples earlier or later than the current (file_idx, start_idx) we should start.
if start_offset > 0:
    remaining_offset = start_offset
    # We should look forwards to find the first file
    while remaining_offset != 0:
        if self.files[file_idx].num_samples <= remaining_offset + start_idx:
            # Current start index + the remaining offset takes us outside this file -> skip
            print(f"Skipping file covering {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")
            # Remove the remainder of the file from the start offset, point to the beginning of the next file.
            if file_idx == len(self.files) - 1:
                raise ValueError(f"Cannot read from this late")
            check_file_continuity(self.files[file_idx], self.files[file_idx + 1])
            remaining_offset -= self.files[file_idx].num_samples - start_idx
            file_idx += 1
            start_idx = 0
        else:
            print(f"First file to read is {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")
            start_idx += remaining_offset
            remaining_offset = 0
elif start_offset < 0:
    remaining_offset = start_offset
    # We should look backwards to find the first file
    while remaining_offset != 0:
        if start_idx + remaining_offset < 0:
            # Current start index + remaining offset (which is negative) takes us outside this file -> skip
            print(f"Skipping file covering {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")
            # We want to point to the last sample in the previous file, which is start_idx + 1 earlier
            if file_idx == 0:
                raise ValueError("Cannot read from this early")
            check_file_continuity(self.files[file_idx - 1], self.files[file_idx])
            remaining_offset += start_idx + 1
            file_idx -= 1
            start_idx = self.files[file_idx].num_samples - 1
        else:
            print(f"First file to read is {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")
            start_idx += remaining_offset
            remaining_offset = 0

if self.files[file_idx].num_samples - start_idx > samples_to_read:
    # The data exists within this file
    data = self.files[file_idx].read_data(start_idx, start_idx + samples_to_read)
else:
    files_to_read = [self.files[file_idx]]
    samples_prepared = self.files[file_idx].num_samples - start_idx
    print(f"Preparing {samples_prepared} samples from first file {self.files[file_idx].start_time} -> {self.files[file_idx].stop_time}")
    file_idx += 1
    while samples_prepared < samples_to_read:
        if self.files[file_idx].num_samples + samples_prepared < samples_to_read:
            # This entire file should be read
            files_to_read.append(self.files[file_idx])
            samples_prepared += self.files[file_idx].num_samples
            print(f"Preparing {self.files[file_idx].num_samples} samples from file {self.files[file_idx].start_time} -> {self.files[file_idx].stop_time}")
            file_idx += 1
            if file_idx == len(self.files):
                raise ValueError("Cannot read this late")
        else:
            stop_idx = samples_to_read - samples_prepared
            samples_prepared += stop_idx
            files_to_read.append(self.files[file_idx])
            print(f"Preparing {stop_idx} samples from last file {self.files[file_idx].start_time} -> {self.files[file_idx].stop_time}")

    for early, late in zip(files_to_read[:-1], files_to_read[1:]):
        check_file_continuity(early, late)

    chunks = []
    chunk = files_to_read[0].read_data(start_idx=start_idx)
    print(f"Read {chunk.shape[-1]} samples from first file {files_to_read[0].start_time} -> {files_to_read[0].stop_time}")
    samples_to_read -= chunk.shape[-1]
    chunks.append(chunk)
    for file in files_to_read[1:-1]:
        chunk = file.read_data()
        print(f"Read {chunk.shape[-1]} samples from file {file.start_time} -> {file.stop_time}")
        samples_to_read -= chunk.shape[-1]
        chunks.append(chunk)
    chunk = files_to_read[-1].read_data(stop_idx=stop_idx)
    print(f"Read {chunk.shape[-1]} samples from last file {files_to_read[-1].start_time} -> {files_to_read[-1].stop_time}")
    samples_to_read -= chunk.shape[-1]
    chunks.append(chunk)

    assert samples_to_read == 0

    data = np.concatenate(chunks, axis=-1)


display(data)
display(data.shape)


First sample should be 4.5
Last sample should be 34.0
Found ref time in file covering 2024-01-01 12:00:00+00:00 - 2024-01-01 12:00:10+00:00
First file to read is 2024-01-01 12:00:00+00:00 - 2024-01-01 12:00:10+00:00
Preparing 11 samples from first file 2024-01-01 12:00:00+00:00 -> 2024-01-01 12:00:10+00:00
Preparing 20 samples from file 2024-01-01 12:00:10+00:00 -> 2024-01-01 12:00:20+00:00
Preparing 20 samples from file 2024-01-01 12:00:20+00:00 -> 2024-01-01 12:00:30+00:00
Preparing 9 samples from last file 2024-01-01 12:00:30+00:00 -> 2024-01-01 12:00:40+00:00
Read 11 samples from first file 2024-01-01 12:00:00+00:00 -> 2024-01-01 12:00:10+00:00
Read 20 samples from file 2024-01-01 12:00:10+00:00 -> 2024-01-01 12:00:20+00:00
Read 20 samples from file 2024-01-01 12:00:20+00:00 -> 2024-01-01 12:00:30+00:00
Read 9 samples from last file 2024-01-01 12:00:30+00:00 -> 2024-01-01 12:00:40+00:00


array([ 4.5,  5. ,  5.5,  6. ,  6.5,  7. ,  7.5,  8. ,  8.5,  9. ,  9.5,
       10. , 10.5, 11. , 11.5, 12. , 12.5, 13. , 13.5, 14. , 14.5, 15. ,
       15.5, 16. , 16.5, 17. , 17.5, 18. , 18.5, 19. , 19.5, 20. , 20.5,
       21. , 21.5, 22. , 22.5, 23. , 23.5, 24. , 24.5, 25. , 25.5, 26. ,
       26.5, 27. , 27.5, 28. , 28.5, 29. , 29.5, 30. , 30.5, 31. , 31.5,
       32. , 32.5, 33. , 33.5, 34. ])

(60,)

This checks file continuity as we go, loading data at once. This will delay exceptions if you try to load data that is out of bounds.
We should consider placing a check before to validate files within a certain range.
We can easily approximate the range we need from the ref, offset, to read, and samplerate. Then we know in what range we need valid files (ref to start, ref to end).
If you run the check first, you wouldn't even need the inline checks.
This means the error messages will be a little worse, so perhaps it's worth keeping the checks where they are.
Give the user a way to manually run a check before long calculations anyhow!

In [201]:
self = recorder
ref_offset = 2
ref_time = self.time_window.start.add(seconds=ref_offset)
start_offset = 5
samples_to_read = int(self.samplerate * 30)
print(f"First sample should be {ref_offset + start_offset / self.samplerate}")
print(f"Last sample should be {ref_offset + (start_offset + samples_to_read - 1) / self.samplerate}")

# ref_time = self.time_window.start.subtract(seconds=10)
# start_offset = int(self.samplerate * 10)
# samples_to_read = int(self.samplerate * self.time_window.duration)
# def raw_data(self, ref_time=None, start_offset=None, samples_to_read=None):

samplerate = self.samplerate
# The files typically read the start time from the file name, but the stop time
# is only known after opening the file. This means we want to use the start time
# for the initial check, not the stop time.
# As such, we are looking for the last file with a start time before the ref time.
# print(f"Looking for ref time {ref_time}")
# for file in reversed(self.files):
#     if file.start_time <= ref_time:
#         break
# else:
#     raise ValueError(f"Cannot refer samples to {ref_time}, earliest file start is {file.start_time}")

for file_idx in reversed(range(len(self.files))):
    if self.files[file_idx].start_time <= ref_time:
        if self.files[file_idx].stop_time < ref_time:
            raise ValueError(
                f"Cannot refer samples to {ref_time}, latest file starting before ends before\n"
                f"{self.files[file_idx].start_time} -> {self.files[file_idx].stop_time}\n"
            )
        break
else:
    raise ValueError(f"Cannot refer samples to {ref_time}, earliest file start is {self.files[file_idx].start_time}")


print(f"Found ref time in file covering {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")

def check_file_continuity(early, late):
    interrupt = (late.start_time - early.stop_time).total_seconds()
    if interrupt > self.allowable_interrupt:
        raise ValueError(
            f"Data is not continuous, missing {interrupt} seconds between files\n "
            f"{early.filepath} ending at {early.stop_time}\n"
            f"{late.filepath} starting at {late.start_time}"
        )


start_idx = int(np.floor((ref_time - self.files[file_idx].start_time).total_seconds() * samplerate))
# The index in this file where the ref time is.

# Looking for the correct place to start reading.
# After this loop, reading will start at self.files[file_idx][start_idx]
# start_offset is how many samples earlier or later than the current (file_idx, start_idx) we should start.
if start_offset > 0:
    remaining_offset = start_offset
    # We should look forwards to find the first file
    while remaining_offset != 0:
        if self.files[file_idx].num_samples <= remaining_offset + start_idx:
            # Current start index + the remaining offset takes us outside this file -> skip
            print(f"Skipping file covering {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")
            # Remove the remainder of the file from the start offset, point to the beginning of the next file.
            if file_idx == len(self.files) - 1:
                raise ValueError(f"Cannot read from this late")
            check_file_continuity(self.files[file_idx], self.files[file_idx + 1])  # TODO: replace this check with a dedicated one
            remaining_offset -= self.files[file_idx].num_samples - start_idx
            file_idx += 1
            start_idx = 0
        else:
            print(f"First file to read is {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")
            start_idx += remaining_offset
            remaining_offset = 0
elif start_offset < 0:
    remaining_offset = start_offset
    # We should look backwards to find the first file
    while remaining_offset != 0:
        if start_idx + remaining_offset < 0:
            # Current start index + remaining offset (which is negative) takes us outside this file -> skip
            print(f"Skipping file covering {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")
            # We want to point to the last sample in the previous file, which is start_idx + 1 earlier
            if file_idx == 0:
                raise ValueError("Cannot read from this early")
            check_file_continuity(self.files[file_idx - 1], self.files[file_idx])  # TODO: replace this check with a dedicated one
            remaining_offset += start_idx + 1
            file_idx -= 1
            start_idx = self.files[file_idx].num_samples - 1
        else:
            print(f"First file to read is {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")
            start_idx += remaining_offset
            remaining_offset = 0

if self.files[file_idx].num_samples - start_idx > samples_to_read:
    # The data exists within this file
    data = self.files[file_idx].read_data(start_idx, start_idx + samples_to_read)
else:
    chunks = []
    remaining_samples = samples_to_read
    previous_end = self.files[file_idx].start_time
    for file in self.files[file_idx:]:
        interrupt = (file.start_time - previous_end).total_seconds()
        if interrupt > self.allowable_interrupt:
            raise ValueError(
                f"Data is not continuous, missing {interrupt} seconds before file {file.filepath}, "
                f"starting at {file.start_time} and the previous file ending at {previous_end}"
            )
        else:
            previous_end = file.stop_time

        chunk = file.read_data(start_idx=start_idx, stop_idx=min(file.num_samples, remaining_samples))
        remaining_samples -= chunk.shape[-1]
        chunks.append(chunk)
        start_idx = 0
        print(f"Read {chunk.shape[-1]} samples from file {self.files[file_idx].start_time} -> {self.files[file_idx].stop_time}")
        if remaining_samples <= 0:
            break
    assert remaining_samples == 0

    data = np.concatenate(chunks, axis=-1)


display(data)
display(data.shape)


First sample should be 4.5
Last sample should be 34.0
Found ref time in file covering 2024-01-01 12:00:00+00:00 - 2024-01-01 12:00:10+00:00
First file to read is 2024-01-01 12:00:00+00:00 - 2024-01-01 12:00:10+00:00
Read 11 samples from file 2024-01-01 12:00:00+00:00 -> 2024-01-01 12:00:10+00:00
Read 20 samples from file 2024-01-01 12:00:00+00:00 -> 2024-01-01 12:00:10+00:00
Read 20 samples from file 2024-01-01 12:00:00+00:00 -> 2024-01-01 12:00:10+00:00
Read 9 samples from file 2024-01-01 12:00:00+00:00 -> 2024-01-01 12:00:10+00:00


array([ 4.5,  5. ,  5.5,  6. ,  6.5,  7. ,  7.5,  8. ,  8.5,  9. ,  9.5,
       10. , 10.5, 11. , 11.5, 12. , 12.5, 13. , 13.5, 14. , 14.5, 15. ,
       15.5, 16. , 16.5, 17. , 17.5, 18. , 18.5, 19. , 19.5, 20. , 20.5,
       21. , 21.5, 22. , 22.5, 23. , 23.5, 24. , 24.5, 25. , 25.5, 26. ,
       26.5, 27. , 27.5, 28. , 28.5, 29. , 29.5, 30. , 30.5, 31. , 31.5,
       32. , 32.5, 33. , 33.5, 34. ])

(60,)

This version includes a first check of file continuity, so that we don't have to bother with it later.

In [4]:
self = recorder
ref_offset = 9
ref_time = self.time_window.start.add(seconds=ref_offset)
start_offset = 0
samples_to_read = int(self.samplerate * 30)
print(f"First sample should be {ref_offset + start_offset / self.samplerate}")
print(f"Last sample should be {ref_offset + (start_offset + samples_to_read - 1) / self.samplerate}")

def find_file_time(time):
    time = uwacan._core.time_to_datetime(time)
    for file in reversed(self.files):
        # Going backwards since the files check the start time first,
        # so all files starting later will not have to check their stop time.
        # The stop time usually requires opening the file, which is slower.
        if time in file:
            return file
    else:
        raise ValueError(f"Time {time} does not exist inside any recorded files")

def check_file_continuity(start_time, stop_time):
    first_file = find_file_time(start_time)
    first_idx = self.files.index(first_file)
    last_file = find_file_time(stop_time)
    last_idx = self.files.index(last_file)

    for early, late in zip(self.files[first_idx:last_idx - 1], self.files[first_idx + 1: last_idx]):
        interrupt = (late.start_time - early.stop_time).total_seconds()
        if interrupt > self.allowable_interrupt:
            raise ValueError(
                f"Data is not continuous, missing {interrupt} seconds between files\n "
                f"{early.filepath} ending at {early.stop_time}\n"
                f"{late.filepath} starting at {late.start_time}"
            )

samplerate = self.samplerate

earliest = min(ref_time, ref_time.add(seconds=start_offset / samplerate))
latest = max(ref_time, ref_time.add(seconds=(start_offset + samples_to_read) / samplerate))
check_file_continuity(earliest, latest)

file_idx = self.files.index(find_file_time(ref_time))
print(f"Found ref time in file covering {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")
start_idx = int(np.floor((ref_time - self.files[file_idx].start_time).total_seconds() * samplerate))
# The index in this file where the ref time is.

# Looking for the correct place to start reading.
# After this loop, reading will start at self.files[file_idx][start_idx]
# start_offset is how many samples earlier or later than the current (file_idx, start_idx) we should start.
if start_offset > 0:
    remaining_offset = start_offset
    # We should look forwards to find the first file
    while remaining_offset != 0:
        if self.files[file_idx].num_samples <= remaining_offset + start_idx:
            # Current start index + the remaining offset takes us outside this file -> skip
            print(f"Skipping file covering {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")
            # Remove the remainder of the file from the start offset, point to the beginning of the next file.
            remaining_offset -= self.files[file_idx].num_samples - start_idx
            file_idx += 1
            start_idx = 0
        else:
            print(f"First file to read is {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")
            start_idx += remaining_offset
            remaining_offset = 0
elif start_offset < 0:
    remaining_offset = start_offset
    # We should look backwards to find the first file
    while remaining_offset != 0:
        if start_idx + remaining_offset < 0:
            # Current start index + remaining offset (which is negative) takes us outside this file -> skip
            print(f"Skipping file covering {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")
            # We want to point to the last sample in the previous file, which is start_idx + 1 earlier
            remaining_offset += start_idx + 1
            file_idx -= 1
            start_idx = self.files[file_idx].num_samples - 1
        else:
            print(f"First file to read is {self.files[file_idx].start_time} - {self.files[file_idx].stop_time}")
            start_idx += remaining_offset
            remaining_offset = 0

if self.files[file_idx].num_samples - start_idx > samples_to_read:
    # The data exists within this file
    data = self.files[file_idx].read_data(start_idx, start_idx + samples_to_read)
else:
    chunks = []
    remaining_samples = samples_to_read
    for file in self.files[file_idx:]:
        chunk = file.read_data(start_idx=start_idx, stop_idx=min(file.num_samples, remaining_samples))
        remaining_samples -= chunk.shape[-1]
        chunks.append(chunk)
        start_idx = 0
        print(f"Read {chunk.shape[-1]} samples from file {self.files[file_idx].start_time} -> {self.files[file_idx].stop_time}")
        if remaining_samples <= 0:
            break
    assert remaining_samples == 0

    data = np.concatenate(chunks, axis=-1)


display(data)
display(data.shape)


First sample should be 9.0
Last sample should be 38.5
Found ref time in file covering 2024-01-01 12:00:00+00:00 - 2024-01-01 12:00:10+00:00
Read 2 samples from file 2024-01-01 12:00:00+00:00 -> 2024-01-01 12:00:10+00:00
Read 20 samples from file 2024-01-01 12:00:00+00:00 -> 2024-01-01 12:00:10+00:00
Read 20 samples from file 2024-01-01 12:00:00+00:00 -> 2024-01-01 12:00:10+00:00
Read 18 samples from file 2024-01-01 12:00:00+00:00 -> 2024-01-01 12:00:10+00:00


array([ 9. ,  9.5, 11. , 11.5, 12. , 12.5, 13. , 13.5, 14. , 14.5, 15. ,
       15.5, 16. , 16.5, 17. , 17.5, 18. , 18.5, 19. , 19.5, 20. , 20.5,
       22. , 22.5, 23. , 23.5, 24. , 24.5, 25. , 25.5, 26. , 26.5, 27. ,
       27.5, 28. , 28.5, 29. , 29.5, 30. , 30.5, 31. , 31.5, 33. , 33.5,
       34. , 34.5, 35. , 35.5, 36. , 36.5, 37. , 37.5, 38. , 38.5, 39. ,
       39.5, 40. , 40.5, 41. , 41.5])

(60,)

In [13]:
(start_offset + samples_to_read) / samplerate

32.5

In [157]:
np.allclose(data, old)

True

In [55]:
files = iter(list(range(10)))
for file in files:
    print(f"{file} in first")
    if file > 4:
        break
print("broke")
for file in files:
    print(f"{file} in second")

0 in first
1 in first
2 in first
3 in first
4 in first
5 in first
broke
6 in second
7 in second
8 in second
9 in second


Getting data 0 to 20
[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18. 19.]
Getting data 25 to 45
[25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35. 36. 37. 38. 39. 40. 41. 42.
 43. 44.]
Getting data 50 to 70
[50. 51. 52. 53. 54. 55. 56. 57. 58. 59. 60. 61. 62. 63. 64. 65. 66. 67.
 68. 69.]
Getting data 75 to 95
[75. 76. 77. 78. 79. 80. 81. 82. 83. 84. 85. 86. 87. 88. 89. 90. 91. 92.
 93. 94.]
Getting data 100 to 120
[100. 101. 102. 103. 104. 105. 106. 107. 108. 109. 110. 111. 112. 113.
 114. 115. 116. 117. 118. 119.]
Getting data 125 to 145
[125. 126. 127. 128. 129. 130. 131. 132. 133. 134. 135. 136. 137. 138.
 139. 140. 141. 142. 143. 144.]
Getting data 150 to 170
[150. 151. 152. 153. 154. 155. 156. 157. 158. 159. 160. 161. 162. 163.
 164. 165. 166. 167. 168. 169.]
Getting data 175 to 195
[175. 176. 177. 178. 179. 180. 181. 182. 183. 184. 185. 186. 187. 188.
 189. 190. 191. 192. 193. 194.]
