# Travel times playground

## Utils

In [1]:
from io import TextIOBase
import re


def chunk_read(f_obj: TextIOBase, sentinel: str, max_sentinel: int):
    """Read a text file object in sentinel-delimited chunks.

    The file object is consumed line by line. Every line matching ``sentinel``
    counts towards the current chunk. When a sentinel line would push the count
    past ``max_sentinel``, the chunk gathered so far is yielded and that line
    starts the next chunk, so no chunk ever holds more than ``max_sentinel``
    sentinel lines.

    The function is inspired by this SO answer:
    https://stackoverflow.com/a/42964612/9723036

    NOTE: each line is stripped of its leading and trailing whitespace
    (including the newline) before being added to the chunk, to reduce the
    memory load. Whitespace inside a line is kept, and a chunk is therefore a
    single line of text.

    Lines that precede the first sentinel are part of the first chunk. The
    last chunk is always yielded, even when it is empty, so an empty file
    produces exactly one empty string.

    :param f_obj: A file object from opening a text file.
    :type f_obj: TextIOBase
    :param sentinel: A string pattern (regex supported) to recognize a specific
        line.
    :type sentinel: str
    :param max_sentinel: Max number of appearance of sentinels allowed in a chunk.
        This is equivalent to a chunk size, but more meaningful than based on only
        line counts. Values below 1 behave like 1, except that the text before
        the first sentinel is yielded as a chunk of its own.
    :type max_sentinel: int
    :yield: A chunk of the file
    :rtype: Iterator[str]
    """
    # Compile once: the same pattern is applied to every line of the file.
    pattern = re.compile(sentinel)
    seen = 0
    # Collect the pieces in a list and join on yield; this avoids rebuilding
    # an ever-growing string on every line.
    parts = []
    for line in f_obj:
        if pattern.search(line):
            if seen >= max_sentinel:
                # The current chunk is full: emit it before this sentinel
                # line, which opens (and is counted in) the next chunk.
                yield ''.join(parts)
                parts = []
                seen = 0
            seen += 1
        parts.append(line.strip())
    yield ''.join(parts)

## Processing

In [5]:
from itertools import islice

import pandas as pd

ROUTE_DATA_RAW_DATASET_PATH = "../datasets/raw/eval_travel_times_formatted.json"
ROUTE_DATA_PROCESSED_DATASET_PATH = "../datasets/processed/eval_travel_times_formatted.json"

# Number of chunks to preview from the raw file.
NUMBER_OF_CHUNKS = 10

# Split the raw JSON on lines containing 'RouteID' and keep only the first
# NUMBER_OF_CHUNKS chunks. islice stops quietly when the file holds fewer
# chunks, where repeated next() calls would raise StopIteration.
with open(ROUTE_DATA_RAW_DATASET_PATH, 'r', encoding='utf-8') as f_obj:
    chunks_gen = chunk_read(f_obj, 'RouteID', 1)
    # Materialize inside the `with` block: the generator reads lazily from
    # the open file.
    travel_times = list(islice(chunks_gen, NUMBER_OF_CHUNKS))

travel_times
        

['{"RouteID_00092558-dece-4fb7-8d0d-7d0df3a4864e": {"AH": {"AH": 0,"AJ": 484.5,"AL": 399.3,"AN": 216.7,"AP": 94.1,"AS": 132.9,"AY": 40.6,"BA": 105.7,"BB": 316.8,"BO": 214.5,"BQ": 551.6,"BR": 158.3,"BU": 147.7,"BY": 185.2,"CC": 112.8,"CF": 231.7,"CG": 203.4,"CL": 284.3,"CO": 317.2,"CR": 147.7,"CW": 345.2,"CZ": 217.9,"DH": 231.2,"DQ": 185.9,"DT": 186.7,"DU": 129.3,"EC": 250.1,"EI": 232.4,"ER": 240,"EU": 292.7,"FC": 232.3,"FF": 68.5,"FI": 221.7,"FO": 79.7,"FQ": 276.4,"FT": 202.9,"FY": 114.4,"FZ": 72.2,"GC": 145.5,"GE": 429.9,"GH": 306.3,"GL": 460.9,"GQ": 305.4,"GV": 312.8,"GW": 267.1,"GX": 326.1,"GY": 344.2,"GZ": 325.7,"HH": 85.7,"HI": 144.4,"HJ": 245.2,"HL": 310.6,"HM": 71.1,"HO": 48,"HT": 136.2,"HV": 457.3,"IB": 117.6,"IL": 85.5,"IT": 386.8,"IV": 248.3,"IX": 313.3,"JL": 315.3,"JM": 211.4,"JN": 367,"JQ": 332.5,"JT": 295.5,"JY": 183.8,"KG": 457,"KO": 112.4,"LI": 177.8,"LS": 233.4,"MA": 353.8,"MD": 101.5,"MX": 297.8,"NJ": 285.3,"NQ": 164.8,"NS": 477.8,"NT": 56.6,"NY": 133.2,"OF": 219.5,"OI