In [1]:
import pandas as pd
import numpy as np
import json
from datetime import datetime as dt
from tqdm.notebook import tqdm
import multiprocessing as mp

import logging
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

class apiData():
    '''
    Holds api data fetched from the Wiener Linien API
    Can clean, format and return datasets for specific lines
    '''

    def __init__(self, api_file) -> None:
        '''
        api_file: pandas compatible csv file containing response data from the Wiener Linien API (may be gzipped)
        '''
        self.df = pd.read_csv(api_file)
        self.results = pd.DataFrame(columns=['station', 'line', 'towards', 'result_df'])

    def getAvailable(self, filter = True):
        '''
        returns a dataframe with available lines and directions
        filter (bool): if set to true, filters the unique entries for special announcements
        '''
        available = self.df.groupby(['station', 'line', 'towards']).size().rename('count').reset_index()

        if filter:
            available = available.loc[~available['towards'].str.match('(.*\*)|(.*_)|.*(FFP2-MASKEN)|(.*\!)|.*NÄCHSTER|.*FAHRTBEHINDERUNG')]

        return available.sort_values('count', ascending=False)


    def getLineDir(self, station, line, direction):
        '''
        returns a formatted dataframe containing the defined line and direction
        '''
        temp_df = self.df.loc[(self.df['station'] == station) & (self.df['line'] == line) & (self.df['towards'] == direction)]
        temp_df = self.clean(temp_df)
        temp_df = self.format(temp_df)

        return temp_df

    def clean(self, sub_df, filter_regex = True, filter_dupes = True):
        '''
        Input is an already filtered dataframe for one station/line/direction combination.
        cleans up a dataframe (remove bad responses) and returns it
        In case there are duplicates (two entries at the exaxt same time) both entries are dropped

        filter_regex (bool): turns on/off regex filtering
        filter_dupes (bool): turns on/off duplicate filtering
        '''
        if filter_regex:
            sub_df = sub_df.loc[sub_df['countdown'].str.match('\[[0-9, ]*\]')]
        
        if filter_dupes:
            sub_df = sub_df.drop_duplicates(subset='time', keep=False)

        return sub_df

    def format(self, sub_df):
        '''
        returns a formatted dataframe ready for analysis
        '''
        # convert the countdown list into columns
        cntdwn = sub_df['countdown']
        cntdwn = pd.DataFrame([json.loads(x) for x in cntdwn])
        cntdwn = cntdwn.set_index(sub_df.index)

        sub_df = pd.concat([sub_df, cntdwn], axis=1).drop(columns='countdown')

        return sub_df

    def fetchResults(self, which, filter = True):
        '''
        returns a result dataframe after a previous track step

        which (list): list containing [station, line, direction]
        filter (bool): remove incomplete and warning tracks
        '''
        res_df = self.outer_train_info.loc[(self.outer_train_info['station'] == which[0]) & (self.outer_train_info['line'] == which[1]) & (self.outer_train_info['towards'] == which[2])]
        res_df = res_df.iloc[-1]['vehicles_df']

        if filter:
            res_df = res_df.loc[(res_df['complete'] == 1) & (res_df['warning'] == 0)]

        return res_df

    def trackMany(self, which, depth=2, max_diff=15., multithreaded=4):
        '''
        Tracks multiple station/line/direction combinations in one step.
        Stores results in self.outer_train_info . Can be accessed using fetchResults

        which (list): list of lists in the form of [[station, line, direction], [station, line, direction], ...]
        depth (int): first n countdown places to take into account. Higher order countdowns are ofter error prone
        max_diff (float): maximum time difference between two timestamps in the dataframe to be considered cohesive
        multithreaded (int): use multithreading when multiple tracks are to be performed at once. Value corresponds to the amount of threads used. If the value is 0 or False, multithreading is not used.
        '''

        self.outer_train_info = pd.DataFrame(columns=['vehicles_df', 'station', 'line', 'towards'])

        global wrapper
        def wrapper(l, progress=True):
            av = self.track(which=l, depth=depth, max_diff=max_diff, progress=progress)
            return {'vehicles_df': av, 'station':l[0], 'line':l[1], 'towards':l[2]}            


        if multithreaded:
            with mp.Pool(multithreaded) as p:
                r = list(tqdm(p.imap(wrapper, which), total=len(which), desc='Total Progress', mininterval=2))
            self.outer_train_info = self.outer_train_info.append(r)
            
        else:
            for l in which:
                 av = wrapper(l, True)
                 self.outer_train_info = self.outer_train_info.append({'vehicles_df': av, 'station':l[0], 'line':l[1], 'towards':l[2]}, ignore_index=True)
            

    def track(self, which, depth=2, max_diff=15., progress = True, position=None):
        '''
        Starts the train-tracking procedure for a given station/line/direction combination.
        Returns a results dataframe

        which (list): list of trains to track in the form of [station, line, direction]
        depth (int): first n countdown places to take into account. Higher order countdowns are ofter error prone
        max_diff (float): maximum time difference between two timestamps in the dataframe to be considered cohesive
        progress (bool): display tqdm progress bar
        '''

        logging.info(f"Processing {which}...")

        df = self.getLineDir(*which)

        all_vehicles = pd.DataFrame({'vehicle':[], 'arrived':[], 'active':[], 'lastPos':[]})
        prev_cntwn = list()

        for index, row in tqdm(df.iterrows(), total=len(df), disable= not progress, desc= f'Processing: {which}', leave=True, mininterval=2):
            logging.debug(f"processing index{index}")
            row = row.dropna()
            row = row.iloc[:4+depth]

        # if there are no trains in the all_vehicles, we're on the first line and we have all new vehicles.
            if len(all_vehicles) == 0:
                for i, cntdwn in enumerate(row[4:].values):
                    all_vehicles = all_vehicles.append({'vehicle': vehicle(row['time'], cntdwn, i), 'arrived':0, 'active': 1, 'lastPos':i, 'trackStart': row['time']}, ignore_index=True)
            
            else:
                # if there was a change, first check if the time difference was within limits
                if row['time'] - prev_row['time'] < max_diff:
                    # do a tracking step
                    
                    if row[0] > prev_cntwn[0]:
                    #shift vehicle position by one if the next countdown is larger than the last one and set the first train to arrived

                        logging.debug("shift frame by one")
                        success = list()

                        for idx, v in all_vehicles.loc[all_vehicles['active'] == 1].iterrows():
                            all_vehicles.at[idx, 'lastPos'] -= 1
                            all_vehicles.at[idx, 'vehicle'].position -= 1
                            
                            if all_vehicles.at[idx, 'lastPos'] < 0:
                                all_vehicles.at[idx, 'arrived'] = 1
                                all_vehicles.at[idx, 'active'] = 0
                                all_vehicles.at[idx, 'vehicle'].arrive(row['time'])

                            else:
                                all_vehicles.at[idx, 'vehicle'].trackTime(row['time'], row[4:].values)

                        if len(row) == len(prev_row):
                            # if a train has arrived and the lengths of the arrays don't change, there must be a new vehicle.
                            all_vehicles = all_vehicles.append({'vehicle': vehicle(row['time'], row.values[-1], len(row[4:].values) -1), 'arrived':0, 'active': 1, 'lastPos':len(row[4:].values) -1, 'trackStart': row['time']}, ignore_index=True)

                    else:
                        # keep track of the changed times
                        # add new trains if they appear (check len of na-free df)
                        logging.debug("time changed without frameshift")

                        success = list()
                        for idx, v in all_vehicles.loc[all_vehicles['active'] == 1].iterrows():
                            all_vehicles.at[idx, 'vehicle'].trackTime(row['time'], row[4:].values)


                        if len(row) > len(prev_row):
                            # if no train has arrived, but the length the countdown arrays has increased, there must be a new train as well.
                            all_vehicles = all_vehicles.append({'vehicle': vehicle(row['time'], row.values[-1], len(row[4:].values) -1), 'arrived':0, 'active': 1, 'lastPos':len(row[4:].values) -1, 'trackStart': row['time']}, ignore_index=True)

                else:
                    # close all open trains and begin new ones
                    logging.debug(f"time differnce over threshold ({row['time'] - prev_row['time']}). closing all trains")
                    for idx, v in all_vehicles.loc[all_vehicles['active'] == 1].iterrows():
                        all_vehicles.at[idx, 'active'] = 0

                    for i, cntdwn in enumerate(row[4:].values):
                        #opening new vehicles
                        all_vehicles = all_vehicles.append({'vehicle': vehicle(row['time'], cntdwn, i), 'arrived':0, 'active': 1, 'lastPos':i, "trackStart": row['time']}, ignore_index=True)
                    
            prev_row = row
            prev_cntwn = row[4:].values

        logging.info('Calculating parameters')

        for i, r in tqdm(all_vehicles.iterrows(), total=len(all_vehicles), disable= True):
            r['vehicle'].calculateDT()

        complete_res = pd.DataFrame({'countdown': [], 'start': [], 'end': [], 'complete': [], 'hour': []})

        for i, v in tqdm(all_vehicles.iterrows(), total=len(all_vehicles), disable= True):
            
            complete_res = complete_res.append(v['vehicle'].times, ignore_index=True)

        return complete_res
       

class vehicle():
    '''
    implements a vehicle class that is used internally to track vehicles across time
    '''
    def __init__(self, firstSeen, cntdwn, position) -> None:
        self.firstSeen = firstSeen # timestamp at which the vehicle was first seen
        self.arrivedAt = None # timestamp at which the vehicle actually arrived
        self.times = pd.DataFrame({'countdown': [cntdwn], 'start': [firstSeen], 'end': ['nan'], 'complete': [False], 'warning':[False]}) # dataframe containing countdown value, start of countdown value and end of countdown value
        self.times = self.times.astype({'countdown': int, 'start': float, 'end': float, 'complete': bool, 'warning': bool})
        self.position = position
        self.arrived = False # to keep track if the vehicle has arrived yet or is still pending

    def arrive(self, time):
        self.arrived = True
        self.arrivedAt = time
        self.position = -1
        self.times.at[self.times.index[-1], 'end'] = time
        if len(self.times) > 1:
            # consider the track complete if there was an entry before
            self.times.at[self.times.index[-1], 'complete'] = True

    def trackTime(self, time, cntwns):
        # refreshes the times dataframe based on a new timestamp and a new countdown array

        try:
            if not self.times.at[self.times.index[-1], 'countdown'] == cntwns[self.position]:
                # if the time changed at the position of the vehicle, set the end time and start a new row
                self.times.at[self.times.index[-1], 'end'] = time

                if len(self.times) > 1:
                    # the first one is always incomplete. afterwards, the tracking is assumed to be complete.
                    self.times.at[self.times.index[-1], 'complete'] = True

                if self.times.at[self.times.index[-1], 'countdown'] < cntwns[self.position]:
                    # in case the countdown increases at a given position, we cannot trust the data anymore.
                    self.times.at[self.times.index[-1], 'warning'] = True

                self.times = self.times.append({'countdown': cntwns[self.position], 'start': time, 'end': pd.NA, 'complete': False, 'warning': False}, ignore_index=True)
        except Exception as e:
            print(f"Exception {e} at time {time}. Continuing.")
            
    def calculateDT(self):
        self.times['dt'] = self.times['end'] - self.times['start']

        hours = list()
        for idx, row in self.times.iterrows():
            t = dt.fromtimestamp(row['start'])
            hours.append(t.hour)

        self.times['hour'] = hours



In [3]:
d = apiData('./data/scrape_subset_last5k.csv')
d.trackMany([['Längenfeldgasse', 'U6', 'SIEBENHIRTEN'],
    ['Längenfeldgasse', 'U4', 'HEILIGENSTADT']], multithreaded=0)

Processing: ['Längenfeldgasse', 'U6', 'SIEBENHIRTEN']:   0%|          | 0/401 [00:00<?, ?it/s]

Processing: ['Längenfeldgasse', 'U4', 'HEILIGENSTADT']:   0%|          | 0/411 [00:00<?, ?it/s]

In [123]:
d.getAvailable()[['station', 'line', 'towards']].values

array([['Längenfeldgasse', 'U6', 'SIEBENHIRTEN'],
       ['Längenfeldgasse', 'U4', 'HÜTTELDORF'],
       ['Längenfeldgasse', 'U6', 'FLORIDSDORF'],
       ['Flurschützstraße / Längenfeldgasse', '63A',
        'Gesundheitszentrum Süd'],
       ['Flurschützstraße / Längenfeldgasse', '63A', 'Am Rosenhügel'],
       ['Längenfeldgasse', 'U4', 'HEILIGENSTADT'],
       ['Flurschützstraße, Längenfeldgasse', '62', 'Oper, Karlsplatz U'],
       ['Flurschützstraße, Längenfeldgasse', 'WLB', 'Wien Oper'],
       ['Längenfeldgasse U', '12A', 'Schmelz, Gablenzgasse'],
       ['Flurschützstraße, Längenfeldgasse', '62',
        'Lainz, Wolkersbergenstraße'],
       ['Längenfeldgasse U', '12A', 'Eichenstraße'],
       ['Flurschützstraße, Längenfeldgasse', 'WLB', 'Baden Josefspl.'],
       ['Flurschützstraße, Längenfeldgasse', 'WLB', 'Wiener Neudorf'],
       ['Längenfeldgasse U', '12A', 'Längenfeldgasse U'],
       ['Längenfeldgasse U', 'N8', 'Alterlaa U'],
       ['Flurschützstraße, Längenfeldgasse', 'N

In [3]:
d = apiData('./data/scrape.csv.gz')

d.trackMany([['Flurschützstraße / Längenfeldgasse', '63A', 'Am Rosenhügel'],
       ['Flurschützstraße / Längenfeldgasse', '63A',
        'Gesundheitszentrum Süd'],
       ['Flurschützstraße, Längenfeldgasse', '62', 'Oper, Karlsplatz U'],
       ['Flurschützstraße, Längenfeldgasse', 'WLB', 'Wien Oper'],
       ['Längenfeldgasse U', '12A', 'Eichenstraße'],
       ['Längenfeldgasse U', '12A', 'Schmelz, Gablenzgasse'],
       ['Längenfeldgasse', 'U4', 'HÜTTELDORF'],
       ['Längenfeldgasse', 'U6', 'FLORIDSDORF'],
       ['Längenfeldgasse', 'U4', 'HEILIGENSTADT'],
       ['Längenfeldgasse', 'U6', 'SIEBENHIRTEN'],
       ['Flurschützstraße, Längenfeldgasse', '62',
        'Lainz, Wolkersbergenstraße'],
       ['Flurschützstraße, Längenfeldgasse', 'WLB', 'Baden Josefspl.'],
       ['Flurschützstraße, Längenfeldgasse', 'WLB', 'Wiener Neudorf'],
       ['Flurschützstraße, Längenfeldgasse', '62',
        'Meidling S, Dörfelstraße'],
       ['Flurschützstraße, Längenfeldgasse', '62',
        'Wattmanngasse, Betriebsbhf. Speising']], multithreaded=3)

Total Progress:   0%|          | 0/15 [00:00<?, ?it/s]

In [150]:
d = apiData('./data/scrape.csv.gz')
pd.options.display.max_rows = 400
pd.options.display.precision = 10
d.df.loc[d.df['time'].between(1638303216, 1638303516)]


Unnamed: 0,station,line,towards,countdown,time
467899,Längenfeldgasse U,12A,Längenfeldgasse U,"[3, 19, 34, 48, 63]",1638303222.7
467900,Längenfeldgasse U,12A,"Schmelz, Gablenzgasse","[2, 17, 32, 47, 62]",1638303222.7
467901,"Flurschützstraße, Längenfeldgasse",62,"Lainz, Wolkersbergenstraße","[2, 10, 20, 29, 40, 49, 59, 69]",1638303222.7
467902,"Flurschützstraße, Längenfeldgasse",62,"Oper, Karlsplatz U","[8, 17, 27, 38, 48, 61]",1638303222.7
467903,Flurschützstraße / Längenfeldgasse,63A,Am Rosenhügel,"[3, 14, 24, 34, 44, 54, 65]",1638303222.7
467904,Flurschützstraße / Längenfeldgasse,63A,Gesundheitszentrum Süd,"[7, 15, 28, 37, 47, 61]",1638303222.7
467905,Längenfeldgasse,U4,HEILIGENSTADT,"[2, 8, 15, 23, 30, 38, 45, 53, 60, 68]",1638303222.7
467906,Längenfeldgasse,U4,HÜTTELDORF,"[2, 13, 20, 28, 35, 43, 50, 58, 65]",1638303222.7
467907,Längenfeldgasse,U6,FLORIDSDORF,"[0, 6, 10, 15, 23, 30, 38, 45, 53, 60, 68]",1638303222.7
467908,Längenfeldgasse,U6,SIEBENHIRTEN,"[1, 8, 9, 13, 20, 24, 28, 35, 43, 50, 58, 65]",1638303222.7


In [140]:
d.outer_train_info

Unnamed: 0,vehicles_df,station,line,towards
0,countdown start end com...,Längenfeldgasse,U6,SIEBENHIRTEN
1,countdown start end com...,Längenfeldgasse,U4,HEILIGENSTADT


In [43]:
d.outer_train_info.iloc[1]['vehicles_df']

Unnamed: 0,countdown,start,end,complete,hour,warning,dt
0,2.0,1.638526e+09,1.638526e+09,0.0,11.0,False,31.163877
1,1.0,1.638526e+09,1.638526e+09,1.0,11.0,False,41.491544
2,0.0,1.638526e+09,1.638526e+09,1.0,11.0,False,187.010081
3,9.0,1.638526e+09,1.638526e+09,False,11.0,False,31.163877
4,8.0,1.638526e+09,1.638526e+09,True,11.0,False,82.882011
...,...,...,...,...,...,...,...
142,10.0,1.638530e+09,1.638530e+09,False,12.0,False,20.771417
143,9.0,1.638530e+09,1.638530e+09,True,12.0,False,104.023382
144,8.0,1.638530e+09,1.638530e+09,True,12.0,False,72.380788
145,7.0,1.638530e+09,1.638530e+09,True,12.0,False,31.044301


In [29]:
float('0')

0.0

In [11]:
d.df[['station', 'line', 'towards']]

Unnamed: 0,station,line,towards
0,"Flurschützstraße, Längenfeldgasse",62,"Oper, Karlsplatz U"
1,Flurschützstraße / Längenfeldgasse,63A,Am Rosenhügel
2,Flurschützstraße / Längenfeldgasse,63A,Gesundheitszentrum Süd
3,Längenfeldgasse,U4,HEILIGENSTADT
4,Längenfeldgasse,U4,HÜTTELDORF
...,...,...,...
4995,Längenfeldgasse,U4,HEILIGENSTADT
4996,Längenfeldgasse,U4,HÜTTELDORF
4997,Längenfeldgasse,U6,FLORIDSDORF
4998,Längenfeldgasse,U6,SIEBENHIRTEN


In [4]:
import pandas as pd
import numpy as np
import json
from datetime import datetime as dt
from tqdm.notebook import tqdm
import multiprocessing as mp

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import logging
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

class apiData():
    '''
    Holds api data fetched from the Wiener Linien API
    Can clean, format and return datasets for specific lines
    '''

    def __init__(self, api_file) -> None:
        '''
        api_file: pandas compatible csv file containing response data from the Wiener Linien API (may be gzipped)
        '''
        self.df = pd.read_csv(api_file)
        self.results = pd.DataFrame(columns=['station', 'line', 'towards', 'result_df'])

    def getAvailable(self, filter = True):
        '''
        returns a dataframe with available lines and directions
        filter (bool): if set to true, filters the unique entries for special announcements
        '''
        available = self.df.groupby(['station', 'line', 'towards']).size().rename('count').reset_index()

        if filter:
            available = available.loc[~available['towards'].str.match('(.*\*)|(.*_)|.*(FFP2-MASKEN)|(.*\!)|.*NÄCHSTER|.*FAHRTBEHINDERUNG')]

        return available.sort_values('count', ascending=False)


    def getLineDir(self, station, line, direction):
        '''
        returns a formatted dataframe containing the defined line and direction
        '''
        temp_df = self.df.loc[(self.df['station'] == station) & (self.df['line'] == line) & (self.df['towards'] == direction)]
        temp_df = self.clean(temp_df)
        temp_df = self.format(temp_df)

        return temp_df

    def clean(self, sub_df, filter_regex = True, filter_dupes = True):
        '''
        Input is an already filtered dataframe for one station/line/direction combination.
        cleans up a dataframe (remove bad responses) and returns it
        In case there are duplicates (two entries at the exaxt same time) both entries are dropped

        filter_regex (bool): turns on/off regex filtering
        filter_dupes (bool): turns on/off duplicate filtering
        '''
        if filter_regex:
            sub_df = sub_df.loc[sub_df['countdown'].str.match('\[[0-9, ]*\]')]
        
        if filter_dupes:
            sub_df = sub_df.drop_duplicates(subset='time', keep=False)

        return sub_df

    def format(self, sub_df):
        '''
        returns a formatted dataframe ready for analysis
        '''
        # convert the countdown list into columns
        cntdwn = sub_df['countdown']
        cntdwn = pd.DataFrame([json.loads(x) for x in cntdwn])
        cntdwn = cntdwn.set_index(sub_df.index)

        sub_df = pd.concat([sub_df, cntdwn], axis=1).drop(columns='countdown')

        return sub_df

    def fetchResults(self, which, filter = True):
        '''
        returns a result dataframe after a previous track step

        which (list): list containing [station, line, direction]
        filter (bool): remove incomplete and warning tracks
        '''
        res_df = self.outer_train_info.loc[(self.outer_train_info['station'] == which[0]) & (self.outer_train_info['line'] == which[1]) & (self.outer_train_info['towards'] == which[2])]
        res_df = res_df.iloc[-1]['vehicles_df']

        if filter:
            res_df = res_df.loc[(res_df['complete'] == 1) & (res_df['warning'] == 0)]

        return res_df

    def trackMany(self, which, depth=2, max_diff=15., multithreaded=4):
        '''
        Tracks multiple station/line/direction combinations in one step.
        Stores results in self.outer_train_info . Can be accessed using fetchResults

        which (list): list of lists in the form of [[station, line, direction], [station, line, direction], ...]
        depth (int): first n countdown places to take into account. Higher order countdowns are ofter error prone
        max_diff (float): maximum time difference between two timestamps in the dataframe to be considered cohesive
        multithreaded (int): use multithreading when multiple tracks are to be performed at once. Value corresponds to the amount of threads used. If the value is 0 or False, multithreading is not used.
        '''

        self.outer_train_info = pd.DataFrame(columns=['vehicles_df', 'station', 'line', 'towards'])

        global wrapper
        def wrapper(l, progress=True):
            p, l = l
            av = self.track(which=l, depth=depth, max_diff=max_diff, progress=progress, position=p+1)
            return {'vehicles_df': av, 'station':l[0], 'line':l[1], 'towards':l[2]}            


        if multithreaded:
            with mp.Pool(multithreaded) as p:
                r = list(tqdm(p.imap(wrapper, enumerate(which)), total=len(which), desc='Total Progress'))
                print("")
            self.outer_train_info = self.outer_train_info.append(r)
            
        else:
            for p, l in enumerate(which):
                 av = wrapper((p, l), True)
                 self.outer_train_info = self.outer_train_info.append({'vehicles_df': av, 'station':l[0], 'line':l[1], 'towards':l[2]}, ignore_index=True)
            

    def track(self, which, depth=2, max_diff=15., progress = True, position=None):
        '''
        Starts the train-tracking procedure for a given station/line/direction combination.
        Returns a results dataframe

        which (list): list of trains to track in the form of [station, line, direction]
        depth (int): first n countdown places to take into account. Higher order countdowns are ofter error prone
        max_diff (float): maximum time difference between two timestamps in the dataframe to be considered cohesive
        progress (bool): display tqdm progress bar
        '''

        logging.info(f"Processing {which}...")
        print(f"Starting to process {which}")

        df = self.getLineDir(*which)

        all_vehicles = pd.DataFrame({'vehicle':[], 'arrived':[], 'active':[], 'lastPos':[]})
        prev_cntwn = list()

        for index, row in tqdm(df.iterrows(), total=len(df), disable= not progress, desc= f'Processing: {which}', leave=True, position=position):
            logging.debug(f"processing index{index}")
            row = row.dropna()
            row = row.iloc[:4+depth]

        # if there are no trains in the all_vehicles, we're on the first line and we have all new vehicles.
            if len(all_vehicles) == 0:
                for i, cntdwn in enumerate(row[4:].values):
                    all_vehicles = all_vehicles.append({'vehicle': vehicle(row['time'], cntdwn, i), 'arrived':0, 'active': 1, 'lastPos':i, 'trackStart': row['time']}, ignore_index=True)
            
            else:
                # if there was a change, first check if the time difference was within limits
                if row['time'] - prev_row['time'] < max_diff:
                    # do a tracking step
                    
                    if row[0] > prev_cntwn[0]:
                    #shift vehicle position by one if the next countdown is larger than the last one and set the first train to arrived

                        logging.debug("shift frame by one")
                        success = list()

                        for idx, v in all_vehicles.loc[all_vehicles['active'] == 1].iterrows():
                            all_vehicles.at[idx, 'lastPos'] -= 1
                            all_vehicles.at[idx, 'vehicle'].position -= 1
                            
                            if all_vehicles.at[idx, 'lastPos'] < 0:
                                all_vehicles.at[idx, 'arrived'] = 1
                                all_vehicles.at[idx, 'active'] = 0
                                all_vehicles.at[idx, 'vehicle'].arrive(row['time'])

                            else:
                                all_vehicles.at[idx, 'vehicle'].trackTime(row['time'], row[4:].values)

                        if len(row) == len(prev_row):
                            # if a train has arrived and the lengths of the arrays don't change, there must be a new vehicle.
                            all_vehicles = all_vehicles.append({'vehicle': vehicle(row['time'], row.values[-1], len(row[4:].values) -1), 'arrived':0, 'active': 1, 'lastPos':len(row[4:].values) -1, 'trackStart': row['time']}, ignore_index=True)

                    else:
                        # keep track of the changed times
                        # add new trains if they appear (check len of na-free df)
                        logging.debug("time changed without frameshift")

                        success = list()
                        for idx, v in all_vehicles.loc[all_vehicles['active'] == 1].iterrows():
                            all_vehicles.at[idx, 'vehicle'].trackTime(row['time'], row[4:].values)


                        if len(row) > len(prev_row):
                            # if no train has arrived, but the length the countdown arrays has increased, there must be a new train as well.
                            all_vehicles = all_vehicles.append({'vehicle': vehicle(row['time'], row.values[-1], len(row[4:].values) -1), 'arrived':0, 'active': 1, 'lastPos':len(row[4:].values) -1, 'trackStart': row['time']}, ignore_index=True)

                else:
                    # close all open trains and begin new ones
                    logging.debug(f"time differnce over threshold ({row['time'] - prev_row['time']}). closing all trains")
                    for idx, v in all_vehicles.loc[all_vehicles['active'] == 1].iterrows():
                        all_vehicles.at[idx, 'active'] = 0

                    for i, cntdwn in enumerate(row[4:].values):
                        #opening new vehicles
                        all_vehicles = all_vehicles.append({'vehicle': vehicle(row['time'], cntdwn, i), 'arrived':0, 'active': 1, 'lastPos':i, "trackStart": row['time']}, ignore_index=True)
                    
            prev_row = row
            prev_cntwn = row[4:].values

        logging.info('Calculating parameters')

        for i, r in tqdm(all_vehicles.iterrows(), total=len(all_vehicles), disable= True):
            r['vehicle'].calculateDT()

        complete_res = pd.DataFrame({'countdown': [], 'start': [], 'end': [], 'complete': [], 'hour': []})

        for i, v in tqdm(all_vehicles.iterrows(), total=len(all_vehicles), disable= True):
            
            complete_res = complete_res.append(v['vehicle'].times, ignore_index=True)

        return complete_res
       

class vehicle():
    '''
    implements a vehicle class that is used internally to track vehicles across time
    '''
    def __init__(self, firstSeen, cntdwn, position) -> None:
        self.firstSeen = firstSeen # timestamp at which the vehicle was first seen
        self.arrivedAt = None # timestamp at which the vehicle actually arrived
        self.times = pd.DataFrame({'countdown': [cntdwn], 'start': [firstSeen], 'end': ['nan'], 'complete': [False], 'warning':[False]}) # dataframe containing countdown value, start of countdown value and end of countdown value
        self.times = self.times.astype({'countdown': int, 'start': float, 'end': float, 'complete': bool, 'warning': bool})
        self.position = position
        self.arrived = False # to keep track if the vehicle has arrived yet or is still pending

    def arrive(self, time):
        self.arrived = True
        self.arrivedAt = time
        self.position = -1
        self.times.at[self.times.index[-1], 'end'] = time
        if len(self.times) > 1:
            # consider the track complete if there was an entry before
            self.times.at[self.times.index[-1], 'complete'] = True

    def trackTime(self, time, cntwns):
        # refreshes the times dataframe based on a new timestamp and a new countdown array

        try:
            if not self.times.at[self.times.index[-1], 'countdown'] == cntwns[self.position]:
                # if the time changed at the position of the vehicle, set the end time and start a new row
                self.times.at[self.times.index[-1], 'end'] = time

                if len(self.times) > 1:
                    # the first one is always incomplete. afterwards, the tracking is assumed to be complete.
                    self.times.at[self.times.index[-1], 'complete'] = True

                if self.times.at[self.times.index[-1], 'countdown'] < cntwns[self.position]:
                    # in case the countdown increases at a given position, we cannot trust the data anymore.
                    self.times.at[self.times.index[-1], 'warning'] = True

                self.times = self.times.append({'countdown': cntwns[self.position], 'start': time, 'end': pd.NA, 'complete': False, 'warning': False}, ignore_index=True)
        except Exception as e:
            print(f"Exception {e} at time {time}. Continuing.")
            
    def calculateDT(self):
        self.times['dt'] = self.times['end'] - self.times['start']

        hours = list()
        for idx, row in self.times.iterrows():
            t = dt.fromtimestamp(row['start'])
            hours.append(t.hour)

        self.times['hour'] = hours

#d = apiData('./data/scrape.csv.gz')
d = apiData('./data/scrape_subset_last5k.csv')
d.trackMany([['Flurschützstraße / Längenfeldgasse', '63A', 'Am Rosenhügel'],
       ['Flurschützstraße / Längenfeldgasse', '63A',
        'Gesundheitszentrum Süd'],
       ['Flurschützstraße, Längenfeldgasse', '62', 'Oper, Karlsplatz U'],
       ['Flurschützstraße, Längenfeldgasse', 'WLB', 'Wien Oper'],
       ['Längenfeldgasse U', '12A', 'Eichenstraße']], multithreaded=2)

Starting to process ['Flurschützstraße / Längenfeldgasse', '63A', 'Am Rosenhügel']
Starting to process ['Flurschützstraße / Längenfeldgasse', '63A', 'Gesundheitszentrum Süd']


Total Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Processing: ['Flurschützstraße / Längenfeldgasse', '63A', 'Am Rosenhügel']:   0%|          | 0/417 [00:00<?, ?…

Processing: ['Flurschützstraße / Längenfeldgasse', '63A', 'Gesundheitszentrum Süd']:   0%|          | 0/417 [0…

Starting to process ['Flurschützstraße, Längenfeldgasse', '62', 'Oper, Karlsplatz U']


Processing: ['Flurschützstraße, Längenfeldgasse', '62', 'Oper, Karlsplatz U']:   0%|          | 0/417 [00:00<?…

Starting to process ['Flurschützstraße, Längenfeldgasse', 'WLB', 'Wien Oper']


Processing: ['Flurschützstraße, Längenfeldgasse', 'WLB', 'Wien Oper']:   0%|          | 0/416 [00:00<?, ?it/s]

Starting to process ['Längenfeldgasse U', '12A', 'Eichenstraße']


Processing: ['Längenfeldgasse U', '12A', 'Eichenstraße']:   0%|          | 0/416 [00:00<?, ?it/s]


