In [21]:
# Prepare import of modules from parent directory.
import os
import sys
module_path = os.path.abspath(os.path.join('../../'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
from tqdm.notebook import tqdm
import json

from studies import np_encoder
from preparations import things_provider

CSV_FILE = 'observations_example_timerange.csv'

def structure_data(csv_file: str) -> dict:
    """Group the observations of a CSV export by thing and datastream layer.

    The CSV is expected to contain at least the columns ``datastream_id`` and
    ``phenomenon_time``. Observations are sorted by ``phenomenon_time`` and
    split per datastream. The things returned by
    ``things_provider.ThingsProvider().get_things()`` are then used to map
    each relevant datastream (layers ``primary_signal``, ``cycle_second`` and
    ``signal_program``) to the name of the thing it belongs to.

    Args:
        csv_file: Path of the CSV file holding the observations.

    Returns:
        A dict mapping each thing name to a dict that maps the layer name to
        the DataFrame of that datastream's observations. Things without any
        relevant datastream in the CSV do not appear in the result.
    """
    relevant_layers = {'primary_signal', 'cycle_second', 'signal_program'}

    # Parse the CSV file and order the observations by phenomenon_time.
    df = pd.read_csv(csv_file).sort_values(by=['phenomenon_time'])

    # One dataframe per datastream_id. A single groupby pass replaces one
    # boolean-mask scan of the whole frame per datastream; row order inside
    # each group (and therefore the time ordering) is preserved.
    queried_datastreams = {
        datastream_id: observations
        for datastream_id, observations in df.groupby('datastream_id', sort=False)
    }

    tp = things_provider.ThingsProvider()
    things = tp.get_things()

    # Thing name -> {layer name -> observations of that datastream}
    thing_datastreams = {}

    not_returned_by_db_counter = 0
    total_number_of_datastreams = 0
    total_number_of_relevant_datastreams = 0

    for thing in things:
        name = thing['name']
        for datastream in thing['Datastreams']:
            total_number_of_datastreams += 1
            layer_name = datastream['properties']['layerName']
            if layer_name not in relevant_layers:
                continue
            total_number_of_relevant_datastreams += 1
            datastream_id = datastream['@iot.id']
            if datastream_id not in queried_datastreams:
                # The datastream exists but the CSV holds no observation for it.
                not_returned_by_db_counter += 1
                continue
            thing_datastreams.setdefault(name, {})[layer_name] = queried_datastreams[datastream_id]

    print('Total number of datastreams: ' + str(total_number_of_datastreams))
    print('Total number of relevant datastreams: ' + str(total_number_of_relevant_datastreams))
    print('Number of datastreams not queried: ' + str(not_returned_by_db_counter))

    return thing_datastreams
    
def _reconstruct_cycle_list(signal_times: list, signal_results: list, cycle_times: list):
    """Replay the primary signal over time and cut it into cycles.

    A cycle spans two consecutive ``cycle_second`` observations. For every
    second of a cycle the primary signal result valid at that second is
    recorded. Cycles whose start is not covered by the replay (the primary
    signal begins in the middle of the cycle) are skipped and counted.

    Assumes all times are sorted ascending and are integer seconds (the
    result list is built by list repetition) - TODO confirm against the data.
    ``signal_times`` must hold at least two entries.

    Returns:
        Tuple ``(cycles, skipped_cycles)`` where each cycle is a dict with
        the keys ``start``, ``end`` and ``results``.
    """
    cycle_count = len(cycle_times)
    if cycle_count < 2:
        # A cycle needs both a start and an end observation.
        return [], 0

    signal_count = len(signal_times)
    last_cycle_time = cycle_times[-1]

    signal_index = 0
    ticker_second = signal_times[0]
    result = signal_results[0]
    upcoming_time = signal_times[1]

    cycle_index = 0
    cycle_start = cycle_times[0]
    cycle_end = cycle_times[1]

    cycles = []
    current_cycle = None
    # How many times we skipped cycles where the primary signal was missing
    skipped_cycles = 0

    # "+ 1" because the current cycle also needs an end observation.
    while ticker_second < last_cycle_time and cycle_index + 1 < cycle_count:
        # Next cycle: close the running one (or count it as skipped).
        if ticker_second >= cycle_end:
            if current_cycle is None:
                skipped_cycles += 1
            else:
                cycles.append(current_cycle)
                current_cycle = None
            cycle_index += 1
            cycle_start = cycle_times[cycle_index]
            cycle_end = cycle_times[cycle_index + 1]

        # Next primary signal observation becomes the active result.
        if upcoming_time is not None and ticker_second >= upcoming_time:
            signal_index += 1
            result = signal_results[signal_index]
            if signal_index + 1 < signal_count:
                upcoming_time = signal_times[signal_index + 1]
            else:
                upcoming_time = None

        # A cycle is only recorded if the replay hits its exact start.
        if current_cycle is None and ticker_second == cycle_start:
            current_cycle = {
                'start': cycle_start,
                'end': cycle_end,
                'results': []
            }

        if current_cycle is not None and ticker_second >= cycle_start:
            # Jump to whichever comes first: the next signal change or the
            # end of the cycle, filling the gap with the active result.
            diff = cycle_end - ticker_second
            if upcoming_time is not None:
                diff = min(upcoming_time - ticker_second, diff)
            current_cycle['results'].extend([result] * diff)
            ticker_second += diff
        else:
            ticker_second += 1

    # The loop only stores a cycle when the following one begins, so the
    # cycle ending at the final cycle_second observation must be stored here.
    if current_cycle is not None and ticker_second >= current_cycle['end']:
        cycles.append(current_cycle)

    return cycles, skipped_cycles


def _group_cycles_by_program(cycles: list, program_times: list, program_results: list) -> dict:
    """Assign every cycle to the signal program active at the cycle's start.

    Without any signal program observation all cycles are filed under the
    key ``'unknown'``. A cycle that begins before and ends after the start
    of the program it would be assigned to is dropped.
    """
    unknown_program_identifier = 'unknown'
    if not program_times:
        return {unknown_program_identifier: cycles}

    programs = {}
    program_count = len(program_times)
    program_index = 0
    for cycle in cycles:
        # Advance past every program change that happened before this cycle;
        # more than one change can lie between two recorded cycles.
        while program_index + 1 < program_count and cycle['start'] >= program_times[program_index + 1]:
            program_index += 1

        program_start = program_times[program_index]
        if cycle['start'] < program_start and cycle['end'] > program_start:
            # Cycle starts before the program starts and ends after the program starts (something wrong in data).
            continue

        programs.setdefault(str(program_results[program_index]), []).append(cycle)

    return programs


def reconstruct_cycles(datastreams: dict):
    """Reconstruct the signal cycles of one thing, grouped by signal program.

    datastreams should be a dict with the following structure:
    {
        "primary_signal": pd.DataFrame,
        "cycle_second": pd.DataFrame,
        "signal_program": pd.DataFrame
    }
    Each DataFrame needs the columns ``phenomenon_time`` and ``result`` and
    is expected to be sorted by ``phenomenon_time``. The ``signal_program``
    entry is optional.

    Returns:
        Tuple ``(programs, primary_signal_missing, cycle_second_missing,
        signal_program_missing, skipped_cycles)``. ``programs`` maps the
        program identifier (``str`` of the program result, or ``'unknown'``)
        to the list of cycles, each a dict with ``start``, ``end`` and
        ``results``. ``programs`` is ``None`` and ``skipped_cycles`` is ``0``
        if the primary signal or the cycle second datastream is missing.

    Raises:
        IndexError: If fewer than two primary signal observations exist.
    """
    signal_programs = datastreams.get('signal_program')
    primary_signal_missing = 'primary_signal' not in datastreams
    cycle_second_missing = 'cycle_second' not in datastreams
    if primary_signal_missing or cycle_second_missing:
        return None, primary_signal_missing, cycle_second_missing, signal_programs is None, 0

    primary_signals = datastreams['primary_signal']
    cycle_seconds = datastreams['cycle_second']

    if len(primary_signals) < 2:
        # IndexError is what previously escaped from the positional lookup;
        # keep the type but state the actual reason.
        raise IndexError('Not enough primary signals to reconstruct cycles.')

    # Read the needed columns once; positional row lookups on the DataFrame
    # inside the replay loop are far slower than list indexing.
    cycles, skipped_cycles = _reconstruct_cycle_list(
        primary_signals['phenomenon_time'].tolist(),
        primary_signals['result'].tolist(),
        cycle_seconds['phenomenon_time'].tolist(),
    )

    if signal_programs is None or len(signal_programs) == 0:
        programs = _group_cycles_by_program(cycles, [], [])
    else:
        programs = _group_cycles_by_program(
            cycles,
            signal_programs['phenomenon_time'].tolist(),
            signal_programs['result'].tolist(),
        )

    return programs, primary_signal_missing, cycle_second_missing, signal_programs is None, skipped_cycles

    
datastreams_per_thing = structure_data(CSV_FILE)

# Reconstructed programs (each holding its cycles) per thing, using the same
# keys as datastreams_per_thing.
thing_cycles = {}

# Tallies over all things, reported once the reconstruction is done.
primary_signal_missing_count = 0
cycle_second_missing_count = 0
signal_program_missing_count = 0
total_skipped_cycles = 0

for thing, datastreams in tqdm(datastreams_per_thing.items()):
    (
        programs,
        primary_signal_missing,
        cycle_second_missing,
        signal_program_missing,
        skipped_cycles,
    ) = reconstruct_cycles(datastreams)

    total_skipped_cycles += skipped_cycles
    primary_signal_missing_count += int(primary_signal_missing)
    cycle_second_missing_count += int(cycle_second_missing)
    signal_program_missing_count += int(signal_program_missing)

    # Things without reconstructable cycles are left out of the export.
    if programs is not None:
        thing_cycles[thing] = programs

# NpEncoder comes from the project's studies package and is passed as the
# JSON encoder class for the reconstructed values.
with open('observations_example_timerange_reconstructed_cycles.json', 'w') as f:
    json.dump(thing_cycles, f, indent=4, cls=np_encoder.NpEncoder)

print(f'Number of things with missing primary signal: {primary_signal_missing_count}')
print(f'Number of things with missing cycle second: {cycle_second_missing_count}')
print(f'Number of times we skipped cycles because primary signal was missing: {total_skipped_cycles}')

    


Amount of things: 19844
Total number of datastreams: 79394
Total number of relevant datastreams: 59473
Number of datastreams not queried: 49272


  0%|          | 0/4820 [00:00<?, ?it/s]

Number of things with missing primary signal: 396
Number of things with missing cycle second: 141
Number of times we skipped cycles because primary signal was missing: 3508
