In [2]:
# Prepare import of modules from parent directory.
import os
import sys
module_path = os.path.abspath(os.path.join('../../'))
if module_path not in sys.path:
    sys.path.append(module_path)

from tqdm.notebook import tqdm
import json

from studies import np_encoder
from studies.cycles.lib import data_processing
from _config import OBSERVATIONS_CSV_FILE, RECONSTRUCTED_CYCLES_JSON_FILE

# Load the observation CSV and group it per thing; the result is iterated by
# thing ID below and indexed to obtain that thing's datastreams.
datastreams_per_thing = data_processing.structure_observation_data(OBSERVATIONS_CSV_FILE)

# Dictionary of things: each thing ID maps to the programs (with their cycles)
# that were reconstructed for that thing.
thing_cycles = {}

# Per-thing tallies: how many things lack the primary signal, the cycle second,
# or the (optional) signal program observations.
primary_signal_missing_count = 0
cycle_second_missing_count = 0
signal_program_missing_count = 0

# Running total of cycles that were skipped because the primary signal was missing.
total_skipped_cycles = 0

for thing in tqdm(datastreams_per_thing):
    datastreams = datastreams_per_thing[thing]

    (
        cycles,
        primary_signal_missing,
        cycle_second_missing,
        skipped_cycles,
    ) = data_processing.reconstruct_cycles(datastreams)

    total_skipped_cycles += skipped_cycles
    primary_signal_missing_count += 1 if primary_signal_missing else 0
    cycle_second_missing_count += 1 if cycle_second_missing else 0

    # Signal program observations are optional: fall back to None when the
    # datastream is absent, and count the thing as missing them in that case
    # (or when the stored value itself is None).
    signal_program_observations = (
        datastreams['signal_program'] if 'signal_program' in datastreams else None
    )
    if signal_program_observations is None:
        signal_program_missing_count += 1

    # Only things with reconstructed cycles AND reconstructed programs are kept;
    # a None result at either stage means the thing is left out of the output.
    if cycles is not None:
        programs = data_processing.reconstruct_programs(cycles, signal_program_observations)
        if programs is not None:
            thing_cycles[thing] = programs

# Persist the reconstructed programs per thing. NpEncoder is passed as the JSON
# encoder class (presumably to serialize NumPy values -- confirm in np_encoder).
with open(RECONSTRUCTED_CYCLES_JSON_FILE, 'w') as f:
    json.dump(thing_cycles, f, indent=4, cls=np_encoder.NpEncoder)

# Summary of the data-quality counters collected during reconstruction.
print(f'Number of things with missing primary signal: {primary_signal_missing_count}')
print(f'Number of things with missing cycle second: {cycle_second_missing_count}')
# This counter was tallied in the loop but never reported before.
print(f'Number of things with missing signal program: {signal_program_missing_count}')
print(f'Number of times we skipped cycles because primary signal was missing: {total_skipped_cycles}')

Amount of things: 19844
Total number of datastreams: 79394
Total number of relevant datastreams: 59473
Number of datastreams not queried: 49166


  0%|          | 0/4875 [00:00<?, ?it/s]

Number of things with missing primary signal: 393
Number of things with missing cycle second: 148
Number of times we skipped cycles because primary signal was missing: 3540
