In [2]:
import os
import pandas as pd
import geopandas as gpd
import multiprocessing as mp
import dask.config
from aad.common.config import Config
from aad.data.loader import DataLoader
from aad.data.preprocessing import DataPreprocessor
from aad.data.annotation import DataAnnotator
from aad.data.sequences import DataSequencer
from aad.data.groundtruth import GroundTruthCollector
from aad.common.core_logging import ProcessLogger

from dask.distributed import Client
def main():
    config = Config()
    logger = ProcessLogger(config, 'logger')
    loader = DataLoader(config)
    # Load all data
    df_sensor, _, df_locations = loader.load_raw_data(label_load=False, location_load=True)
    n_workers: int = min(mp.cpu_count(), config.data_pipeline.NUM_WORKERS)
    dask.config.set({'temporary_directory': r'tmp'})
    # Start Dask cluster for the entire pipeline

    with Client(n_workers=2, threads_per_worker=1) as client:
        mutiplier = [3,6,9,12]
        for m in multiplier:
            OUTPUT_DIR = f'ff_data/output_120min_7h_{m}x'
            MODEL_DIR = f'model_120min_7h_{m}x'
            DATASET_DIR = f'dataset_120min_7h_{m}x'

            # Set environment variables for the Config class to pick up
            os.environ['OUTPUT_DIR'] = OUTPUT_DIR
            os.environ['MODEL_DIR'] = MODEL_DIR
            os.environ['DATASET_DIR'] = DATASET_DIR

            config = Config()
            config.data_pipeline.LOCAL_OFFSET_MINUTES = 420
            config.data_pipeline.WINDOW_DURATION_MINUTES = 120

            print(f'Processing with window size 120 minutes and offset 7h {m} x')
            
            # Preprocessing
            preprocessor = DataPreprocessor(config, multiplier=m, df_sensor=df_sensor,  logger=logger)
            preprocessor.preprocess_data(client=client)
            # Ground Truth Processing
            groundtruth_collector = GroundTruthCollector(config)
            df_groundtruth = groundtruth_collector.collect_groundtruth(start_end_offset_min=180)
            # Annotation (using ground truth as labels)
            annotator = DataAnnotator(config, df_labels=df_groundtruth, df_locations=df_locations, logger=logger)
            annotator.annotate_data(client=client)
            # Sequence creation
            sequencer = DataSequencer(config, logger=logger)
            sequencer.create_dataset(fit_scaler=True)
            %%

        for i in minutes:       
            OUTPUT_DIR = f'D:/ff_data/output_{i}min_0h'
            MODEL_DIR = f'model_{i}min_0h'
            DATASET_DIR = f'dataset_{i}min_0h'

            # Set environment variables for the Config class to pick up
            os.environ['OUTPUT_DIR'] = OUTPUT_DIR
            os.environ['MODEL_DIR'] = MODEL_DIR
            os.environ['DATASET_DIR'] = DATASET_DIR
            sequencer.create_dataset(multiplier=m, fit_scaler=True)
    
            sequencer.create_dataset(fit_scaler=True)

        for i in minutes:       
            OUTPUT_DIR = f'D:/ff_data/output_{i}min_0h'
            MODEL_DIR = f'model_{i}min_0h'
            DATASET_DIR = f'dataset_{i}min_0h'

            # Set environment variables for the Config class to pick up
            os.environ['OUTPUT_DIR'] = OUTPUT_DIR
            os.environ['MODEL_DIR'] = MODEL_DIR
            os.environ['DATASET_DIR'] = DATASET_DIR

main()

NameError: name 'multiplier' is not defined