In [1]:
# Load IPython's autoreload extension; mode 2 picks up edits to imported
# modules live, so the kernel does not need a restart after changing sdt_dask.
%load_ext autoreload
%autoreload 2

In [2]:
import glob, os
from sdt_dask.dask_tool.sdt_dask import SDTDask

In [3]:
# IPython help syntax: show the docstring of SDTDask.
SDTDask?

## Local Client example
Use the local CSV data plug (`LocalFiles`) together with a local Dask client.

### Instantiate a `LocalFiles` data plug

In [4]:
from sdt_dask.dataplugs.csv_plug import LocalFiles

In [5]:
# IPython help syntax: show the docstring of LocalFiles.
LocalFiles?

In [6]:
# Directory holding the example CSV files, relative to this notebook.
path = "../dataplugs/example_data/"
local_file_data_plug = LocalFiles(path_to_files=path)

# Build one single-element tuple per data file: the file name without its
# extension.
#   * "*.csv" keeps stray files in the directory (hidden files, checkpoints,
#     READMEs) from being turned into keys;
#   * os.path.splitext replaces the hard-coded [:-4] slice, so the key does
#     not depend on the extension being exactly four characters long;
#   * sorted() makes the key order deterministic -- glob returns matches in
#     arbitrary order, and the next cell selects a key by position.
# NOTE(review): assumes the plug resolves a key to "<key>.csv" -- confirm
# against LocalFiles.
local_file_keys = sorted(
    (os.path.splitext(os.path.basename(fname))[0],)
    for fname in glob.glob(os.path.join(path, "*.csv"))
)
local_file_keys

[('TAAI01129193',), ('TABC01142170',), ('TABD01110568',)]

In [7]:
# Load the data for the second key (index 1) through the plug; the returned
# object is displayed as the cell output.
local_file_data_plug.get_data(local_file_keys[1])

Loading file TABC01142170...


Unnamed: 0_level_0,ac_power_01,ac_power_02,ac_power_03
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-12-31 16:00:00,0.0767,0.0928,0.0928
2013-12-31 16:05:00,0.0695,0.0862,0.0819
2013-12-31 16:10:00,0.0626,0.0726,0.0653
2013-12-31 16:15:00,0.0546,0.0639,0.0613
2013-12-31 16:20:00,0.0418,0.0530,0.0516
...,...,...,...
2019-06-20 15:30:00,0.5872,0.5950,0.2445
2019-06-20 15:35:00,0.6454,0.6506,0.2748
2019-06-20 15:40:00,0.6220,0.6313,0.2645
2019-06-20 15:45:00,0.5259,0.5338,0.2238


### Set up a local user-defined client

In [8]:
from dask.distributed import Client

In [9]:
# Sizing of the local cluster. The total memory budget is divided evenly
# between the workers and passed to the client as a "<value>GiB" string.
n_workers = 1
threads_per_worker = 1
total_system_memory = 8  # in GiB, see memory_limit below
memory_per_worker = total_system_memory / n_workers

# One keyword per line so each setting can be read and tuned on its own.
# The spill and pause fractions are passed as False; the target fraction is 0.8.
local_client = Client(
    processes=False,
    memory_spill_fraction=False,
    memory_pause_fraction=False,
    memory_target_fraction=0.8,
    n_workers=n_workers,
    threads_per_worker=threads_per_worker,
    memory_limit=f"{memory_per_worker}GiB",
)

# Last expression of the cell: shows the dashboard URL as the cell output.
local_client.dashboard_link



'http://192.168.1.157:8787/status'

In [10]:
# Pair the local-file data plug with the local client, then run the tool over
# every key with shift fixing and verbose output enabled.
dask_tool = SDTDask(local_file_data_plug, local_client)
dask_tool.execute(
    local_file_keys,
    fix_shifts=True,
    verbose=True,
)

Loading file TAAI01129193...
Loading file TABD01110568...

            *********************************************
            * Solar Data Tools Data Onboarding Pipeline *
            *********************************************

            This pipeline runs a series of preprocessing, cleaning, and quality
            control tasks on stand-alone PV power or irradiance time series data.
            After the pipeline is run, the data may be plotted, filtered, or
            further analyzed.

            Authors: Bennet Meyers and Sara Miskovich, SLAC

            (Tip: if you have a mosek [https://www.mosek.com/] license and have it
            installed on your system, try setting solver='MOSEK' for a speedup)

            This material is based upon work supported by the U.S. Department
            of Energy's Office of Energy Efficiency and Renewable Energy (EERE)
            under the Solar Energy Technologies Office Award Number 38529.

            


task list: 100%|██████████████████████████████████| 7/7 [00:25<00:00,  3.70s/it]




total time: 25.93 seconds
--------------------------------
Breakdown
--------------------------------
Preprocessing              9.39s
Cleaning                   4.15s
Filtering/Summarizing      12.40s
    Data quality           0.26s
    Clear day detect       0.61s
    Clipping detect        3.36s
    Capacity change detect 8.17s


            *********************************************
            * Solar Data Tools Data Onboarding Pipeline *
            *********************************************

            This pipeline runs a series of preprocessing, cleaning, and quality
            control tasks on stand-alone PV power or irradiance time series data.
            After the pipeline is run, the data may be plotted, filtered, or
            further analyzed.

            Authors: Bennet Meyers and Sara Miskovich, SLAC

            (Tip: if you have a mosek [https://www.mosek.com/] license and have it
            installed on your system, try setting solver='MOSEK' for a sp

task list: 100%|██████████████████████████████████| 7/7 [00:25<00:00,  3.62s/it]




total time: 25.37 seconds
--------------------------------
Breakdown
--------------------------------
Preprocessing              9.94s
Cleaning                   4.20s
Filtering/Summarizing      11.23s
    Data quality           0.32s
    Clear day detect       0.59s
    Clipping detect        2.78s
    Capacity change detect 7.55s

Loading file TABC01142170...

            *********************************************
            * Solar Data Tools Data Onboarding Pipeline *
            *********************************************

            This pipeline runs a series of preprocessing, cleaning, and quality
            control tasks on stand-alone PV power or irradiance time series data.
            After the pipeline is run, the data may be plotted, filtered, or
            further analyzed.

            Authors: Bennet Meyers and Sara Miskovich, SLAC

            (Tip: if you have a mosek [https://www.mosek.com/] license and have it
            installed on your system, try se

task list: 100%|██████████████████████████████████| 7/7 [00:22<00:00,  3.16s/it]




total time: 22.10 seconds
--------------------------------
Breakdown
--------------------------------
Preprocessing              9.78s
Cleaning                   4.32s
Filtering/Summarizing      8.00s
    Data quality           0.32s
    Clear day detect       0.57s
    Clipping detect        2.26s
    Capacity change detect 4.85s


## AWS Fargate Client example
Use the PVDAQ data plug (`PVDAQPlug`) together with an AWS Fargate client.

### Instantiate a PVDAQ data plug


In [11]:
from sdt_dask.dataplugs.pvdaq_plug import PVDAQPlug

In [12]:
# IPython help syntax: show the docstring of PVDAQPlug.
PVDAQPlug?

In [13]:
# Create the PVDAQ data plug using its default constructor arguments.
pvdaq_data_plug = PVDAQPlug()

### Set up a Fargate cluster client


In [14]:
from sdt_dask.clients.aws.fargate import Fargate

In [15]:
# The user should define these environment variables before running the
# notebook, so that no secret key is stored in the notebook itself. Each
# lookup yields None when the variable is not set.
# NOTE(review): "project-pa-number" contains hyphens, which POSIX shells do
# not accept in `export` -- confirm how this variable is meant to be set.
PA_NUMBER = os.environ.get("project-pa-number")
AWS_DEFAULT_REGION = os.environ.get("AWS_DEFAULT_REGION")

# Credentials passed to the cluster, keyed by their environment-variable names.
ENVIRONMENT = {
    variable: os.environ.get(variable)
    for variable in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
}

In [16]:
# User-defined deployment settings. The tags, VPC, image, worker count,
# threads per worker and environment all have to be chosen by the user and
# passed to the client class.

# Cluster size.
WORKERS = 3
THREADS_PER_WORKER = 1

# Container image for the cluster.
IMAGE = "nimishy/sdt-windows:latest"

# NOTE(review): this VPC id is marked as belonging to us-west-2 -- confirm it
# matches the region the client is started in.
VPC = "vpc-ab2ff6d3"  # for us-west-2

# Tags attached to the cluster.
TAGS = {"project-pa-number": PA_NUMBER, "project": "pvinsight"}

In [17]:
# Create the Fargate helper and ask it for a client configured with the
# user-defined image, tags, VPC, region, credentials and cluster size.
fargate_client = Fargate().init_client(
    image=IMAGE,
    tags=TAGS,
    vpc=VPC,
    region_name=AWS_DEFAULT_REGION,
    environment=ENVIRONMENT,
    n_workers=WORKERS,
    threads_per_worker=THREADS_PER_WORKER,
)

[i] Initilializing Fargate Cluster ...


  next(self.gen)


[i] Initialized Fargate Cluster
[i] Initilializing Dask Client ...
[>] Dask Dashboard: http://54.188.106.107:8787/status


In [18]:
# Keys handed to the PVDAQ plug, one tuple per dataset.
# NOTE(review): presumably (site id, year) pairs -- confirm against PVDAQPlug.
pvdaq_keys = [
    (34, 2011),
    (35, 2015),
    (51, 2012),
]

# Pair the PVDAQ plug with the Fargate client and run the tool over every key.
dask_tool = SDTDask(pvdaq_data_plug, fargate_client)
dask_tool.execute(
    pvdaq_keys,
    fix_shifts=True,
    verbose=True,
)

## Azure Client example