In [None]:
import os
from solardatatools import DataHandler
import pandas as pd
import boto3
import dask
from dask import delayed, compute, config
from dask.distributed import Client, LocalCluster, performance_report
import json
import click
import tempfile
import csv
from solardatatools.dataio import load_cassandra_data
import numpy as np

In [None]:
def local_csv_to_dh(file):
    """
    Load one local CSV file and wrap it in a solar-data-tools DataHandler.
    Parameters:
    - file: Path to the CSV file. Its first column is used as the index.
    Returns:
    - A tuple (base name of the file, DataHandler built from its contents).
    """
    frame = pd.read_csv(file, index_col=0)
    # The first column was read in as the index; re-interpret it as
    # timestamps before handing the frame over.
    frame.index = pd.to_datetime(frame.index)
    handler = DataHandler(frame)
    return (os.path.basename(file), handler)


def get_csvs_in_dir(folder_path):
    """
    Lists the CSV files that sit directly inside a directory.
    Parameters:
    - folder_path: Directory to scan (sub-directories are not searched).
    Returns:
    - A list with the path (folder_path joined with the entry name) of every
    regular file whose name ends in '.csv'.
    """
    return [
        os.path.join(folder_path, entry)
        for entry in os.listdir(folder_path)
        # Keep regular files only, so a directory named "x.csv" is ignored.
        if os.path.isfile(os.path.join(folder_path, entry))
        and entry.endswith('.csv')
    ]

def run_job(data_result, track_times, local=True):
    """
    Processes a single unit of data using DataHandler.
    Parameters:
    - data_result: Tuple of the data name and its corresponding DataHandler.
                   When `local` is False a third element is required: the
                   column that is passed to the pipeline as `power_col`.
    - track_times: Boolean to determine whether run times are added to the
                   output.
    - local: True (default) runs the pipeline with its default arguments;
             False runs it on the column given in `data_result[2]`.
    Returns:
    - A dictionary containing the name of the data and the processed report.
    If there was an error with processing, only the name of the data is
    returned.
    """
    name = data_result[0]
    data_handler = data_result[1]
    column = None if local else data_result[2]

    try:
        if local:
            data_handler.run_pipeline()
        else:
            data_handler.run_pipeline(power_col=column)
        report = data_handler.report(verbose=False, return_values=True)
        report["name"] = name
        if track_times:
            report["total_time"] = data_handler.total_time
    except Exception:
        # Deliberate best-effort: one failing data set must not abort the
        # whole batch, so it is reported by name only. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        report = {"name": name}
    return report

In [None]:
def generate_task_local(filename, track_times=True):
    """
    Build the delayed analysis task for one local file.

    Parameters:
    - filename: Path of the local CSV file.
    - track_times: Boolean to determine whether run times are added
    to the output.

    Returns:
    - A Dask delayed object for the analysis step; its input is the
    delayed ingest step created for the same file.
    """
    ingest_step = delayed(local_csv_to_dh)(filename)
    return delayed(run_job)(ingest_step, track_times)

In [None]:
def generate_tasks_directory(directory, track_times=True):
    """
    Generate the analysis tasks for a given directory containing csv's.

    Files whose name contains "log.csv" are skipped, so a report log written
    into the same directory is not picked up as input.

    Parameters:
    - directory: Path of the directory containing csv's
    - track_times: Boolean to determine whether run times are added
    to the output. Forwarded to every generated task.

    Returns:
    - A list of Dask delayed task objects for the data analysis,
    each of which depends on an ingest task.
    """
    result = []
    for path in get_csvs_in_dir(directory):
        # Test the file name only: matching against the full path would drop
        # every file when the directory path itself contains "log.csv".
        if "log.csv" in os.path.basename(path):
            continue
        result.append(generate_task_local(path, track_times))
    return result

In [None]:
def write_reports(reports, filename="log.csv"):
    """
    Aggregate reports and write output to a csv file.

    The header is the union of the keys of all reports, in first-seen order.
    A report that lacks a key (for example a failed job that only carries
    "name") gets an empty cell in that column, so every value stays under
    its own header.

    Parameters:
    - reports: iterable of reports. Each report is a dictionary.
    - filename: name of the output csv. An empty string disables writing.
      With no reports (or only empty ones) an empty file is written.
    """
    if filename == "":
        return

    reports = list(reports)
    # dict.fromkeys de-duplicates while keeping the first-seen key order.
    fieldnames = list(dict.fromkeys(key for r in reports for key in r))

    # newline="" is what the csv module asks for; without it extra blank
    # lines appear on platforms that translate "\n".
    with open(filename, "w", newline="") as fp:
        if not fieldnames:
            return
        writer = csv.DictWriter(fp, fieldnames=fieldnames, restval="")
        writer.writeheader()
        writer.writerows(reports)

# Visualize Task Graph

In [None]:
# Build one delayed analysis task per CSV found in the current directory.
obj_list = generate_tasks_directory("./")
# Wrap the report writer in a delayed task that takes every analysis task as
# input; nothing is executed until this task is computed.
aggregate_reports_task = delayed(write_reports)(obj_list, "log.csv")

In [None]:
# Draw the task graph that ends in the report-writing task.
# NOTE(review): dask's visualize presumably needs graphviz installed — confirm.
aggregate_reports_task.visualize()

# Execute Task Graph

In [None]:
def execute_tasks(task_list):
    """
    Run every task in a list and collect the results.

    NOTE: The Dask cluster should be
    initialized before calling this function.

    Parameters:
    - task_list: A list of dask delayed objects

    Returns:
    - The reports produced by the tasks, exactly as returned by
    `compute` when called with each task as a separate argument.
    """
    return compute(*task_list)

In [None]:
# Create the Dask distributed client: 2 workers with 4 threads each.
client = Client(threads_per_worker=4, n_workers=2)

# Alternative that returns the reports instead of writing them to a CSV:
#reports = execute_tasks(obj_list)
# Run the whole graph; its final task writes the reports to "log.csv".
compute(aggregate_reports_task)


In [None]:
# Shut down the Dask client created for the local-CSV run.
client.shutdown()

In [None]:
# TODO: we need functions to generate tasks from remote sources


# each node is a row in the record
# identifier for that file


In [None]:
def remote_site_to_dhs(site, track_times=True, cluster_ip="54.176.95.208"):
    """
    Loads one site from the remote database and builds its analysis tasks.

    The site's data is loaded immediately (not as a delayed task) into a
    single DataHandler; one delayed `run_job` task is then created for every
    entry of the handler's `keys`.

    Parameters:
    - site: remote site, passed to `load_cassandra_data`.
    - track_times: Boolean to determine whether run times are added to the
                   output.
    - cluster_ip: address of the Cassandra cluster to load from. Defaults to
                  the address that was previously hard-coded here.
    Returns:
    - A list of Dask delayed task objects, one per key. Each task is named
      with the stripped site and system identifiers concatenated.
    """
    df = load_cassandra_data(site, cluster_ip=cluster_ip)
    dh = DataHandler(df, convert_to_ts=True)
    # Re-interpret the raw frame's index as integers.
    dh.data_frame_raw.index = dh.data_frame_raw.index.view("int")

    result = []
    # Each key is indexed as ((site, system), column) — presumably the layout
    # produced by convert_to_ts=True; verify against DataHandler.
    for key in dh.keys:
        # Local names differ from the `site` argument so it is not overwritten.
        site_id = key[0][0].strip()
        system_id = key[0][1].strip()
        name = site_id + system_id
        column = key[1]
        # NOTE(review): every task receives the same DataHandler instance;
        # confirm the pipeline can safely be run on it once per column.
        task_analyze = delayed(run_job)((name, dh, column), track_times, False)
        result.append(task_analyze)
    return result

In [None]:
def generate_tasks_remote_database(db_list, track_times=True):
    """
    Generate the analysis tasks for remote database.

    Parameters:
    - db_list: Path of a text file listing the sites of the remote database,
      one site per line. Blank lines are ignored.
    - track_times: Boolean to determine whether run times are added to the
      output. Forwarded to every generated task.

    Returns:
    - A list of Dask delayed task objects for the data analysis, collected
    from `remote_site_to_dhs` for every listed site.
    """
    result = []
    with open(db_list, "r") as file:
        for line in file:
            # Iterating a file yields each line with its line ending still
            # attached; remove it so it does not become part of the site name.
            site = line.rstrip("\r\n")
            if not site.strip():
                continue
            result.extend(remote_site_to_dhs(site, track_times))
    return result

In [None]:
# NOTE(review): absolute, machine-specific path (the directory name also has a
# space before "/resources") — point this at your own site list to run.
db_list = "/Users/cclin/Documents/14798-SLAC Apache Beam Parallel Processing on AWS /resources/db_list"
# Rebinds `obj_list`, replacing the local-CSV task list built earlier.
obj_list = generate_tasks_remote_database(db_list)
obj_list

In [None]:
# Run every remote analysis task.
# NOTE(review): `client.shutdown()` ran in an earlier cell, so the client
# created in this notebook is no longer active here — confirm which scheduler
# is intended for this run.
dask.compute(obj_list)