In [1]:
from sdt_dask.dataplugs.csv_plug import LocalFiles
import glob
import os

# NOTE(review): machine-specific absolute path -- point this at your own data
# directory before running the notebook.
path = "c:\\Users\\Zander\\Documents\\spw_sensor_0\\"
dataplug = LocalFiles(path_to_files=path)

# One single-field key tuple per file: the file name without its extension.
# - sorted() makes the [:4] subset below reproducible (glob order is arbitrary)
# - splitext() strips the extension whatever its length instead of assuming
#   exactly four characters (".csv")
KEYS = [
    (os.path.splitext(os.path.basename(fname))[0],)
    for fname in sorted(glob.glob(os.path.join(path, "*")))
]
# Only the first four files are used for the small benchmarks in this notebook.
KEYS = KEYS[:4]
KEYS

[('001C4B0008A5',), ('001C4B001069',), ('0022F2006723',), ('0022F20139C6',)]

In [2]:
from sdt_dask.clients.local_client import LocalClient

# Configure a local Dask client with 4 workers, 1 thread per worker and
# memory=6.0 per worker (reported as 6.00 GiB in the cluster summary output).
client_setup = LocalClient(workers=4, threads=1, memory=6.0)
client = client_setup.init_client()
# Last expression of the cell: renders the client/cluster summary.
client


Dask Dashboard Link: http://127.0.0.1:8787/status


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 4,Total memory: 24.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:57611,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 4
Started: Just now,Total memory: 24.00 GiB

0,1
Comm: tcp://127.0.0.1:57633,Total threads: 1
Dashboard: http://127.0.0.1:57635/status,Memory: 6.00 GiB
Nanny: tcp://127.0.0.1:57614,
Local directory: C:\Users\Zander\AppData\Local\Temp\dask-scratch-space\worker-teh8r3dm,Local directory: C:\Users\Zander\AppData\Local\Temp\dask-scratch-space\worker-teh8r3dm

0,1
Comm: tcp://127.0.0.1:57639,Total threads: 1
Dashboard: http://127.0.0.1:57640/status,Memory: 6.00 GiB
Nanny: tcp://127.0.0.1:57615,
Local directory: C:\Users\Zander\AppData\Local\Temp\dask-scratch-space\worker-tpn9r7jx,Local directory: C:\Users\Zander\AppData\Local\Temp\dask-scratch-space\worker-tpn9r7jx

0,1
Comm: tcp://127.0.0.1:57630,Total threads: 1
Dashboard: http://127.0.0.1:57631/status,Memory: 6.00 GiB
Nanny: tcp://127.0.0.1:57616,
Local directory: C:\Users\Zander\AppData\Local\Temp\dask-scratch-space\worker-sw34dabv,Local directory: C:\Users\Zander\AppData\Local\Temp\dask-scratch-space\worker-sw34dabv

0,1
Comm: tcp://127.0.0.1:57634,Total threads: 1
Dashboard: http://127.0.0.1:57636/status,Memory: 6.00 GiB
Nanny: tcp://127.0.0.1:57617,
Local directory: C:\Users\Zander\AppData\Local\Temp\dask-scratch-space\worker-_49ddom1,Local directory: C:\Users\Zander\AppData\Local\Temp\dask-scratch-space\worker-_49ddom1


In [3]:
import dask.bag as db
import pandas as pd
from dask import delayed
from dask.distributed import performance_report, print
from solardatatools import DataHandler

In [4]:
def get_data(key):
    """Load the data for one key and wrap it in a DataHandler.

    A one-row status dataframe is always built first. It holds one
    ``key_field_<i>`` column per element of ``key`` and one ``"<stage> error"``
    column per processing stage, each initialised to ``"No error"``.

    :param key: The key combination identifying the file
    :type key: tuple
    :return: ``(status dataframe, datahandler)`` on success; ``(status
        dataframe,)`` with the exception text stored in ``"get_data error"``
        when reading the data or creating the DataHandler raised
    :rtype: tuple"""

    # Stages whose error status is tracked for every key
    stages = (
        "get_data",
        "run_pipeline",
        "run_pipeline_report",
        "run_loss_analysis",
        "run_loss_analysis_report",
    )

    # Key columns first, then the error columns, all in a single row
    row = {f"key_field_{position}": field for position, field in enumerate(key)}
    row.update({f"{stage} error": ["No error"] for stage in stages})
    data_df = pd.DataFrame.from_dict(row)

    try:
        # The dataplug reads the file into pandas (not into a dask
        # collection), so the data sits in memory for the later stages.
        frame = dataplug.get_data(key)
        handler = DataHandler(frame)
    except Exception as exc:
        data_df["get_data error"] = str(exc)
        return (data_df,)
    return (data_df, handler)

In [5]:
def run_pipeline(data_tuple, **kwargs):
    """Run the DataHandler pipeline and add its report to the key dataframe.

    Exceptions raised by the pipeline or by the report are not propagated;
    their text is stored in the matching error column of the dataframe.

    :param data_tuple: ``(key dataframe, datahandler)``, or ``(key
        dataframe,)`` when ``get_data`` failed
    :type data_tuple: tuple
    :param kwargs: The keyword arguments passed to the datahandler's
        run_pipeline
    :type kwargs: dict
    :return: ``(key dataframe, datahandler)``, or ``(key dataframe,)`` when
        no datahandler was available
    :rtype: tuple
    """
    data_df = data_tuple[0]

    # Without a datahandler nothing can run: flag every later stage and stop.
    if data_df.iloc[0]["get_data error"] != "No error":
        skipped = "get_data error lead to nothing"
        for column in (
            "run_pipeline error",
            "run_pipeline_report error",
            "run_loss_analysis error",
            "run_loss_analysis_report error",
        ):
            data_df[column] = skipped
        return (data_df,)

    datahandler = data_tuple[1]

    try:
        datahandler.run_pipeline(**kwargs)
        # Loss analysis is switched off for data sets of at most 365 days.
        if datahandler.num_days <= 365:
            data_df["run_loss_analysis error"] = (
                "The length of data is less than or equal to 1 year, loss analysis will fail thus is not performed."
            )
            data_df["run_loss_analysis_report error"] = (
                "Loss analysis is not performed"
            )
    except Exception as exc:
        data_df["run_pipeline error"] = str(exc)
        downstream = "Failed because of run_pipeline error"
        for column in (
            "run_loss_analysis error",
            "run_pipeline_report error",
            "run_loss_analysis_report error",
        ):
            data_df[column] = downstream

    # Nothing to report when the pipeline itself failed.
    if data_df.iloc[0]["run_pipeline error"] != "No error":
        return (data_df, datahandler)

    # Append the pipeline report values as new columns.
    try:
        pipeline_report = datahandler.report(return_values=True, verbose=False)
        data_df = data_df.assign(**pipeline_report)
    except Exception as exc:
        data_df["run_pipeline_report error"] = str(exc)
        print(exc)

    # Record the pipeline runtime; a failure here is only printed.
    try:
        data_df["runtimes"] = datahandler.total_time
    except Exception as exc:
        print(exc)

    return (data_df, datahandler)

In [6]:
def run_loss_analysis(data_tuple):
    """Run the loss factor analysis and append its report to the key dataframe.

    The analysis only runs when the ``"run_loss_analysis error"`` column still
    reads ``"No error"``; otherwise the dataframe is returned untouched.
    Exceptions are not propagated, their text is stored in the error columns.

    :param data_tuple: A tuple containing the key dataframe and the
        datahandler object. The datahandler is only accessed when the
        analysis actually runs.
    :type data_tuple: tuple
    :return: key dataframe with appended reports and assigned error values
    :rtype: Pandas DataFrame"""
    data_df = data_tuple[0]

    # An earlier stage already recorded why the analysis must be skipped.
    if data_df.iloc[0]["run_loss_analysis error"] != "No error":
        return data_df

    datahandler = data_tuple[1]
    try:
        datahandler.run_loss_factor_analysis(verbose=True)
    except Exception as e:
        data_df["run_loss_analysis error"] = str(e)
        data_df["run_loss_analysis_report error"] = (
            "Failed because of run_loss_analysis error"
        )
        # Stop here: asking a failed analysis for its report would only raise
        # a secondary exception that overwrites the message set above.
        return data_df

    # Append the loss report values as new columns.
    try:
        loss_report = datahandler.loss_analysis.report()
        data_df = data_df.assign(**loss_report)
    except Exception as e:
        data_df["run_loss_analysis_report error"] = str(e)

    return data_df

# Performance Without Dask

In [None]:
%%time
# Serial baseline: the three stages are called directly, one key after the
# other, without any Dask wrapper, so the cell's wall time can be compared with
# the Dask variants further down.
results = []
for key in KEYS:
    # Each stage consumes the tuple returned by the previous one.
    data_tuple_0 = get_data(key)
    data_tuple_1 = run_pipeline(data_tuple_0, fix_shifts=True, verbose=False)
    result_df = run_loss_analysis(data_tuple_1)
    results.append(result_df)
# Stack the per-key summary dataframes into one table and display it.
df = pd.concat(results)
df

# ~Dask Bag Version~
The Dask Bag documentation states that **Bags efficiently utilize multiprocessor scheduling**.
Using Dask Bags together with pandas DataFrames keeps the Dask runner tool extensible, because both DataFrames and Bags can hold and operate on large amounts of data. Each file's report(s) is stored as a DataFrame row, and further values can be appended to that row, which can be taken advantage of when adding features and reports to the runner tool.
## Performance Variation with Dask Bags and Partition Numbers
This section records the change in runtimes and dask bag computations for various files. The dask configuration includes 4 workers 1 thread and 6 GB of memory per worker.

| Files | Tasks  | Partitions | Duration   | Min_Runtime | Max_Runtime | Method             |
|-------|--------|------------|------------|-------------|-------------|--------------------|
| 4     | -      | -          | 58.6s      | -           | -           | -                  |
| **4** | **13** | -          | **42.30s** | 13.09s      | 19.49s      | **Delayed**        | here only the functions are delayed
| **4** | **13** | -          | **59.75s** | 20.65s      | 32.73s      | **Delayed (data)** | here the data is also delayed
| 4     | 3      | 1          | 97.48s     | 4.09s       | 10.34s      | Bags               |
| 4     | 12     | 4          | 51.98s     | 4.45s       | 18.90s      | Bags               |
| 4     | 12     | 8          | 48.74s     | 5.08s       | 16.33s      | Bags               |
| 4     | 12     | 10         | 48.28s     | 5.07s       | 15.81s      | Bags               |
| **4** | **12** | **16**     | **48.25s** | 4.72s       | 14.98s      | **Bags**           |
| **4** | **4**  | -          | **34.72s** | 84.48s      | 92.65s      | **Futures**        |
| 4     | 4      | -          | 112.63s    | 79.81s      | 86.96s      | Futures            |

Here different Dask methods are called on 4 files to understand the performance of each method. 
The first row contains the execution time for computing the 4 files only using python.
The next 2 rows compute the data using delayed. The difference is that in the second row we only delay the function calls and not the data. In the third row, the results after each function are also delayed.

Observations for 4 files and different n_partition values
   * for 4 files, 3 tasks and 1 partition dask computation takes **97.48s**. each task stream is computed separately and only 1 worker of the 4 workers is used. run_pipeline runtimes are very low ranging between 4.09s and 10.34s. As only one worker is computing the task graph it only computes 3 tasks in a loop for each file.
   * for 4 files, 12 tasks and 4 partitions dask computation takes **51.98s**. all workers are computing tasks simultaneously. The run_pipeline runtimes for each file ranges between 4.45s and 18.9s.
   * for 4 files, 12 tasks and 8 partitions dask computation takes **48.74s**. all workers are active and computing simultaneously. The run_pipeline runtimes for each file ranges between 5.08s and 16.33s.
   * for 4 files, 12 tasks and 10 partitions dask computation takes **48.28s**. all workers are active and computing simultaneously. The run_pipeline runtimes for each file ranges between 5.07 and 15.81s.
   * for 4 files, 12 tasks and 16 partitions dask computation takes **48.25s**. all workers are active and computing simultaneously. The run_pipeline runtimes for each file ranges between 4.72s and 14.98s.


| Files  | Tasks  | Partitions | Duration    | Min_Runtime | Max_Runtime | Method |
|--------|--------|------------|-------------|-------------|-------------|--------|
| 12     | 12     | 4          | 484.70s     | 4.43s       | 44.46s      | Bags   |
| **12** | **36** | **12**     | **360.10s** | 5.56s       | 47.16s      | Bags   | 
| 12     | 36     | 24         | 362.65s     | 6.55s       | 46.63s      | Bags   |
| 12     | 36     | 48         | 370.56s     | 6.05s       | 45.49s      | Bags   |  

Observations for 12 files and different n_partition values
   - for 12 files, 12 tasks and 4 partitions dask computation takes **484.70s**. all workers are computing tasks simultaneously. The run_pipeline runtimes for each file ranges between 4.43s and 44.46s. Since partitions are less than the number of files each partitions takes 3 file tasks and computes them in a loop. This observation can also be inferred for the 4 files and 1 n_partitions case.
   - for 12 files, 36 tasks and 12 partitions dask computation takes **360.10s**. all workers are computing tasks simultaneously. The run_pipeline runtimes for each file ranges between 5.56s and 47.16s.
   - for 12 files, 36 tasks and 24 partitions dask computation takes **362.65s**. all workers are computing tasks simultaneously. The run_pipeline runtimes for each file ranges between 6.55s and 46.63s.
   - for 12 files, 36 tasks and 48 partitions dask computation takes **370.56s**. all workers are computing tasks simultaneously. The run_pipeline runtimes for each file ranges between 6.05s and 45.49s.
    
## Pending...
Dask Recommends dividing partitions so that each partition memory is between 100-300 MB.

From the above observations we can confirm that for efficient Dask computation npartitions should be twice the number of KEYS. Running the test by passing around the DataFrames optimizes the runner tool, making the partitioned computation more efficient. The best-performing value for n_partitions is **2 or more times the number of keys**, as seen in the test results for a large number of files. Larger partitions would increase transfer and compute time, making them less effective for computation.
If in the future more DataHandler computations are added please test the above-mentioned assumption to confirm if that is the cause for optimization.

## Performance Comparisons Between Dask Delayed and Dask Bag
For this test we've used the following configurations:
- **Workers**: 4
- **Threads**: 1
- **Memory**: 6
- All 137 CSV Files were computed for the test using the Local Cluster

The following observations were made while comparing the Dask Reports for Delayed Runner Tool and Dask Bags Runner Tool. The runner tool in this file utilizes pandas DataFrame to keep track of the DataHandlers and their errors. The idea is that Dask Bags maps the functions to the workers, meaning that instead of passing around data, the workers are passed to the data. ***Development with dask futures is not recommended as it isn't effective over a large number of inputs***

| Files | Tasks | Partitions | Duration | Compute_Time | Transfer_Time |
|-------|-------|------------|----------|--------------|---------------|
| 137   | 686   | -          | 76m 27s  | 5h 02m       | 287.99ms      |
| 137   | 411   | 137        | 76m 25s  | 5h 04m       | 48.13ms       |
| 137   | 411   | 274        | 78m 04s  | 5h 10m       | 09.59ms       |
| 137   | 411   | 548        | 75m 29s  | 5h 00m       | 67.23ms       |

- Dask Delayed Duration takes **76m 27s** , number of tasks **686**, compute time **5hr 2m**, Transfer Time: **287.99ms**. No Data Transfers were observed.
- Dask Bags Duration with 137 npartitions takes **76m 25s**, number of tasks **411**, compute time **5hr 4m**, Transfer Time: **48.13ms**. 3 data-transfers were observed. File runtime ranges between 5.06s and 54.65s
- Dask Bags Duration with 274 (137*2) npartitions takes **78m 04s**, number of tasks **411**, compute time **5hr 10m**, Transfer Time: **9.59ms**. 7 data-transfers were observed. File runtime ranges between 5.49s and 48.51s
- Dask Bags Duration with 548 (137*4) npartitions takes **75m 29s**, number of tasks **411**, compute time **5hr 00m**, Transfer Time: **67.23ms**. 6 data-transfers were observed. File runtime ranges between 6.18s and 50.67s


In [33]:
# npartitions sets how many partitions the Dask Bag built from KEYS is split
# into. There need to be enough partitions to keep every core/worker busy; a
# few times more partitions than cores/workers is recommended.
part = len(KEYS) * 4
# Lazy pipeline: each key flows through get_data -> run_pipeline ->
# run_loss_analysis. Nothing is computed until .compute() is called.
dask_bag = (
    db.from_sequence(KEYS, npartitions=part)
    .map(get_data)
    .map(run_pipeline, fix_shifts=True, verbose=False)
    .map(run_loss_analysis)
)
# result_bag = dask_bag.compute()
# result_df = pd.concat(result_bag)
# result_df

In [34]:
from time import strftime

# Make sure the output directory exists before the computation starts, so that
# neither the Dask performance report nor the CSV export fails on a missing
# folder after the work is already done.
os.makedirs("../results", exist_ok=True)

# File names carry the start time, the number of keys and the partition count.
timestamp = strftime("%Y%m%d-%H%M%S")
with performance_report(f"../results/dask_report_{timestamp}-{len(KEYS)}-{part}p.html"):
    # Triggers the bag computation; yields one summary dataframe per key.
    result_bag = dask_bag.compute()
    result_df = pd.concat(result_bag)
    result_df.to_csv(f"../results/summary_report_{timestamp}-{len(KEYS)}-{part}p.csv")
    display(result_df)

Unnamed: 0,key_field_0,get_data error,run_pipeline error,run_pipeline_report error,run_loss_analysis error,run_loss_analysis_report error,length,capacity,sampling,quality score,...,runtimes,degradation rate [%/yr],deg rate lower bound [%/yr],deg rate upper bound [%/yr],total energy loss [kWh],degradation energy loss [kWh],soiling energy loss [kWh],capacity change energy loss [kWh],weather energy loss [kWh],system outage loss [kWh]
0,001C4B0008A5,No error,No error,No error,No error,No error,1.082192,3.0012,15,0.918987,...,32.065503,-9.145079,-13.016003,-5.760906,-1822868.0,-345117.215781,-89326.273306,-627.176715,-884933.350488,-502863.931619
0,001C4B001069,No error,No error,No error,No error,No error,1.090411,2.87535,15,0.932161,...,30.086473,-5.990663,-11.521941,-2.578078,-1408609.0,-209369.137545,-100493.109595,-687.239896,-748430.245619,-349629.70922
0,0022F2006723,No error,No error,No error,The length of data is less than or equal to 1 ...,Loss analysis is not performed,0.613699,6.34,15,0.857143,...,23.882537,,,,,,,,,
0,0022F20139C6,No error,No error,No error,No error,No error,1.479452,6.7759,15,0.374074,...,21.212132,-1.604899,-3.481638,0.396715,-15694.22,-191.23794,-44.315571,-0.715365,-1533.254237,-13924.699277


# Dask Delayed Version 1.0



The configuration for the runs and results are as follows:
- **Workers**: 4
- **Threads**: 1
- **Memory**: 6

The following observations were made when using Dask Delayed with the runner tool.
- For 4 Files, 13 tasks, the Dask runtime is **41.75s** which is **6.5s less** than the duration with Dask Bags. The runtimes for the files ranged between 11.31s and 19.74s.
- For 12 Files, 37 tasks, the Dask runtime is **316.27s** , which is **43.83s less** than the duration with Dask bags. The runtimes for the files ranged between 6.36s and 57.44s.
- For 137 Files, 412 tasks, the Dask runtime is **80m 34s** , which is **5m 5s more** than the duration with Dask bags. The runtimes for the files ranged between 6.13s and 55.89s.

### Comparison and Results Between Different Workers


| Files   | Tasks   | Partitions | Workers | Threads | Memory   | Dask    | Durations   |
|---------|---------|------------|---------|---------|----------|---------|-------------|
| 137     | 686     | -          | 3       | 1       | 5.00 GiB | Delayed | 87m 59s     |
| **137** | **411** | **411**    | 3       | 1       | 6.00 GiB | Bags    | **84m 01s** |
| 137     | 411     | 822        | 3       | 1       | 6.00 GiB | Bags    | 87m 47s     |
| 137     | 411     | 1233       | 3       | 1       | 6.00 GiB | Bags    | 87m 59s     |
|         |         |            |         |         |          |         |             |
| 137     | 686     | -          | 4       | 1       | 5.00 GiB | Delayed | 76m 27s     |
| 137     | 411     | -          | 4       | 1       | 6.00 GiB | Delayed | 76m 29s     |
| 137     | 412     | 137        | 4       | 1       | 6.00 GiB | Bags    | 76m 25s     |
| 137     | 411     | 274        | 4       | 1       | 6.00 GiB | Bags    | 78m 04s     |
| **137** | **411** | **548**    | 4       | 1       | 6.00 GiB | Bags    | **75m 29s** |
|         |         |            |         |         |          |         |             |
| 137     | 686     | -          | 5       | 1       | 5.00 GiB | Delayed | 67m 37s     |
| 137     | 411     | -          | 5       | 1       | 6.00 GiB | Delayed | 71m 35s     |
| 137     | 411     | 685        | 5       | 1       | 6.00 GiB | Bags    | 76m 18s     |
| 137     | 411     | 1370       | 5       | 1       | 6.00 GiB | Bags    | 80m 14s     |


In [9]:
# Build one chain of delayed calls per key with a plain for loop.
# For a larger number of files it is recommended to use dask collections
# instead of a for loop **
# Reference:
#   https://docs.dask.org/en/latest/delayed-best-practices.html#avoid-too-many-tasks
results = []
for key in KEYS:
    data_tuple_0 = delayed(get_data)(key)
    # NOTE(review): the commented-out lines below presumably belong to the
    # "Delayed (data)" benchmark variant, where intermediate results are
    # wrapped in delayed as well -- confirm before removing.
    # data_tuple_0 = delayed(data_tuple_0)
    data_tuple_1 = delayed(run_pipeline)(data_tuple_0, fix_shifts=True, verbose=False)
    # data_tuple_1 = delayed(data_tuple_1)
    result_df = delayed(run_loss_analysis)(data_tuple_1)
    results.append(result_df)

# Delayed concatenation of all per-key results; it is only evaluated when
# df_reports is handed to client.compute().
df_reports = delayed(pd.concat)(results)
# summary_table = client.compute(df_reports)
# summary_df = summary_table.result()
# summary_df

In [10]:
from time import strftime

# Make sure the output directory exists before the computation starts, so that
# neither the Dask performance report nor the CSV export fails on a missing
# folder after the work is already done.
os.makedirs("../results", exist_ok=True)

timestamp = strftime("%Y%m%d-%H%M%S")
with performance_report(f"../results/dask_report_{timestamp}-delayed-{len(KEYS)}.html"):
    # Submit the delayed graph to the cluster and block until the result is in.
    summary_table = client.compute(df_reports)
    result_df = summary_table.result()
    # The CSV name now mirrors the HTML report name; the stray space in
    # "- delayed-" has been removed.
    result_df.to_csv(f"../results/summary_report_{timestamp}-delayed-{len(KEYS)}.csv")
    display(result_df)

Unnamed: 0,key_field_0,get_data error,run_pipeline error,run_pipeline_report error,run_loss_analysis error,run_loss_analysis_report error,length,capacity,sampling,quality score,...,runtimes,degradation rate [%/yr],deg rate lower bound [%/yr],deg rate upper bound [%/yr],total energy loss [kWh],degradation energy loss [kWh],soiling energy loss [kWh],capacity change energy loss [kWh],weather energy loss [kWh],system outage loss [kWh]
0,001C4B0008A5,No error,No error,No error,No error,No error,1.082192,3.0012,15,0.918987,...,19.309776,-9.07312,-13.4601,-5.799462,-1818565.0,-342414.289053,-88330.127023,-33.15175,-885054.216869,-502732.775324
0,001C4B001069,No error,No error,No error,No error,No error,1.090411,2.87535,15,0.932161,...,19.498287,-5.903807,-11.524199,-2.543603,-1404357.0,-206345.832425,-100242.176478,-153.420931,-748116.614305,-349499.37656
0,0022F2006723,No error,No error,No error,The length of data is less than or equal to 1 ...,Loss analysis is not performed,0.613699,6.34,15,0.857143,...,14.40273,,,,,,,,,
0,0022F20139C6,No error,No error,No error,No error,No error,1.479452,6.7759,15,0.374074,...,13.091212,-1.65481,-3.672855,0.622904,-14978.4,-193.210388,-43.117523,-0.776796,-1533.198447,-13208.096041


# ~Client Futures~
> The futures API offers a work submission style that can easily emulate the map/reduce paradigm. If that is familiar to you then futures might be the simplest entrypoint into Dask.
> The other big benefit of futures is that the intermediate results, represented by futures, can be passed to new tasks without having to pull data locally from the cluster. New operations can be setup to work on the output of previous jobs that haven’t even begun yet.

From Dask Futures Documentation

Client futures start computing the submitted functions and data immediately. This approach is recommended for computing a small number of files (fewer than 100).
When running `client.map` on a large number of files, the `run_loss_analysis` function takes far longer than its actual computation time for some files: 14-17 minutes of loss-analysis runtime were observed. This drastically affects the compute time for large runs.

In [36]:
# Scatter the KEYS to the workers up front.
# NOTE(review): per the original note, a worker then sticks to the keys
# assigned to it unless work stealing is enabled in the scheduler -- confirm
# against the scheduler configuration.
scattered_keys = client.scatter(KEYS)
# Map the three stages onto the client so the assigned KEYS are computed on the
# workers; each client.map call consumes the futures of the previous stage.
dfs = client.map(get_data, scattered_keys)
df_run = client.map(run_pipeline, dfs, fix_shifts=True, verbose=False)
df_results = client.map(run_loss_analysis, df_run)
# result_dfs = client.gather(df_results)
# pd.concat(result_dfs)

In [37]:
from time import strftime

# Make sure the output directory exists before gathering, so that neither the
# Dask performance report nor the CSV export fails on a missing folder after
# the work is already done.
os.makedirs("../results", exist_ok=True)

timestamp = strftime("%Y%m%d-%H%M%S")
with performance_report(f"../results/dask_report_{timestamp}-futures-{len(KEYS)}.html"):
    # Block until every future has finished and pull the dataframes locally.
    result_dfs = client.gather(df_results)
    result_df = pd.concat(result_dfs)
    result_df.to_csv(f"../results/summary_report_{timestamp}-futures-{len(KEYS)}.csv")
    display(result_df)

Unnamed: 0,key_field_0,get_data error,run_pipeline error,run_pipeline_report error,run_loss_analysis error,run_loss_analysis_report error,length,capacity,sampling,quality score,...,runtimes,degradation rate [%/yr],deg rate lower bound [%/yr],deg rate upper bound [%/yr],total energy loss [kWh],degradation energy loss [kWh],soiling energy loss [kWh],capacity change energy loss [kWh],weather energy loss [kWh],system outage loss [kWh]
0,001C4B0008A5,No error,No error,No error,No error,No error,1.082192,3.00120,15.0,0.918987,...,16.570406,-9.140945,-12.980815,-5.728180,-1.822164e+06,-344952.706808,-89266.826271,-223.317278,-884879.306917,-502842.306912
0,001C4B001069,No error,No error,No error,No error,No error,1.090411,2.87535,15.0,0.932161,...,16.984927,-5.903807,-11.524199,-2.543603,-1.404356e+06,-206345.889606,-100242.177508,-153.543159,-748114.816991,-349499.380152
0,0022F2006723,No error,No error,No error,The length of data is less than or equal to 1 ...,Loss analysis is not performed,0.613699,6.34000,15.0,0.857143,...,11.181610,,,,,,,,,
0,0022F20139C6,No error,No error,No error,No error,No error,1.479452,6.77590,15.0,0.374074,...,10.707604,-1.654810,-3.672855,0.622904,-1.497840e+04,-193.210388,-43.117523,-0.776796,-1533.198447,-13208.096041
0,0022F20186EC,No error,No error,No error,The length of data is less than or equal to 1 ...,Loss analysis is not performed,0.682192,5.62400,15.0,0.983936,...,5.789715,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,ZT170585000441C0276,No error,No error,No error,No error,No error,1.942466,0.27030,5.0,0.908322,...,19.352147,-18.803997,-20.596593,-15.683634,-7.485286e+02,-216.453747,-27.359649,-150.289736,-221.810568,-132.614905
0,ZT171685000441C1074,No error,No error,No error,No error,No error,1.917808,0.25240,5.0,0.930000,...,17.725450,-24.833467,-25.633263,-19.988107,-7.411235e+02,-262.670099,-22.588806,-186.742226,-176.848742,-92.273657
0,ZT171685000441C1081,No error,No error,No error,No error,No error,1.923288,0.29000,5.0,0.924501,...,20.585983,-1.897155,-3.832543,-0.381713,-3.945247e+02,-22.642252,-38.949090,-0.267123,-229.551934,-103.114331
0,ZT172785000441C1366,No error,No error,No error,No error,No error,1.676712,0.26080,5.0,0.805556,...,23.290005,-5.232922,-6.840476,-1.933811,-4.442103e+02,-40.992635,-22.700752,-0.007566,-159.638931,-220.870421


# Runtime Graph

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Manually recorded benchmark figures, one entry per tested partition count.
# The mapping is deliberately NOT called `dict`: that name would shadow the
# builtin for the rest of the kernel session and break any later dict(...) call.
# NOTE(review): units are not stated here; presumably seconds, as in the
# markdown tables -- confirm.
partition_benchmarks = {
    "Partitions": [8, 12, 20, 80, 160, 240, 320, 480, 720],
    "Duration": [720, 754, 740, 739, 700, 665, 702, 678, 753],
    "Min Runtime": [5.19, 7.84, 5.83, 5.35, 5.96, 6.75, 6.36, 6.56, 7.29],
    "Max Runtime": [51.26, 52.62, 51.39, 48.17, 46.72, 50.30, 50.12, 50.79, 50.96],
}
# One column per metric, one row per partition count.
df = pd.DataFrame.from_dict(partition_benchmarks)

In [None]:
# Total duration for every tested partition count, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(df["Partitions"], df["Duration"], marker="x")
ax.set_title("Durations Comparisons")
ax.set_xlabel("Partitions")
ax.set_ylabel("Durations")
ax.grid(True)
plt.show()

In [None]:
# Minimum and maximum runtime for every tested partition count.
# The title and y-label used to be copied from the duration plot; they now
# describe what is drawn, and a legend tells the two lines apart.
fig, ax = plt.subplots()
ax.plot(df["Partitions"], df["Max Runtime"], marker="x", label="Max Runtime")
ax.plot(df["Partitions"], df["Min Runtime"], marker="x", label="Min Runtime")
ax.set_title("Min/Max Runtime Comparisons")
ax.set_xlabel("Partitions")
ax.set_ylabel("Runtime")
ax.legend()
ax.grid(True)