In [None]:
from datetime import datetime
import numpy as np
import pandas as pd
import flowcept.analytics as analytics
import flowcept.analytics.plot as flow_plot
from flowcept import TaskQueryAPI

In [None]:
def ingest_mock_data():
    """
    Ingest a fresh copy of the sample workflow so the rest of this notebook has data to query.

    Reads the sample task documents from
    ``../tests/api/sample_data_with_telemetry_and_rai.json`` (relative to the
    current working directory), gives every task a new ``task_id`` and one
    shared, newly generated ``workflow_id``, strips the stored ``_id`` and
    ``timestamp`` fields when present, and inserts the documents through
    Flowcept's DAO.

    Returns
    -------
    str
        The newly generated workflow id shared by all ingested tasks.
    """
    import json
    from uuid import uuid4
    from flowcept import Flowcept

    # This sample data contains a workflow composed of 9 tasks.
    test_data_path = '../tests/api/sample_data_with_telemetry_and_rai.json'
    with open(test_data_path, encoding="utf-8") as f:
        base_data = json.load(f)

    wf_id = str(uuid4())
    docs = []
    for d in base_data:
        new_doc = d.copy()
        # Both fields are optional in the sample file, so neither pop may raise.
        new_doc.pop("_id", None)
        new_doc.pop("timestamp", None)
        new_doc["task_id"] = str(uuid4())
        new_doc["workflow_id"] = wf_id
        docs.append(new_doc)

    # The DAO's return value is deliberately ignored.
    # NOTE(review): a disabled check used to compare it with len(base_data);
    # confirm what the DAO returns before reinstating such a check.
    Flowcept.db._dao().insert_and_update_many_tasks(docs, "task_id")
    return wf_id

In [None]:
# Ingest the sample workflow and keep its id; the queries below filter on `wf_id`.
# Need to run only if this is the first time; otherwise set `wf_id` by hand
# to the id of a workflow that was already ingested (see the next cell).
wf_id = ingest_mock_data()
wf_id

In [None]:
# To reuse a previously ingested workflow instead of ingesting again, uncomment and set its id:
#wf_id = '100faab4-ff4c-4f78-92a7-6f20ec1fad83'

In [None]:
# Query entry point reused by the query cells in the rest of this notebook.
query_api = TaskQueryAPI()

## Very Simple query returning a DataFrame

In [None]:
# Restrict the query to the tasks that belong to the workflow identified by `wf_id`.
_filter = dict(workflow_id=wf_id)
df = query_api.df_query(
    _filter,
    calculate_telemetry_diff=True,
)

In [None]:
# Peek at the first three rows returned by the query.
df.head(3)

## Cleaning DataFrame

In [None]:
# Clean the raw query result with every "keep_*" option switched off and with
# list summing and telemetry aggregation switched on.
cleaned_df = analytics.clean_dataframe(
    df,
    keep_task_id=False,
    keep_non_numeric_columns=False,
    keep_only_nans_columns=False,
    keep_telemetry_percent_columns=False,
    aggregate_telemetry=True,
    sum_lists=True,
)
cleaned_df.head()

In [None]:
# Sort keys: loss first, then parameter count, both ascending.
sort = [
    (column, TaskQueryAPI.ASC)
    for column in (
        "generated.loss",
        "generated.responsible_ai_metadata.params",
    )
]
# Ask for the top k=3 tasks of the workflow under that ordering.
df = query_api.df_get_top_k_tasks(
    filter=_filter,
    sort=sort,
    k=3,
    calculate_telemetry_diff=False,
)
# Show only the columns whose names contain "used." or "generated.".
df.filter(regex='used[.]|generated[.]')

## Query Returning the Top K tasks using quantile thresholds

This query filters values based on quantiles (listing only occurrences with cpu_times below the 50% quantile, i.e., the median) and then sorts by CPU time, loss, and FLOPs.

In [None]:
# Keep only the tasks whose user CPU time is below the 0.5 quantile (the median).
clauses = [("telemetry_diff.process.cpu_times.user", "<", 0.5)]
# Sort keys in priority order, all ascending: user CPU time, loss, FLOPs.
sort = [
    (column, TaskQueryAPI.ASC)
    for column in (
        "telemetry_diff.process.cpu_times.user",
        "generated.loss",
        "generated.responsible_ai_metadata.flops",
    )
]
df = query_api.df_get_tasks_quantiles(
    filter=_filter,
    clauses=clauses,
    sort=sort,
    clean_dataframe=True,
    calculate_telemetry_diff=True,
)
df

## Correlation Analysis

#### Using Pandas' correlation 

In [None]:
# Pairwise correlation between the columns of the quantile-filtered frame
# produced by the previous query (pandas defaults, i.e. Pearson).
# NOTE(review): on pandas >= 2.0 this raises if any non-numeric column is left;
# confirm that clean_dataframe=True removed them, or pass numeric_only=True.
df.corr()

#### Using FlowCept's functions for correlations

In [None]:
# Re-query the whole workflow and clean the result in one step, so the
# correlation helpers below operate on the cleaned frame.
df = analytics.clean_dataframe(
    query_api.df_query(_filter, calculate_telemetry_diff=True),
    sum_lists=True,
    aggregate_telemetry=True,
)

##### All correlations

In [None]:
# Correlations over the cleaned frame, using the helper's default method and threshold.
analytics.analyze_correlations(df)

##### Only correlations >= 0.9 (absolute) and using a different method

In [None]:
# Same helper, now with Spearman correlation and a 0.9 threshold.
analytics.analyze_correlations(df, method='spearman', threshold=0.9)

In [None]:
# Correlate the columns selected by the "generated." pattern against those
# selected by the "used." pattern, with a 0.5 threshold.
analytics.analyze_correlations_between(
    df,
    col_pattern1="generated.",
    col_pattern2="used.",
    threshold=0.5,
)

In [None]:
# Dedicated helper for "used" vs "generated" columns (per its name), with a 0.8 threshold.
analytics.analyze_correlations_used_vs_generated(df, threshold=0.8)

In [None]:
# Dedicated helper for "used" vs "telemetry_diff" columns (per its name), with a 0.8 threshold.
analytics.analyze_correlations_used_vs_telemetry_diff(df, threshold=0.8)

In [None]:
# Dedicated helper for "generated" vs "telemetry_diff" columns (per its name), with a 0.8 threshold.
analytics.analyze_correlations_generated_vs_telemetry_diff(df, threshold=0.8)

In [None]:
# Describe a single column of the cleaned frame: the loss produced by each task.
analytics.describe_col(df, col='generated.loss')

In [None]:
# Describe two columns at once; `col_labels` supplies one display label per
# entry of `cols`, in the same order.
analytics.describe_cols(
    df,
    cols=[
        'generated.loss',
        'generated.responsible_ai_metadata.params',
    ],
    col_labels=[
        'Loss',
        '#Params',
    ],
)

## Plots

In [None]:
# Rebuild the workflow filter and fetch the frame used by the plots,
# asking the query itself to compute telemetry diffs and clean the result.
_filter = dict(workflow_id=wf_id)
df = query_api.df_query(
    _filter,
    calculate_telemetry_diff=True,
    clean_dataframe=True,
    sum_lists=True,
    aggregate_telemetry=True,
)

In [None]:
# Draw a heatmap from the cleaned frame fetched in the previous cell.
flow_plot.heatmap(df)

## Plotting relevant 'candidates' and comparing them with the `query_api.df_get_tasks_quantiles` function.

In [None]:
# Save the cleaned frame as `sample_data.csv` in the current working directory.
df.to_csv('sample_data.csv')

In [None]:
# Column names are defined once and passed to the plot through these variables.
# The quantile query in the next code cell reuses the same variables, so the
# plot and the query cannot drift apart when a column name is edited.
x_col = 'generated.loss'
y_col = 'telemetry_diff.cpu.times_avg.user'
color_col = 'generated.responsible_ai_metadata.params'

# Scatter loss (x) against user CPU time (y), coloured by parameter count,
# with a horizontal line at the 0.5 quantile and the Pareto option disabled.
flow_plot.scatter2d_with_colors(
    df,
    x_col=x_col,
    y_col=y_col,
    color_col=color_col,
    x_label='Loss',
    y_label='User CPU',
    color_label='#Params',
    xaxis_title='Loss',
    yaxis_title='User CPU',
    plot_horizon_line=True,
    horizon_quantile=0.5,
    plot_pareto=False,
)

In [None]:
# Keep the tasks whose y-axis value (user CPU) is at or below the 0.5 quantile.
clauses = [(y_col, "<=", 0.5)]
# Sort keys in priority order, all ascending: y column, x column, colour column.
sort = [(column, TaskQueryAPI.ASC) for column in (y_col, x_col, color_col)]
df = query_api.df_get_tasks_quantiles(
    filter=_filter,
    clauses=clauses,
    sort=sort,
    calculate_telemetry_diff=True,
)
# Show the task id next to the three plotted columns.
df[['task_id', x_col, y_col, color_col]]

### Show everything we captured about that 'good' candidate, highlighted as the blue Pareto-front dot in the plot above.

In [None]:
# Show every column of the row(s) whose task_id equals that of the first row
# of the sorted result. A boolean mask replaces the string-built query
# expression, so ids containing quotes cannot break it, and an empty frame
# yields an empty result instead of an IndexError.
df[df["task_id"].isin(df["task_id"].head(1))]

### Find Interesting Tasks with data that are sensitive according to correlations

In [None]:
# Run the correlation-based search over the tasks of the selected workflow.
result = (
    query_api
    .find_interesting_tasks_based_on_correlations_generated_and_telemetry_data(
        filter=_filter,
    )
)

In [None]:
# Display the (key, value) pairs of the result.
# NOTE(review): keys are presumably task ids (see the commented-out cell below) - confirm.
result.items()

In [None]:
# task_id, res = next(iter(result.items()))
# res

### Finding Tasks with Outlier Data

In [None]:
# Look for outliers among the workflow's tasks (threshold 5) on the cleaned
# frame, keeping `task_id` so each row can still be identified.
tasks_with_outliers = query_api.df_find_outliers(
    filter=_filter,
    outlier_threshold=5,
    calculate_telemetry_diff=True,
    clean_dataframe=True,
    keep_task_id=True,
)

In [None]:
# Start from the two identifying columns, then add every name listed in any
# row's `outlier_columns` entry.
selected_columns = {"task_id", "outlier_columns"}
selected_columns.update(tasks_with_outliers['outlier_columns'].explode())
# Keep only those columns, preserving the frame's original column order.
result_df = tasks_with_outliers.loc[
    :, tasks_with_outliers.columns.isin(selected_columns)
]
result_df