In [1]:
# Display the value of every exposed top-level expression in a cell, not only the
# last one, so explicit print() calls are not needed.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# matplotlib config
%matplotlib inline

---------------------------

## Environment
Environment variables for the dynamic specification of the dataset size in rows, and of its location and name as a URI.

In [3]:
import os

In [4]:
# Clean out any old environment: remove every variable whose name starts with
# 'HADRON' so that settings left over from an earlier run cannot leak into this one.
# Iterate over a snapshot of the names, because mutating a mapping while iterating
# over its live key view can raise RuntimeError.
for key in list(os.environ):
    if key.startswith('HADRON'):
        del os.environ[key]

In [5]:
# HADRON settings for this notebook, applied to the process environment in one step
os.environ.update({
    # domain contracts location and type
    'HADRON_PM_PATH': './hadron/contracts',
    'HADRON_PM_TYPE': 'json',
    # data persist location
    'HADRON_DEFAULT_PATH': './hadron/data',
    # synthetic capability values
    'HADRON_SYNTHETIC_PERSIST_RESOURCE': 'hadron_synthetic_builder.parquet',
    'HADRON_SYNTHETIC_DATA_SIZE': '2000',
})

----------------------
## Synthetic Build
The component capability that builds the synthetic dataset

In [6]:
from ds_discovery import SyntheticBuilder

### create capability instance

In [7]:
# create the SyntheticBuilder instance for the task named 'capability_task'
# NOTE(review): from_env presumably picks up the HADRON_* environment variables set
# earlier, and has_contract=False presumably allows creation when no domain contract
# exists yet - confirm both against the ds_discovery documentation
sb = SyntheticBuilder.from_env('capability_task', has_contract=False)

In [8]:
# the persist target is passed as a '${...}' placeholder naming the
# HADRON_SYNTHETIC_PERSIST_RESOURCE environment variable, not as a literal file name
# NOTE(review): presumably the placeholder is resolved by the library at run time - confirm
sb.set_persist('${HADRON_SYNTHETIC_PERSIST_RESOURCE}')

### synthetic intent actions (tools)

In [9]:
# scratch objects for trying out the synthetic tools: an empty DataFrame to
# collect columns in, and the number of data rows to generate per column
df, size = pd.DataFrame(), 1000

**To show the synthetic tools methods run `sb.tools.__dir__()`** (for reference only and can be deleted)
* `get_*`       methods that return a list of created items (size must be provided)
* `correlate_*` methods that create a list of modified items
* `model_*`     methods that return a modification DataFrame

In [10]:
# list the 'get_' methods of the synthetic tools; the built-in dir() is the public way
# to enumerate an object's attributes and returns the names sorted, so it is used
# here instead of calling the __dir__() dunder directly
[name for name in dir(sb.tools) if name.startswith('get_')]

['get_category',
 'get_datetime',
 'get_dist_bernoulli',
 'get_dist_bounded_normal',
 'get_dist_choice',
 'get_dist_normal',
 'get_distribution',
 'get_intervals',
 'get_number',
 'get_sample',
 'get_selection',
 'get_string_pattern',
 'get_tagged_pattern',
 'get_uuid']

**an example of a basic synthetic data build** (for reference only, delete before building)

In [11]:
# Each call asks the synthetic tools for `size` values and stores them as one column
# of the example frame.
# NOTE(review): column_name presumably labels the recorded intent for that column - confirm
# category drawn from 'M', 'F', 'U' with relative frequencies 6:3:1
df['cat'] = sb.tools.get_category(selection=['M', 'F', 'U'], relative_freq=[6,3,1], size=size, column_name='cat')
# number between -1.0 and 1.0 (float bounds; reported as float64 in the final canonical report)
df['num'] = sb.tools.get_number(from_value=-1.0, to_value=1.0, size=size, column_name='num')
# number between 100 and 999 (integer bounds; reported as int64 in the final canonical report)
df['int'] = sb.tools.get_number(from_value=100, to_value=999, size=size, column_name='int')
# Bernoulli distribution with parameter 0.5 (reported as 0/1 integers in the final canonical report)
df['bool'] = sb.tools.get_dist_bernoulli(0.5, size=size, column_name='bool')
# dates from 2022-12-01 until 2023-03-31, formatted with '%Y-%m-%d'
df['date']  = sb.tools.get_datetime(start='2022-12-01', until='2023-03-31', date_format='%Y-%m-%d', size=size, column_name='date')
# strings following the pattern '(ddd)sddd-ddd'; the final canonical report shows
# values such as '(592) 939-323'
df['object'] = sb.tools.get_string_pattern('(ddd)sddd-ddd', size=size, column_name='object')

In [12]:
# add the synthetic tools methods


### run the intent actions pipeline

In [13]:
# run the component pipeline of intent actions, passing 1_000 as its argument
# NOTE(review): 1_000 is presumably the number of rows to build (the canonical report at
# the end of the notebook shows Count 1000). HADRON_SYNTHETIC_DATA_SIZE is set to '2000'
# earlier but is not read in any visible cell - confirm which size is intended
sb.run_component_pipeline(1_000)

-----------------
## Controller
The master capability that orchestrates the component capabilities into a component service

In [14]:
from ds_discovery import Controller

### create controller instance

In [15]:
# create the Controller instance
# NOTE(review): from_env presumably picks up the HADRON_* environment variables, and
# has_contract=False presumably allows creation when no domain contract exists yet -
# confirm against the ds_discovery documentation
controller = Controller.from_env(has_contract=False)

### register capabilities

In [16]:
# register the synthetic builder task 'capability_task' with the controller under the
# intent level 'synthetic_capability'
# NOTE(review): the meaning of canonical=0 is not visible in this notebook - confirm
# against the Controller intent documentation
controller.intent_model.synthetic_builder(canonical=0, task_name='capability_task', intent_level='synthetic_capability')

### run the service

In [17]:
# run the controller service over the capabilities registered on its intent model
controller.run_controller()

----------------------
## Validate the Build Outcome

In [18]:
# re-create the builder for the same task, this time without has_contract=False;
# note that this rebinds the name `sb`
# NOTE(review): presumably this relies on the contract persisted by the earlier cells - confirm
sb = SyntheticBuilder.from_env('capability_task')

In [19]:
# load the persisted canonical dataset (this rebinds `df`, replacing the example frame)
# and display a per-column report: dtype, % null, % dominant, count, unique values
# and observations
df = sb.load_persist_canonical()
sb.canonical_report(df)

Unnamed: 0,Attributes (6),dType,%_Null,%_Dom,Count,Unique,Observations
0,bool,int64,0.0%,51.6%,1000,2,max=1 | min=0 | mean=0.52 | dominant=1
1,cat,object,0.0%,60.5%,1000,3,Sample: M | F | U
2,date,object,0.0%,1.8%,1000,120,Sample: 2023-03-21 | 2022-12-17 | 2022-12-04 | 2023-02-27 | 2023-03-30
3,int,int64,0.0%,0.6%,1000,599,max=998 | min=100 | mean=534.75 | dominant=609
4,num,float64,0.0%,0.4%,1000,784,"max=0.997 | min=-0.998 | mean=0.02 | dominant=[-0.178, 0.183]"
5,object,object,0.0%,0.1%,1000,1000,Sample: (592) 939-323 | (941) 870-385 | (305) 024-067 | (020) 935-389 | (885) 737-871
