Darryl Oatridge, April 2023

### Environment

In [1]:
import os

In [2]:
# Clean out any HADRON* variables left over from earlier runs so the settings
# made in the following cells are the only ones in effect.
# Iterate over a snapshot of the names: removing entries from os.environ while
# walking its live key view is not guaranteed to be safe for a mapping, so the
# explicit list() keeps the loop correct regardless of interpreter internals.
for key in list(os.environ):
    if key.startswith('HADRON'):
        del os.environ[key]

In [3]:
# Base HADRON settings: the property-manager path and type, plus the default
# data path.
os.environ.update({
    'HADRON_PM_PATH': './hadron/contracts',
    'HADRON_PM_TYPE': 'json',
    'HADRON_DEFAULT_PATH': './hadron/data',
})

In [4]:
# Source and header-mapping URIs for the two datasets being compared
# ("origin" and "other"), followed by the key the difference is taken on.
# Both datasets point at the same header mapping file.
cleaner_settings = {
    # origin dataset
    'HADRON_CLEANER_ORIGIN_SOURCE_URI': '../services/source/hadron_synth_origin.pq',
    'HADRON_CLEANER_ORIGIN_HEADER_URI': '../services/source/mapping.csv',
    # other dataset
    'HADRON_CLEANER_OTHER_SOURCE_URI': '../services/source/hadron_synth_other.pq',
    'HADRON_CLEANER_OTHER_HEADER_URI': '../services/source/mapping.csv',
    # difference on key
    'HADRON_DIFFERENCE_ON_KEY': 'identifier',
}
for env_name, env_value in cleaner_settings.items():
    os.environ[env_name] = env_value

### Controller

In [5]:
from ds_discovery import Controller

In [6]:
# Create the Controller through its from_env factory.
# NOTE(review): presumably this picks up the HADRON_* environment variables
# set in the cells above, and has_contract=False means no existing contract is
# required -- confirm against the ds_discovery documentation.
controller = Controller.from_env(has_contract=False)

### Add Components

In [7]:
# Register one Transition task per dataset being compared, each under its own
# intent level.
# NOTE(review): canonical=0 looks like a placeholder rather than real data -- confirm.
for task_name, intent_level in (
    ('align_origin_task', 'cleaning_origin'),
    ('align_other_task', 'cleaning_other'),
):
    controller.intent_model.transition(canonical=0, task_name=task_name, intent_level=intent_level)

In [8]:
# Register the Wrangle task 'difference_task' under the 'difference_report'
# intent level.
# NOTE(review): canonical=0 looks like a placeholder rather than real data -- confirm.
controller.intent_model.wrangle(canonical=0, task_name='difference_task', intent_level='difference_report')

In [9]:
# Register the Wrangle task 'profiling_task' under the 'data_profiling'
# intent level.
# NOTE(review): canonical=0 looks like a placeholder rather than real data -- confirm.
controller.intent_model.wrangle(canonical=0, task_name='profiling_task', intent_level='data_profiling')

### Report

In [10]:
# Display the tasks registered on the controller, one row per intent level.
# Kept as the bare last expression so the notebook renders the returned table.
controller.report_tasks()

Unnamed: 0,level,order,component,task,parameters,creator
0,cleaning_origin,0,Transition,'align_origin_task',[],doatridge
1,cleaning_other,0,Transition,'align_other_task',[],doatridge
2,data_profiling,0,Wrangle,'profiling_task',[],doatridge
3,difference_report,0,Wrangle,'difference_task',[],doatridge


In [11]:
# Run levels for the default run book, in execution order: clean the "other"
# dataset, clean the "origin" dataset, then build the difference report.
# NOTE(review): the meaning of persist/end_source and of source='@' is defined
# by ds_discovery's runbook2dict -- confirm before changing these options.
primary_levels = (
    dict(task='cleaning_other', persist=True, end_source=True),
    dict(task='cleaning_origin', source='@'),
    dict(task='difference_report'),
)
run_book = [controller.runbook2dict(**level) for level in primary_levels]
# No book_name is given, so the controller's default run book name is used.
controller.add_run_book(run_levels=run_book)

In [12]:
# A second, separately named run book holding only the data profiling level,
# so profiling can be run on its own by passing run_book='profiling'.
run_book = [controller.runbook2dict(task='data_profiling')]
controller.add_run_book(book_name='profiling', run_levels=run_book)

In [13]:
# Display the controller's run books and the options set on each run level.
# Kept as the bare last expression so the notebook renders the returned table.
controller.report_run_book()

Unnamed: 0,name,task,persist,end_source,source
0,primary_run_book,cleaning_other,True,True,
1,,cleaning_origin,,,@
2,,difference_report,,,
3,profiling,data_profiling,,,


### Run Controller Pipeline 

In [14]:
# Run the controller without naming a run book and record the run cycle in
# 'hadron_controller_diff_report.csv'.
# NOTE(review): presumably an unnamed run uses the default 'primary_run_book'
# -- the cycle log lists cleaning_other, cleaning_origin and difference_report.
controller.run_controller(run_cycle_report='hadron_controller_diff_report.csv')
# Load the run-cycle log through the 'run_cycle_report' connector; as the last
# expression of the cell it is what the notebook displays.
controller.load_canonical(connector_name='run_cycle_report')

Unnamed: 0,time,text
0,2023-05-08 17:56:18.246726,start run-cycle 0
1,2023-05-08 17:56:18.252798,start task cycle 0
2,2023-05-08 17:56:18.257574,running cleaning_other
3,2023-05-08 17:56:18.606587,"canonical shape is (15, 8)"
4,2023-05-08 17:56:18.608000,running cleaning_origin
5,2023-05-08 17:56:18.854499,"canonical shape is (12, 10)"
6,2023-05-08 17:56:18.858698,running difference_report
7,2023-05-08 17:56:18.935273,"canonical shape is (7, 4)"
8,2023-05-08 17:56:18.936782,tasks complete
9,2023-05-08 17:56:18.938245,end of report


#### Data Profiling

In [15]:
# Data profiling source: point the profiling source URI at the "origin"
# dataset before running the 'profiling' run book.
os.environ['HADRON_PROFILING_SOURCE_URI'] = '../services/source/hadron_synth_origin.pq'

In [16]:
# Run only the 'profiling' run book and record the run cycle in
# 'hadron_controller_profiling_report.csv'.
controller.run_controller(run_book='profiling', run_cycle_report='hadron_controller_profiling_report.csv')
# Load the run-cycle log through the 'run_cycle_report' connector; as the last
# expression of the cell it is what the notebook displays.
controller.load_canonical(connector_name='run_cycle_report')

Unnamed: 0,time,text
0,2023-05-08 17:56:18.973933,start run-cycle 0
1,2023-05-08 17:56:18.975318,start task cycle 0
2,2023-05-08 17:56:18.977801,running data_profiling
3,2023-05-08 17:56:19.957892,"canonical shape is (12, 10)"
4,2023-05-08 17:56:19.959574,tasks complete
5,2023-05-08 17:56:19.960931,end of report


In [17]:
# Data profiling source: re-point the profiling source URI at the "other"
# dataset so the same 'profiling' run book can be run against it.
os.environ['HADRON_PROFILING_SOURCE_URI'] = '../services/source/hadron_synth_other.pq'

In [18]:
# Run the 'profiling' run book again, now against the re-pointed source.
# NOTE(review): this reuses the report file name
# 'hadron_controller_profiling_report.csv' from the earlier profiling run;
# presumably the earlier report is overwritten -- confirm that is intended.
controller.run_controller(run_book='profiling', run_cycle_report='hadron_controller_profiling_report.csv')
# Load the run-cycle log through the 'run_cycle_report' connector; as the last
# expression of the cell it is what the notebook displays.
controller.load_canonical(connector_name='run_cycle_report')

Unnamed: 0,time,text
0,2023-05-08 17:56:19.995034,start run-cycle 0
1,2023-05-08 17:56:19.996700,start task cycle 0
2,2023-05-08 17:56:19.998758,running data_profiling
3,2023-05-08 17:56:20.953589,"canonical shape is (15, 8)"
4,2023-05-08 17:56:20.956386,tasks complete
5,2023-05-08 17:56:20.958115,end of report
