In [1]:
# Show the value of every exposed expression in a cell (not only the last one),
# which saves wrapping intermediate results in print().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# matplotlib config: inline backend via the IPython magic, and the 'ggplot' style
%matplotlib inline
plt.style.use('ggplot')

# Pandas display setup.
# Every option uses its fully qualified 'display.*' key. The bare short names
# ('max_colwidth', 'expand_frame_repr') are resolved by pattern matching and
# stop resolving uniquely if pandas ever adds a similarly named option.
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 99)
pd.set_option('display.expand_frame_repr', True)

---------------------------------

----------------------
## Environment Variables

In [2]:
import os

In [3]:
# clean out any old environments
# Snapshot the keys with list() before deleting: removing entries while iterating
# the live keys view depends on interpreter-specific behaviour and can raise
# "RuntimeError: dictionary changed size during iteration".
for key in list(os.environ):
    if key.startswith('HADRON'):
        del os.environ[key]

In [4]:
# HADRON_PM_* settings and the default data path, applied in a single update
os.environ.update({
    'HADRON_PM_PATH': './hadron/contracts',
    'HADRON_PM_TYPE': 'json',
    'HADRON_DEFAULT_PATH': './hadron/data',
})

In [5]:
# Source and header-mapping URIs for both cleaners, plus the key the difference
# is taken on, applied in a single update
os.environ.update({
    # Origin
    'HADRON_CLEANER_ORIGIN_SOURCE_URI': '../services/source/hadron_synth_origin.pq',
    'HADRON_CLEANER_ORIGIN_HEADER_URI': '../services/source/mapping.csv',
    # Other
    'HADRON_CLEANER_OTHER_SOURCE_URI': '../services/source/hadron_synth_other.pq',
    'HADRON_CLEANER_OTHER_HEADER_URI': '../services/source/mapping.csv',
    # difference on key
    'HADRON_DIFFERENCE_ON_KEY': 'identifier',
})

---------------------------------

----------------------------
## Create the Cleaner Other Component Instance

In [6]:
from ds_discovery import Transition

In [7]:
# create the Transition instance for the 'align_other_task' task
# NOTE(review): has_contract=False presumably means no pre-existing contract is required -- confirm
tr = Transition.from_env('align_other_task', has_contract=False)

In [8]:
# bootstrap the component, then set its source from the
# HADRON_CLEANER_OTHER_SOURCE_URI environment variable placeholder
tr.setup_bootstrap('Telecoms', description='Aligns the headers and reinstate nulls')
tr.set_source_uri('${HADRON_CLEANER_OTHER_SOURCE_URI}')
# persist target: a parquet file pattern named 'cleaned'
file = tr.pm.file_pattern(name='cleaned', file_type='parquet')
tr.set_persist(file)

### add the URI to be mapped

In [9]:
# register the header mapping file as the 'header_map' connector
tr.add_connector_uri('header_map', '${HADRON_CLEANER_OTHER_HEADER_URI}')

In [10]:
# load the 'other' source set up above
df = tr.load_source_canonical()

### create component intent actions

In [11]:
# apply the cleaning steps in order: clean/rename the headers using the
# 'header_map' connector, reinstate nulls, then run the automatic date conversion
df = tr.tools.auto_clean_header(df, rename_map='header_map')
df = tr.tools.auto_reinstate_nulls(df)
df = tr.tools.auto_to_date(df)

In [12]:
# display the 'other' data after the cleaning steps
df

Unnamed: 0,bool,poly,identifier,float,date,str,cat,int
0,1,27.9024,68,5.402,2023-01-21,Bourton-on-the-Water,INACTIVE,987
1,1,21.311569,277,3.913,2022-12-09,Foxholes,ACTIVE,-14
2,1,29.222761,259,4.819,2023-02-20,Swaffham,ACTIVE,96
3,1,28.118209,286,4.703,2023-01-26,Tam,ACTIVE,806
4,1,29.493409,59,6.325,2023-02-26,Penn,ACTIVE,-643
5,1,29.193856,282,4.816,2023-02-19,Dormansland,ACTIVE,-865
6,1,30.830289,77,4.983,2023-03-27,Garner,INACTIVE,1270
7,1,27.585316,86,4.765,2023-01-14,Paggi,ACTIVE,-223
8,1,16.614564,268,3.258,2023-03-19,Emerald Ridge,INACTIVE,145
9,1,21.350724,28,3.918,2022-12-10,Deer Horn,ACTIVE,540


### run the task

In [13]:
# run the component task for 'align_other_task'
# NOTE(review): presumably loads the source, applies the intent actions and persists the result -- confirm
tr.run_component_pipeline()

----------------------------
## Create the Cleaner Origin Component Instance

In [14]:
# create the Transition instance for the 'align_origin_task' task (rebinds `tr`)
# NOTE(review): has_contract=False presumably means no pre-existing contract is required -- confirm
tr = Transition.from_env('align_origin_task', has_contract=False)

In [15]:
# bootstrap the component, then set its source from the
# HADRON_CLEANER_ORIGIN_SOURCE_URI environment variable placeholder
tr.setup_bootstrap('Telecoms', description='Aligns the headers and reinstate nulls')
tr.set_source_uri('${HADRON_CLEANER_ORIGIN_SOURCE_URI}')
# persist target: a parquet file pattern named 'cleaned'
file = tr.pm.file_pattern(name='cleaned', file_type='parquet')
tr.set_persist(file)

### add the URI to be mapped

In [16]:
# register the header mapping file as the 'header_map' connector
tr.add_connector_uri('header_map', '${HADRON_CLEANER_ORIGIN_HEADER_URI}')

In [17]:
# load the 'origin' source set up above
df = tr.load_source_canonical()

### create component intent actions

In [18]:
# apply the cleaning steps in order: clean/rename the headers using the
# 'header_map' connector, reinstate nulls, then run the automatic date conversion
df = tr.tools.auto_clean_header(df, rename_map='header_map')
df = tr.tools.auto_reinstate_nulls(df)
df = tr.tools.auto_to_date(df)

### run the task

In [19]:
# run the component task for 'align_origin_task'
# NOTE(review): presumably loads the source, applies the intent actions and persists the result -- confirm
tr.run_component_pipeline()

---------------------------------

--------------------
## Create the Difference Component Instance

In [20]:
from ds_discovery import Wrangle

In [21]:
# create the Wrangle instance for the 'difference_task' task
# NOTE(review): has_contract=False presumably means no pre-existing contract is required -- confirm
wr = Wrangle.from_env('difference_task', has_contract=False)

In [22]:
# bootstrap the component and use the persist contract of 'align_origin_task'
# (the cleaned origin output) as this component's source
wr.setup_bootstrap('Telecoms', description='Compare two sources and identify if there are differences')
wr.set_source_contract(Transition.from_env('align_origin_task').get_persist_contract(), template_aligned=True)

# primary persist target: a csv file pattern with the 'hadron_difference_' prefix
# NOTE(review): this is named 'detail' (csv) while the 'detail' connector added
# in a later cell also uses name='detail' (json) -- confirm the naming is intended
out = wr.pm.file_pattern(name='detail', prefix='hadron_difference_', file_type='csv', stamped='hours')
wr.set_persist(uri_file=out)

### add the URI to be compared with the source

In [23]:
# add the persist contract of 'align_other_task' (the cleaned 'other' output)
# as the 'align_other' connector to be compared against the source
wr.add_connector_contract('align_other', Transition.from_env('align_other_task').get_persist_contract(), template_aligned=True)

In [24]:
# add one persist connector per difference report (summary, detail, unmatched);
# all share the 'hadron_difference_' prefix and stamped='hours', only the
# name and file type vary
for connector_name, file_type in (('summary', 'csv'), ('detail', 'json'), ('unmatched', 'csv')):
    out = wr.pm.file_pattern(name=connector_name, prefix='hadron_difference_', file_type=file_type, stamped='hours')
    wr.add_connector_persist(connector_name, out)

In [25]:
# load both sides to see if there are observable differences
# (`other` is not passed on in the visible code: model_difference is given the
# 'align_other' connector name rather than this variable)
df = wr.load_source_canonical()
other = wr.load_canonical('align_other')

### create component intent actions

In [26]:
# run the method that calculates differences between the source and the
# 'align_other' connector, keyed on the HADRON_DIFFERENCE_ON_KEY placeholder;
# the summary, unmatched and detail connectors are passed by name
df = wr.tools.model_difference(
    df,
    other='align_other',
    on_key='${HADRON_DIFFERENCE_ON_KEY}',
    drop_zero_sum=True,
    column_name='difference',
    summary_connector='summary',
    unmatched_connector='unmatched',
    detail_connector='detail',
)

### run the task

In [27]:
# run the component task for 'difference_task'
# NOTE(review): presumably loads the source, applies the intent actions and persists the result -- confirm
wr.run_component_pipeline()

---------------------------------

## Show The Result

In [28]:
# load and display the primary persisted output of the difference task
wr.load_persist_canonical()

Unnamed: 0,identifier,int,float,cat
0,55,0,0,1
1,59,0,1,0
2,68,0,1,0
3,77,1,0,0
4,82,0,1,0
5,86,1,1,0
6,91,0,1,0


In [29]:
# load and display the 'detail' connector output
wr.load_canonical('detail')

Unnamed: 0,identifier,int_x,int_y,float_x,float_y,cat_x,cat_y
0,55,-,-,-,-,PENDING,WAITING
1,59,-,-,4.847,6.325,-,-
2,68,-,-,4.68,5.402,-,-
3,77,692,1270,-,-,-,-
4,82,-,-,4.01,4.428,-,-
5,86,979,-223,4.646,4.765,-,-
6,91,-,-,4.976,4.367,-,-


In [30]:
# load and display the 'summary' connector output
wr.load_canonical('summary')

Unnamed: 0,Attribute,Summary
0,cat,1
1,float,5
2,int,2


In [31]:
# load and display the 'unmatched' connector output
wr.load_canonical('unmatched')

Unnamed: 0,identifier,found_in
0,159,left_only
1,182,left_only
2,259,right_only
3,268,right_only
4,277,right_only
5,282,right_only
6,286,right_only
