In [1]:
# Echo the value of every top-level expression statement in a cell (not only the last one),
# so intermediate results are shown without wrapping them in print().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# matplotlib config: render figures inline in the notebook and apply the 'ggplot' style sheet
%matplotlib inline
plt.style.use('ggplot')

# Pandas display setup.
# Every option is addressed by its fully qualified 'display.*' name. The shorthand
# forms ('max_colwidth', 'expand_frame_repr') are resolved by pattern matching and
# can stop working if pandas adds another option with a similar name.
pd.set_option('display.max_colwidth', 200)        # max characters shown per cell value
pd.set_option('display.max_rows', 500)            # max rows rendered before truncation
pd.set_option('display.max_columns', 99)          # max columns rendered before truncation
pd.set_option('display.expand_frame_repr', True)  # allow wide frames to wrap across lines

### Environment

In [2]:
import os

In [3]:
# name of the key column the comparison is made on (referenced later as '${HADRON_DIFF_ON}')
os.environ['HADRON_DIFF_ON'] = 'target'

In [4]:
# File locations for the two sides of the comparison ("origin" and "other").
# Both sides point at the same header-mapping file.
# NOTE(review): presumably these variables are consumed by the ds_discovery components
# created further down - confirm against the component configuration.
os.environ.update({
    'HADRON_DIFF_CLEANER_ORIGIN': 'source/origin_sample.csv',
    'HADRON_DIFF_HEADER_MAP_ORIGIN': 'source/mapping.csv',
    'HADRON_DIFF_CLEANER_OTHER': 'source/other_sample.csv',
    'HADRON_DIFF_HEADER_MAP_OTHER': 'source/mapping.csv',
})

### Create the Component Instance

In [5]:
from ds_discovery import Wrangle

In [6]:
# create the Wrangle component instance for the 'difference' task via from_env()
# NOTE(review): has_contract=False presumably means no pre-existing contract is required - confirm in ds_discovery
wr = Wrangle.from_env('difference', has_contract=False)

In [7]:
# describe the component
wr.setup_bootstrap(
    'Telecoms',
    description='Compare two sources and identify if there are differences',
)

# use the persist contract of the 'profiling_origin' component as this component's source
origin_contract = Wrangle.from_env('profiling_origin').get_persist_contract()
wr.set_source_contract(origin_contract, template_aligned=True)

# persist the detail result using a 'hadron_difference_' CSV file pattern stamped by hours
out = wr.pm.file_pattern(
    name='detail',
    prefix='hadron_difference_',
    file_type='csv',
    stamped='hours',
)
wr.set_persist(uri_file=out)

### Add the URI to the Target

In [8]:
# register the second data set under the connector name 'dp_other',
# using the persist URI of the 'profiling_other' component
other_uri = Wrangle.from_env('profiling_other').get_persist_uri()
wr.add_connector_uri('dp_other', other_uri, template_aligned=True)

In [9]:
# register an additional persist connector named 'summary',
# with its own 'hadron_difference_' CSV file pattern stamped by hours
out = wr.pm.file_pattern(
    name='summary',
    prefix='hadron_difference_',
    file_type='csv',
    stamped='hours',
)
wr.add_connector_persist('summary', out)

In [None]:
# load both data sets so they can be inspected for observable differences:
# `df` comes from the component's source connector, `other` from the 'dp_other' connector
df = wr.load_source_canonical()
other = wr.load_canonical('dp_other')

### Create Component Intent Actions

In [11]:
# calculate the differences between the loaded source and the 'dp_other' connector
# NOTE(review): '${HADRON_DIFF_ON}' is presumably substituted from the environment variable
# set earlier in the notebook - confirm.
# NOTE(review): `df` is rebound to the result, so re-running this cell feeds the previous
# output back in as input; reload the source first if the cell needs to be repeated.
df = wr.tools.model_difference(
    df,
    'dp_other',
    on_key='${HADRON_DIFF_ON}',
    drop_no_diff=True,
    ordered=True,
    column_name='difference',
)

### Run the Task

In [13]:
# run the component task end to end via run_component_pipeline()
# NOTE(review): presumably this executes the intent recorded above and writes to the persist connectors - confirm
wr.run_component_pipeline()

In [14]:
# load the persisted result back so the outcome of the run is displayed as the cell output
wr.load_persist_canonical()

Unnamed: 0,target,C1,D1,E1
0,A,1,0,0
1,B,0,0,1
2,D,1,1,0
3,G,1,1,1
