In [1]:
# Echo the value of every top-level expression in a cell (not only the last one),
# so intermediate results are shown without wrapping them in print().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# matplotlib config: render figures inline in the notebook and use the 'ggplot' style sheet
%matplotlib inline
plt.style.use('ggplot')

# Pandas display setup.
# Option keys are fully qualified ('display.*'): abbreviated keys are resolved by
# pattern matching and raise OptionError if they ever match more than one option.
pd.set_option('display.max_colwidth', 200)        # show up to 200 characters per cell
pd.set_option('display.max_rows', 500)            # print up to 500 rows before truncating
pd.set_option('display.max_columns', 99)          # print up to 99 columns before truncating
pd.set_option('display.expand_frame_repr', True)  # wrap wide frames across several lines

### Environment

In [2]:
import os

In [3]:
# Default data path for the Hadron components
# NOTE(review): presumably the base location for connectors that are not set explicitly — confirm.
os.environ.update(HADRON_DEFAULT_PATH='hadron/data')

In [4]:
# Locations referenced further down through ${...} placeholders:
# the data set to be cleaned and the header-mapping file.
os.environ.update({
    'HADRON_DIFF_CLEANER_OTHER': 'source/other_sample.csv',
    'HADRON_DIFF_HEADER_MAP_OTHER': 'source/mapping.csv',
})

In [5]:
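# Alternative (disabled): read the source and the header mapping from MySQL tables instead of CSV files.
# The user:pass values are placeholders; supply real credentials at run time and never commit them.
# os.environ['HADRON_DIFF_CLEANER_OTHER'] = 'mysql://user:pass@localhost:3306/mydb?table=origin'
# os.environ['HADRON_DIFF_HEADER_MAP_OTHER'] = 'mysql://user:pass@localhost:3306/mydb?table=mapping'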

----------------------------
### Create the Component Instance

In [6]:
from ds_discovery import Transition

In [7]:
# Create the Transition component instance for the 'align_other' task from the environment settings.
# NOTE(review): has_contract=False presumably means no pre-existing contract is required — confirm.
tr = Transition.from_env('align_other', has_contract=False)

In [8]:
# Bootstrap the component with the label 'Telecoms' (presumably the domain — confirm) and a description.
tr.setup_bootstrap('Telecoms', description='Aligns the headers and reinstate nulls')
# The source URI is a ${...} placeholder naming the HADRON_DIFF_CLEANER_OTHER environment variable
# NOTE(review): presumably substituted by the component when the source is read — confirm.
tr.set_source_uri('${HADRON_DIFF_CLEANER_OTHER}')
# Build a parquet file pattern named 'cleaned' and register it as the persist target.
file = tr.pm.file_pattern(name='cleaned', file_type='parquet')
tr.set_persist(file)

### Add the URI to be Mapped

In [9]:
# Register a connector named 'header_map' whose URI comes from HADRON_DIFF_HEADER_MAP_OTHER;
# this connector name is what the header-cleaning step later passes as rename_map.
tr.add_connector_uri('header_map', '${HADRON_DIFF_HEADER_MAP_OTHER}')

In [10]:
# Load the source canonical into df
# NOTE(review): presumably a pandas DataFrame read from the source URI set earlier — confirm.
df = tr.load_source_canonical()

### Create Component Intent Actions

In [11]:
# Apply the cleaning intents to the loaded data: clean the headers using the 'header_map'
# connector as the rename map, then reinstate nulls.
# NOTE(review): df is rebound by each step, so re-running this cell re-applies both to already-cleaned data.
df = tr.tools.auto_clean_header(df, rename_map='header_map')
df = tr.tools.auto_reinstate_nulls(df)

### Run the Task

In [12]:
# Run the component task as a whole
# NOTE(review): presumably loads the source, replays the recorded intents and writes the persist target — confirm.
tr.run_component_pipeline()