In [1]:
# Display the value of every top-level expression in a cell (not only the last one),
# so intermediate results show up without wrapping them in print().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# matplotlib config: render figures inline in the notebook and use the 'ggplot' style sheet
%matplotlib inline
plt.style.use('ggplot')

# Pandas setup: display options kept in one table and applied in order
pandas_display_options = {
    'max_colwidth': 200,          # widest cell text shown before truncation
    'display.max_rows': 500,      # rows shown before the frame repr is truncated
    'display.max_columns': 99,    # columns shown before the frame repr is truncated
    'expand_frame_repr': True,    # allow wide frames to wrap across several lines
}
for option_name, option_value in pandas_display_options.items():
    pd.set_option(option_name, option_value)

# switch off Jedi for the IPython tab-completer
%config Completer.use_jedi = False

### Environment

In [2]:
import os

In [3]:
# Set HADRON_DEFAULT_PATH for this process to the relative path 'hadron/data'.
# NOTE(review): presumably the from_env() factories used later read this variable as
# their default data location — confirm against the ds_discovery documentation.
os.environ['HADRON_DEFAULT_PATH'] = 'hadron/data'

### Create the Component Instance

In [5]:
from ds_discovery import Wrangle, Transition

In [6]:
# Create the Wrangle component instance for the 'profiling_origin' task from the environment.
# NOTE(review): has_contract=False presumably allows creation when no contract exists yet — confirm.
wr = Wrangle.from_env('profiling_origin', has_contract=False)

In [7]:
# Bootstrap the component with its domain and description.
wr.setup_bootstrap(
    'Telecoms',
    description='Produce data profiling for validation and insight',
)

# Use the persist contract of the 'align_origin' Transition component as this
# component's source contract.
origin_persist_contract = Transition.from_env('align_origin').get_persist_contract()
wr.set_source_contract(origin_persist_contract, template_aligned=True)

# Build the parquet persist file pattern and register it as the persist target.
file = wr.pm.file_pattern(name='persist', file_type='parquet')
wr.set_persist(file)

### Add the URI to the Target

In [8]:
# Register one persist connector per profiling report (quality, dictionary, schema).
# Every report is written as a CSV that shares the same file-name prefix and the
# same 'hours' stamping, so the three registrations differ only by report name.
profiling_prefix = 'hadron_profiling_origin_'
for report_name in ('quality', 'dictionary', 'schema'):
    report_uri = wr.pm.file_pattern(
        name=report_name,
        prefix=profiling_prefix,
        file_type='csv',
        stamped='hours',
    )
    # the connector name matches the report name used by the profiling intents
    wr.add_connector_persist(report_name, report_uri)

In [9]:
# Load the component's source data (as configured by the source contract) into `df`;
# this canonical is the input passed to the profiling intent actions.
df = wr.load_source_canonical()

### Create Component Intent Actions

In [10]:
# Add one profiling intent per report type, in a fixed order. Each report uses its
# own name as the profiling kind, the persist connector name and the column name,
# and `df` is threaded through each call in turn.
for report in ('quality', 'dictionary', 'schema'):
    df = wr.tools.model_profiling(df, profiling=report, connector_name=report, column_name=report)

### Run the Task

In [11]:
# Run the component task by executing the Wrangle component's pipeline
wr.run_component_pipeline()