In [1]:
# Show the value of every top-level expression in a cell, not only the last one,
# so intermediate results are visible without explicit print() calls
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# matplotlib config: render figures inline and use the ggplot style sheet
%matplotlib inline
plt.style.use('ggplot')

# pandas display options: wider cells and more rows/columns before truncation
pd.set_option('max_colwidth', 200)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 99)
pd.set_option('expand_frame_repr', True)

# turn off Jedi in the IPython tab-completer
%config Completer.use_jedi = False

### Environment

In [2]:
import os

In [3]:
# Hadron configuration, set in one call.
# NOTE(review): presumably PM_PATH/PM_TYPE control where and in what format the
# component contracts are stored, and DEFAULT_PATH is the fallback data
# location — confirm against the ds_discovery documentation.
os.environ.update({
    'HADRON_PM_PATH': './hadron/contracts',
    'HADRON_PM_TYPE': 'json',
    'HADRON_DEFAULT_PATH': './hadron/data',
})

In [4]:
# location of the parquet file to be profiled; the component below refers to
# it through the '${HADRON_PROFILING_SOURCE_URI}' placeholder
os.environ['HADRON_PROFILING_SOURCE_URI'] = '../services/source/hadron_synth_origin.pq'

--------------------------

### Create the Component Instance

In [5]:
from ds_discovery import Wrangle, Transition

In [6]:
# create the Wrangle component instance for the task named 'profiling_task'
# NOTE(review): has_contract=False presumably allows the instance to be created
# when no saved contract exists yet — confirm
wr = Wrangle.from_env('profiling_task', has_contract=False)

In [7]:
# bootstrap the component and wire up its source and persist connectors
# NOTE(review): 'Telecoms' is presumably the domain label — confirm
wr.setup_bootstrap('Telecoms', description='Produce data profiling for validation and insight')
# the source URI is given as a placeholder rather than a literal path;
# presumably it is resolved from the HADRON_PROFILING_SOURCE_URI environment variable
wr.set_source_uri('${HADRON_PROFILING_SOURCE_URI}')
# the primary output is persisted as a parquet file using the 'profiled' file pattern
file = wr.pm.file_pattern(name='profiled', file_type='parquet')
wr.set_persist(file)

### Add the URI to the Target

In [8]:
# Each profiling report gets its own persist connector: a JSON file pattern
# prefixed 'hadron_profiling_origin_' and stamped with stamped='hours'.

# add the 'quality' report connector
quality = wr.pm.file_pattern(name='quality', prefix='hadron_profiling_origin_', file_type='json', stamped='hours')
wr.add_connector_persist('quality', quality)
# add the 'dictionary' report connector
dictionary = wr.pm.file_pattern(name='dictionary', prefix='hadron_profiling_origin_', file_type='json', stamped='hours')
wr.add_connector_persist('dictionary', dictionary)
# add the 'schema' report connector
schema = wr.pm.file_pattern(name='schema', prefix='hadron_profiling_origin_', file_type='json', stamped='hours')
wr.add_connector_persist('schema', schema)

In [9]:
# load the source data through the component's source connector; this frame is
# the input to the profiling intents defined below
df = wr.load_source_canonical()

### Create Component Intent Actions

In [10]:
# Register one profiling intent per report. The report name doubles as the
# profiling type, the connector it is written to, and the intent's column name.
# NOTE(review): df is rebound on every call, so re-running this cell feeds the
# previous result back in — re-run the load cell first for a clean start.
for report in ('quality', 'dictionary', 'schema'):
    df = wr.tools.model_profiling(df, profiling=report, connector_name=report, column_name=report)

### Run the Task

In [11]:
# run the component task
# NOTE(review): presumably this replays the profiling intents recorded above
# and writes each report to its persist connector — confirm
wr.run_component_pipeline()

### Show the Results

In [12]:
# load and display the report behind the 'quality' connector
wr.load_canonical('quality')

Unnamed: 0,sections,elements,summary
0,timestamp,readable,08 May 2023 05:11 PM
1,timestamp,semantic,2023-05-08 17:11:35
2,score,quality_avg,95%
3,score,usability_avg,90%
4,data_shape,rows,12
5,data_shape,columns,10
6,data_shape,memory,38KB
7,data_type,numeric,0
8,data_type,category,7
9,data_type,datetime,1


In [13]:
# load and display the report behind the 'dictionary' connector
wr.load_canonical('dictionary')

Unnamed: 0,Attributes (10),dType,%_Null,%_Dom,%_Nxt,Count,Unique,Observations
0,bernoulli,bool,0.0,0.75,0.25,12,2,True | False
1,bool,bool,0.0,1.0,0.0,12,1,True
2,cat,category,0.0,0.75,0.167,12,3,Sample: ACTIVE | INACTIVE | PENDING
3,date,datetime64[ns],0.0,0.083,0.083,12,12,max=2023-03-27 00:00:00 | min=2022-12-09 00:00:00 | yr mean= 2023
4,gumbel,float64,0.0,0.083,0.083,12,12,"max=0.266 | min=-0.125 | mean=0.05 | dominant=[-0.125, -0.115]"
5,int,int64,0.0,0.083,0.083,12,12,"max=987 | min=-849 | mean=179.08 | dominant=[-849, -724]"
6,normal,float64,0.0,0.083,0.083,12,12,"max=2.197 | min=-1.17 | mean=0.22 | dominant=[-1.17, -1.0]"
7,num,float64,0.0,0.083,0.083,12,12,"max=4.983 | min=2.058 | mean=4.27 | dominant=[2.058, 3.39]"
8,str,string,0.0,0.083,0.083,12,12,Sample: Claire Jean | Bourton-on-the-Water | Swaffham | Normandia | Paggi
9,unique,int64,0.0,0.083,0.083,12,12,"max=182 | min=14 | mean=78.92 | dominant=[14, 28]"


In [15]:
# preview the first 20 rows of the report behind the 'schema' connector
wr.load_canonical('schema').head(20)

Unnamed: 0,root,section,element,value
0,bernoulli,intent,categories,"[0, 1]"
1,bernoulli,intent,dtype,category
2,bernoulli,params,freq_precision,2
3,bernoulli,patterns,relative_freq,"[75.0, 25.0]"
4,bernoulli,patterns,sample_distribution,"[9, 3]"
5,bernoulli,stats,category_count,2
6,bernoulli,stats,highest_unique,75.0
7,bernoulli,stats,lowest_unique,25.0
8,bernoulli,stats,nulls_percent,0.0
9,bernoulli,stats,sample_size,12
