In [1]:
# saves you having to use print as all exposed variables are printed in the cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# core libraries
import pandas as pd
import os
from pathlib import Path

%reload_ext autoreload
%autoreload 2
# for cleaning and discovery
from ds_discovery import Transition
os.environ['DSTU_WORK_PATH'] = Path(os.environ['PWD']).as_posix()

import ds_discovery
print('DTU: {}'.format(ds_discovery.__version__))

DTU: 1.09.020


# Accelerated Machine Learning
## Transitioning Contract

In [13]:
# create or retrieve the named Transition instance
tr = Transition('synthetic')
# echo the instance's version attribute as cell output
tr.version

'v0.00'

### Data Source
If loading from the local default location, just pass the resource. Example:<br>
`tr.set_source_contract(resource='synthetic.csv', sep=',', encoding='latin1', load=False)`

In [19]:
# set the source contract
# NOTE(review): every connector argument below is an empty-string placeholder;
# presumably these need real values before the source can be loaded (see the
# example call in the markdown above) - confirm against set_source_contract.
tr.set_source_contract(resource='', source_type='', location='', module_name='', handler='', load=False)
# display the source report for the contract just set
tr.report_source()

### Retrieve & Observations

In [59]:
# load the source canonical
# (df is the object handed to the report and cleaner calls in the cells below)
df = tr.load_source_canonical()

In [8]:
# report on the freshly loaded source canonical, with styled output requested
tr.canonical_report(df, stylise=True)

Unnamed: 0,Attribute,dType,% Nulls,Count,Unique,Observations


In [61]:
# add a general note; no label argument is given for this one
tr.add_notes(text='The file is a synthetic data file created for this demonstration')

In [62]:
# notes filed under the 'source' label, added in the order listed
source_notes = (
    'This was generated using the Discovery Behavioral Synthetic Data Generator',
    'The script to rerun the data generation can be found in the synthetic scripts folder',
)
for note in source_notes:
    tr.add_notes(label='source', text=note)

In [63]:
# (attribute, note) pairs recording why each attribute is treated as it is;
# added in the order listed
attribute_notes = [
    ('null', "Here for demo of removal of nulls"),
    ('weight_cat', "Demonstration of removal of columns with predominant values"),
    ('weight_cat', "the value 'A' is over 95% predominant"),
    ('start', "changing this to start_date so it being a date is obvious"),
]
for attr_name, note_text in attribute_notes:
    tr.add_attribute_notes(attribute=attr_name, text=note_text)

In [9]:
# display the notes report, with styled output requested
tr.report_notes(stylise=True)

Unnamed: 0,label,date,text


------------
### Selection, Filter and Typing

In [65]:
# Auto-clean the headers (mapping 'start' to 'start_date') and hand the result
# to set_cleaner.
tr.set_cleaner(
    tr.clean.auto_clean_header(
        df,
        rename_map={'start': 'start_date'},
        inplace=True,
    )
)

In [66]:
# Auto-remove columns using the null and predominance thresholds below, then
# hand the result to set_cleaner.
tr.set_cleaner(
    tr.clean.auto_remove_columns(
        df,
        null_min=0.99,
        predominant_max=0.90,
        inplace=True,
        nulls_list=[''],
    )
)

In [67]:
# Auto-type categories using the unique/null limits below, then hand the
# result to set_cleaner.
tr.set_cleaner(
    tr.clean.auto_to_category(
        df,
        unique_max=20,
        null_max=0.7,
        inplace=True,
    )
)

In [68]:
# Typing categories: explicitly type the 'gender' and 'profession' headers,
# then hand the result to set_cleaner.
tr.set_cleaner(
    tr.clean.to_category_type(
        df,
        headers=['gender', 'profession'],
        inplace=True,
    )
)

In [69]:
# Typing dates: type the 'start_date' header, then hand the result to
# set_cleaner.
tr.set_cleaner(
    tr.clean.to_date_type(
        df,
        headers='start_date',
        inplace=True,
    )
)

In [70]:
# Typing booleans: type the 'online' header with the map {1: True}, then hand
# the result to set_cleaner.
tr.set_cleaner(
    tr.clean.to_bool_type(
        df,
        bool_map={1: True},
        headers='online',
        inplace=True,
    )
)

In [71]:
# Typing floats: select by dtype 'float' with precision=3, then hand the
# result to set_cleaner.
tr.set_cleaner(
    tr.clean.to_float_type(
        df,
        dtype=['float'],
        precision=3,
        inplace=True,
    )
)

In [72]:
# Typing strings: select by dtype 'object', then hand the result to
# set_cleaner. nulls_list presumably names the values to treat as null
# ('' and 'nan') - confirm against to_str_type.
tr.set_cleaner(
    tr.clean.to_str_type(
        df,
        dtype=['object'],
        nulls_list=['', 'nan'],
        inplace=True,
    )
)

In [10]:
# Cleaners report: presumably lists what the set_cleaner calls in the cells
# above registered - confirm against report_cleaners
tr.report_cleaners()

Unnamed: 0,level,intent,parameters


### Report, Persist and Validation

In [7]:
# Create the Excel data dictionary (to_file=True requests file output)
tr.canonical_report(df, to_file=True)

# persist the Transitioning Contract, to ensure everything held in memory is written to disk
tr.persist()

# save the clean file (df as left by the cleaner cells above)
tr.save_clean_canonical(df)

Unnamed: 0,Attribute,dType,% Nulls,Count,Unique,Observations


In [6]:
# check the save worked by loading back the clean canonical
# (note: this rebinds df to the reloaded object)
df = tr.load_clean_canonical()
# report on the reloaded canonical, without styling
tr.canonical_report(df, stylise=False)

Unnamed: 0,Attribute,dType,% Nulls,Count,Unique,Observations


---------
### Re-Running the Contract Pipeline

In [5]:
# loads the raw canonical, runs the contract pipeline, then saves and returns the clean canonical
# (note: this rebinds df to the returned object)
df = tr.refresh_clean_canonical()
# report on the refreshed canonical, without styling
tr.canonical_report(df, stylise=False)

Unnamed: 0,Attribute,dType,% Nulls,Count,Unique,Observations
