In [None]:
# Display the result of every top-level expression in a cell (not only the last one),
# which saves having to wrap each exposed variable in print()
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# core libraries
import pandas as pd
import os
from pathlib import Path

%reload_ext autoreload
%autoreload 2
# for cleaning and discovery
from ds_discovery import Transition

# Set where to persist
# DSTU_WORK_PATH is taken from PWD when it is set. PWD is normally missing on
# Windows, so fall back to the current working directory instead of raising KeyError.
# To persist somewhere else, set PWD before running this cell, e.g.
# os.environ['PWD'] = Path('<<Your_Project_path>>').as_posix()
os.environ['DSTU_WORK_PATH'] = Path(os.environ.get('PWD', os.getcwd())).as_posix()

# print the version of the ds_discovery package active in this kernel
import ds_discovery
print('DTU: {}'.format(ds_discovery.__version__))

# Accelerated Machine Learning
## Transitioning Contract

In [None]:
# create or retrieve the Transition instance for the contract named 'synthetic';
# `tr` is used by every following cell
tr = Transition('synthetic')

#### Reset the Contract

In [None]:
# reset the contract held by `tr` so the notebook builds it again from scratch
# NOTE(review): presumably this discards any previously persisted source, cleaner and notes settings - confirm
tr.reset_transition_contracts()

### Data Source
If loading from the local default location, just pass the resource name. For example:<br>
`tr.set_source_contract(resource='synthetic.csv', sep=',', encoding='latin1', load=False)`

In [None]:
# set the source contract
# template of the full call, kept for reference when the source is not a local default file:
# tr.set_source_contract(resource='', source_type='', location='', module_name='', handler='')

# source is the comma-separated, latin1-encoded file 'synthetic_customer.csv'; then show the source report
tr.set_source_contract(resource='synthetic_customer.csv', sep=',', encoding='latin1')
tr.report_source()

### Retrieve & Observations

In [None]:
# load the source canonical defined by the source contract into `df`
# NOTE(review): later cells call df.columns.to_list(), so this is presumably a pandas DataFrame - confirm
df = tr.load_source_canonical()

In [None]:
# show the stylised canonical report for the freshly loaded source data
tr.canonical_report(df, stylise=True)

In [None]:
# add a general note (no label given) describing the data file
tr.add_notes(text='The file is a synthetic data file created for this demonstration')

In [None]:
# add a note under the 'source' label recording how the data was generated
tr.add_notes(label='source', text='This was generated using the Discovery Behavioral Synthetic Data Generator')

In [None]:
# show the stylised report of the notes added so far
tr.report_notes(stylise=True)

------------
### Selection, Filter and Typing

In [None]:
# auto clean headers, renaming 'start' to 'start_date'
# inplace=True applies the change to `df` directly; the call's result is handed to set_cleaner
# NOTE(review): presumably set_cleaner records the step in the contract so it can be replayed later - confirm
tr.set_cleaner(tr.clean.auto_clean_header(df, rename_map={'start': 'start_date'}, inplace=True))

In [None]:
# Each rule is (report column, filter condition, note template). For every rule,
# dynamically capture the attributes the matching filter will remove and, when
# there are any, record them as an 'auto_remove' attribute note.
removal_rules = [
    ('%_Dom', ">0.90", 'With >90% predominance removes {}'),
    ('%_Null', ">0.99", 'With >99% nulls removes {}'),
]
for rule_header, rule_condition, note_template in removal_rules:
    report = tr.canonical_report(
        df, stylise=False, report_header=rule_header, condition=rule_condition
    )['Attribute'].tolist()
    if report:
        tr.add_notes(note_type='attribute', label='auto_remove', text=note_template.format(report))

# auto remove columns, using the same thresholds as the rules above
tr.set_cleaner(
    tr.clean.auto_remove_columns(df, null_min=0.99, predominant_max=0.90, inplace=True, nulls_list=[''])
)

In [None]:
# auto type categories, called with unique_max=20 and null_max=0.7
# NOTE(review): presumably columns with at most 20 distinct values and at most 70% nulls are converted - confirm
tr.set_cleaner(tr.clean.auto_to_category(df, unique_max=20, null_max=0.7, inplace=True))

In [None]:
# Typing Categories: explicitly type the 'gender' and 'profession' columns as categories
tr.set_cleaner(tr.clean.to_category_type(df, headers=['gender', 'profession'], inplace=True))

In [None]:
# Typing Dates: type the 'start_date' column (renamed from 'start' by the header clean) as a date
tr.set_cleaner(tr.clean.to_date_type(df, headers='start_date', inplace=True))

In [None]:
# Type boolean: type the 'online' column as boolean, with bool_map mapping the value 1 to True
# NOTE(review): presumably any value missing from bool_map becomes False - confirm
tr.set_cleaner(tr.clean.to_bool_type(df, bool_map={1: True}, headers='online', inplace=True))

In [None]:
# Type Floats: selects columns by dtype 'float' rather than by header, with precision=3
# NOTE(review): presumably precision is the number of decimal places kept - confirm
tr.set_cleaner(tr.clean.to_float_type(df, dtype=['float'], precision=3, inplace=True))

In [None]:
# Type Strings: selects columns by dtype 'object' and types them as str,
# passing '' and 'nan' as the nulls_list
tr.set_cleaner(tr.clean.to_str_type(df, dtype=['object'], nulls_list=['', 'nan'], inplace=True))

In [None]:
# Cleaners report: show the cleaners set on the contract by the cells above
tr.report_cleaners()

### Bulk upload the Attribute Dictionary

In [None]:
# create or retrieve the separate Transition instance named 'synthetic_customer_dictionary'
# NOTE(review): the next cell loads its clean canonical, so that contract presumably must already have been run - confirm
tr_dict = Transition('synthetic_customer_dictionary')

In [None]:
# get the Transitioned (clean) Canonical of the attribute dictionary and report on it
# NOTE(review): the name `df_ames_dict` looks like a carry-over from another notebook;
# here it holds the synthetic customer dictionary. The report is produced through `tr`, not `tr_dict`.
df_ames_dict = tr_dict.load_clean_canonical()
tr.canonical_report(df_ames_dict, stylise=False)

In [None]:
# bulk upload the dictionary into our notes as note_type 'dictionary':
# the 'attribute' column supplies the note labels and the 'description' column the note text,
# with the current columns of `df` passed as the selection
tr.upload_notes(df_ames_dict, label_header='attribute', text_header='description', note_type='dictionary', selection=df.columns.to_list())

In [None]:
# show the stylised notes report again, after the dictionary upload
tr.report_notes(stylise=True)

### Report, Persist and Validation

In [None]:
# Create the excel data dictionary (to_file=True writes the canonical report out to a file)
tr.canonical_report(df, to_file=True)

# persist Transitioning Contract just to ensure everything in memory is persisted to disk
tr.persist()

# save the clean file: `df` as it stands after the in-place cleaning steps above
tr.save_clean_canonical(df)

In [None]:
# check the save worked by loading back the clean canonical
# note: this rebinds `df` to the reloaded data, replacing the in-memory frame cleaned above
df = tr.load_clean_canonical()
tr.canonical_report(df, stylise=False)

---------
### Re-Running the Contract Pipeline

In [None]:
# loads the raw canonical, run the contract pipeline, save and return the clean canonical
# `df` is rebound to the refreshed result, which is then reported unstyled
df = tr.refresh_clean_canonical()
tr.canonical_report(df, stylise=False)