In [17]:
# saves you having to use print as all exposed variables are printed in the cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# core libraries
import pandas as pd
import os
from pathlib import Path

%reload_ext autoreload
%autoreload 2
# for cleaning and discovery
from ds_discovery.transition.transitioning import TransitionAgent as Transition

# Set the environment path as the root of the Jupyter instance
os.environ['DTU_CONTRACT_PATH'] = Path(os.environ['PWD'], 'contracts').as_posix()
os.environ['DTU_ORIGIN_PATH'] = Path(os.environ['PWD'], 'data', '0_raw').as_posix()
os.environ['DTU_PERSIST_PATH'] = Path(os.environ['PWD'], 'data', '1_transition').as_posix()


# Accelerated Machine learning
## Transitioning Contract

In [18]:
# create or retrieve the named Transition instance
tr = Transition.from_env('ames_dictionary')

### Data Source
if loading from local default, just pass the resource. example:<br>
`tr.set_source_contract(resource='ames_housing_dictionary.csv', sep=',', encoding='latin1', load=False)`

or a full handler

`tr.set_source_contract(resource='data/synthetic/ames_housing_dictionary.csv', connector_type='dsv',
                       location='ds-discovery', module_name='ds_connectors.handlers.aws_s3_handlers', handler='S3SourceHandler',
                       delimiter=',', encoding='latin1')`


In [19]:
# set the source contract
tr.set_source_contract(resource='ames_housing_dictionary.csv', sep=',', encoding='latin1')
tr.set_persist_contract()

### Retrieve & Observations

In [20]:
# load the source canonical
df = tr.load_source_canonical()

In [21]:
tr.canonical_report(df, stylise=False)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,attribute,object,0.0,0.15,113,79,Sample: BsmtHalfBath | BsmtFinType2 | BsmtFinSF1
1,description,object,0.0,0.009,113,113,Sample: Proximity to various conditions (if more than one is present) |...


In [22]:
tr.add_notes(text='The file is a dictionary of the Ames Housing attributes')

In [23]:
tr.add_notes(label='source', text='taken from the Kaggle Ames Housing desciption file')
tr.add_notes(label='source', text='URL: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data')

tr.add_notes(label='addition', text='Where there are codes, additional breakdown of field type have been included')

In [24]:
tr.report_notes(stylise=True)

Unnamed: 0,section,label,date,text
0,notes,addition,2019-09-19 14:37,"Where there are codes, additional breakdown of field type have been included"
1,,comment,2019-09-19 14:37,The file is a dictionary of the Ames Housing attributes
2,,source,2019-09-19 14:37,taken from the Kaggle Ames Housing desciption file
3,,,2019-09-19 14:37,URL: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data


------------
### Selection, Filter and Typing

In [25]:
# auto clean headers
tr.set_cleaner(tr.clean.auto_clean_header(df, inplace=True))

In [26]:
# Type Integers
tr.set_cleaner(tr.clean.to_str_type(df, dtype=['object'], nulls_list=['', 'nan'], inplace=True))

In [27]:
# Cleaners report
tr.report_cleaners()

Unnamed: 0,level,intent,parameters
0,0.0,auto_clean_header,replace_spaces=_
1,,to_str_type,"dtype=['object'], exclude=False, nulls_list=['', 'nan']"


### Report, Persist and Validation

In [28]:
# Create the excel data dictionary
tr.discover.data_dictionary(condition=)

# persist Transitioning Contract just to ensure everything in memory is persisted to disk
tr.persist()

# save the clean file
tr.save_clean_canonical(df)

TypeError: canonical_report() got an unexpected keyword argument 'to_file'

In [13]:
# check the save worked by loading back the clean canonical
df = tr.load_clean_canonical()
tr.canonical_report(df, stylise=False)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,attribute,object,0.0,0.15,113,79,Sample: 3SsnPorch | FullBath | BsmtFinType2
1,description,object,0.0,0.009,113,113,Sample: 75 - 2-1/2 STORY ALL AGES | Bedrooms above grade (does NOT incl...


---------
### Re-Running the Contract Pipeline

In [14]:
# loads the raw canonical, run the contract pipeline, save and return the clean canonical
df = tr.refresh_clean_canonical()
tr.canonical_report(df, stylise=False)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,attribute,object,0.0,0.15,113,79,Sample: SaleCondition | Condition1 | MiscFeature
1,description,object,0.0,0.009,113,113,Sample: Three season porch area in square feet | 75 - 2-1/2 STORY ALL A...
