In [1]:
# Display the value of every top-level expression in a cell, not only the last one,
# so results can be shown without wrapping them in print()
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# core libraries
import pandas as pd
import os
from pathlib import Path

# (re)load the autoreload extension; mode 2 picks up edits to imported modules
# without having to restart the kernel
%reload_ext autoreload
%autoreload 2
# for cleaning and discovery
from ds_discovery import Transition

# Set the environment working path as the root of the Jupyter instance.
# PWD is inherited from the process that launched Jupyter and is not defined on
# every platform (e.g. Windows), so fall back to the kernel's current working
# directory when it is unset or empty instead of raising KeyError.
os.environ['DSTU_WORK_PATH'] = Path(os.environ.get('PWD') or os.getcwd()).as_posix()


# Accelerated Machine Learning
## Transitioning Contract

In [2]:
# create or retrieve the named Transition instance
tr = Transition('ames_dictionary')
# bare expression: the notebook displays the instance's version string
tr.version

'v0.00'

### Data Source
If loading from the local default location, just pass the resource name. Example:<br>
`tr.set_source_contract(resource='synthetic.csv', sep=',', encoding='latin1', load=False)`

In [3]:
# set the source contract: the CSV resource plus the separator and encoding used to read it
# (load=False presumably defers reading the file; the canonical is loaded explicitly
# in the next cell - TODO confirm)
tr.set_source_contract(resource='ames_housing_dictionary.csv', sep=',', encoding='latin1', load=False)
# display the stored source settings (resource, location, handler, read options) as a check
tr.report_source()

Unnamed: 0,param,values
0,resource,ames_housing_dictionary.csv
1,source_type,csv
2,location,/Users/doatridge/code/projects/prod/discovery-transitioning-utils/jupyter/working/data/0_raw
3,module_name,ds_discovery.handlers.pandas_handlers
4,handler,PandasHandler
5,modified,
6,sep,","
7,encoding,latin1


### Retrieve & Observations

In [4]:
# load the source canonical described by the source contract
# (presumably a pandas DataFrame, given the PandasHandler in the source report - TODO confirm)
df = tr.load_source_canonical()

In [5]:
# per-attribute summary of the raw canonical: dtype, % null, % dominance, count,
# unique values and a sample of observations (stylise=False gives the plain table)
tr.canonical_report(df, stylise=False)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,attribute,object,0.0,0.15,113,79,Sample: Exterior2nd | Electrical | GarageType
1,description,object,0.0,0.009,113,113,Sample: Style of dwelling | Identifies the type of dwelling involved in...


In [6]:
# record a general description of the file; no label is passed, and the notes
# report files this entry under the 'notes' label
tr.add_notes(text='The file is a dictionary of the Ames Housing attributes')

In [7]:
# record where the data came from under the 'source' label
# NOTE(review): 'desciption' in the text below is a typo for 'description'; it is left
# unchanged here because the string is the note content that gets stored
tr.add_notes(label='source', text='taken from the Kaggle Ames Housing desciption file')
tr.add_notes(label='source', text='URL: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data')

# record what was added on top of the original description file under the 'addition' label
tr.add_notes(label='addition', text='Where there are codes, additional breakdown of field type have been included')

In [8]:
# display every recorded note with its section, label, timestamp and text
tr.report_notes(stylise=True)

Unnamed: 0,section,label,date,text
0,overview,addition,2019-06-17 08:16,"Where there are codes, additional breakdown of field type have been included"
1,,notes,2019-06-17 08:16,The file is a dictionary of the Ames Housing attributes
2,,source,2019-06-17 08:16,taken from the Kaggle Ames Housing desciption file
3,,,2019-06-17 08:16,URL: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data


------------
### Selection, Filter and Typing

In [9]:
# auto clean headers and register the step on the contract via set_cleaner
# (inplace=True presumably modifies df directly and returns the parameters that
# set_cleaner records - TODO confirm against the ds_discovery docs)
tr.set_cleaner(tr.clean.auto_clean_header(df, inplace=True))

In [10]:
# Type strings: apply to_str_type to the 'object' dtype columns and register the step
# (nulls_list presumably names the values to be treated as null - TODO confirm)
tr.set_cleaner(tr.clean.to_str_type(df, dtype=['object'], nulls_list=['', 'nan'], inplace=True))

In [11]:
# Cleaners report: list the cleaner intents registered on the contract and their parameters
tr.report_cleaners()

Unnamed: 0,level,intent,parameters
0,0.0,auto_clean_header,replace_spaces=_
1,,to_str_type,"dtype=['object'], exclude=False, nulls_list=['', 'nan']"


### Report, Persist and Validation

In [12]:
# Create the Excel data dictionary: to_file=True writes the canonical report out
# as well as displaying it (the output location is decided by the library)
tr.canonical_report(df, to_file=True)

# persist the Transitioning Contract so that everything held in memory is written to disk
tr.persist()

# save the clean canonical so it can be loaded back without re-running the cleaners
tr.save_clean_canonical(df)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,attribute,object,0.0%,15.0%,113,79,Sample: MiscVal | MasVnrType | YearBuilt
1,description,object,0.0%,0.9%,113,113,Sample: Flatness of the property | Total square feet of basement area | Linear feet of street connec...


In [13]:
# check the save worked by loading back the clean canonical
# (this rebinds df, replacing the in-memory frame with the copy read back from disk)
df = tr.load_clean_canonical()
# the report should match the one produced before saving (samples shown may differ)
tr.canonical_report(df, stylise=False)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,attribute,object,0.0,0.15,113,79,Sample: 3SsnPorch | FullBath | BsmtFinType2
1,description,object,0.0,0.009,113,113,Sample: 75 - 2-1/2 STORY ALL AGES | Bedrooms above grade (does NOT incl...


---------
### Re-Running the Contract Pipeline

In [14]:
# load the raw canonical, run the contract pipeline, then save and return the clean canonical
df = tr.refresh_clean_canonical()
# report on the refreshed canonical to confirm the pipeline reproduces the clean result
tr.canonical_report(df, stylise=False)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,attribute,object,0.0,0.15,113,79,Sample: SaleCondition | Condition1 | MiscFeature
1,description,object,0.0,0.009,113,113,Sample: Three season porch area in square feet | 75 - 2-1/2 STORY ALL A...
