In [1]:
# Display the value of every top-level expression in a cell (not only the last one),
# which saves having to wrap intermediate results in print()
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# core libraries
import pandas as pd
import os
from pathlib import Path

%reload_ext autoreload
%autoreload 2
# for cleaning and discovery
from ds_discovery import Transition

# Set the environment working path as the root of the Jupyter instance.
# PWD is only exported by POSIX shells, so fall back to the kernel's current working
# directory when it is missing (e.g. on Windows) instead of failing with a KeyError.
os.environ['DSTU_WORK_PATH'] = Path(os.environ.get('PWD', os.getcwd())).as_posix()


ImportError: cannot import name 'Transition'

# Accelerated Machine learning
## Transitioning Contract

In [2]:
# create or retrieve the Transition instance named 'ames'; every `tr.` call in the
# cells below works against this instance's contract
tr = Transition('ames')

#### Reset the Contract

In [3]:
# reset the transition contracts held by this instance
# NOTE(review): presumably this discards settings persisted by earlier runs so the
# cells below rebuild the contract from scratch — confirm against ds_discovery
tr.reset_transition_contracts()

### Data Source
If loading from the local default location, just pass the resource name. Example:<br>
`tr.set_source_contract(resource='synthetic.csv', sep=',', encoding='latin1', load=False)`

In [4]:
# register where the raw data lives and how it should be read; load=False means
# the file is only described here and is read later by a separate load call
tr.set_source_contract(
    resource='ames_housing.csv',
    sep=',',
    encoding='latin1',
    load=False,
)
# show the resulting source contract settings
tr.report_source()

Unnamed: 0,param,values
0,resource,ames_housing.csv
1,source_type,csv
2,location,/Users/doatridge/code/projects/prod/discovery-transitioning-utils/jupyter/working/data/0_raw
3,module_name,ds_discovery.handlers.pandas_handlers
4,handler,PandasHandler
5,modified,
6,sep,","
7,encoding,latin1


### Load Source Canonical
Load the source canonical and add notes describing the source.

In [5]:
# load the source canonical (the raw data described by the source contract)
# NOTE(review): `df` is later modified in place by the cleaner cells (inplace=True),
# so re-run this cell to get back to the raw data
df = tr.load_source_canonical()

In [6]:
# record a general note on the purpose of the dataset; with no label given it is
# reported under section 'overview', label 'notes'
tr.add_notes(text='Ames housing data to predict sales prices and practice feature engineering, RFs, and gradient boosting')

In [7]:
# record the provenance of the data under the 'source' label
tr.add_notes(label='source', text='Kaggle: https://www.kaggle.com/c/house-prices-advanced-regression-techniques')

### Create report
Create the report, including the **_Next Dominant_** element column, which allows us to assess the true dominance of the primary element.

In [10]:
# full report on the raw canonical, with the next-dominant column included;
# report_header and condition are left empty, i.e. no filter expression is passed
tr.canonical_report(
    df,
    inc_next_dom=True,
    report_header='',
    condition='',
)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,%_Nxt,Count,Unique,Observations
0,1stFlrSF,int64,0.0%,1.7%,1.1%,1460,753,max=4692 | min=334 | mean=1162.63
1,2ndFlrSF,int64,0.0%,56.8%,0.7%,1460,417,max=2065 | min=0 | mean=346.99
2,3SsnPorch,int64,0.0%,98.4%,0.2%,1460,20,max=508 | min=0 | mean=3.41
3,Alley,object,93.8%,54.9%,45.1%,91,2,Sample: Pave | Grvl
4,BedroomAbvGr,int64,0.0%,55.1%,24.5%,1460,8,max=8 | min=0 | mean=2.87
5,BldgType,object,0.0%,83.6%,7.8%,1460,5,Sample: 1Fam | TwnhsE | 2fmCon
6,BsmtCond,object,2.5%,92.1%,4.6%,1423,4,Sample: TA | Fa | Po
7,BsmtExposure,object,2.6%,67.0%,15.5%,1422,4,Sample: Gd | No | Mn
8,BsmtFinSF1,int64,0.0%,32.0%,0.8%,1460,637,max=5644 | min=0 | mean=443.64
9,BsmtFinSF2,int64,0.0%,88.6%,0.3%,1460,144,max=1474 | min=0 | mean=46.55


### Bulk upload the Ames Attribute Dictionary

In [9]:
# create or retrieve the Transition instance named 'ames_dictionary', which holds
# the Ames attribute dictionary
tr_dict = Transition('ames_dictionary')

In [10]:
# get the clean (already transitioned) canonical of the attribute dictionary
df_ames_dict = tr_dict.load_clean_canonical()
# NOTE(review): the report is produced through `tr`, not `tr_dict`; this looks
# harmless because the frame is passed explicitly, but confirm it is intentional
tr.canonical_report(df_ames_dict, stylise=False)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,attribute,object,0.0,0.15,113,79,Sample: LandContour | MSZoning | OverallCond
1,description,object,0.0,0.009,113,113,Sample: I - Industrial | Low quality finished square feet (all floors...


In [11]:
# show the notes recorded against the attribute-dictionary instance
tr_dict.report_notes()

Unnamed: 0,section,label,date,text
0,overview,addition,2019-06-17 08:16,"Where there are codes, additional breakdown of field type have been included"
1,,notes,2019-06-17 08:16,The file is a dictionary of the Ames Housing attributes
2,,source,2019-06-17 08:16,taken from the Kaggle Ames Housing desciption file
3,,,2019-06-17 08:16,URL: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data


In [12]:
# bulk-load the attribute descriptions as 'dictionary' notes: the 'attribute'
# column supplies the note label and 'description' the note text; the current
# column names of df are passed as the selection filter
tr.upload_notes(
    df_ames_dict,
    label_header='attribute',
    text_header='description',
    note_type='dictionary',
    selection=df.columns.to_list(),
)

In [13]:
# report every note held by the instance, styled and without the date column
tr.report_notes(stylise=True, drop_dates=True)

Unnamed: 0,section,label,text
0,overview,notes,"Ames housing data to predict sales prices and practice feature engineering, RFs, and gradient boosting"
1,,source,Kaggle: https://www.kaggle.com/c/house-prices-advanced-regression-techniques
2,dictionary,1stFlrSF,First Floor square feet
3,,2ndFlrSF,Second floor square feet
4,,3SsnPorch,Three season porch area in square feet
5,,Alley,Type of alley access to property
6,,BldgType,Type of dwelling
7,,BsmtCond,Evaluates the general condition of the basement
8,,BsmtExposure,Refers to walkout or garden level walls
9,,BsmtFinSF1,Type 1 finished square feet


### Select Augmented Knowledge
The notes report can be filtered on section and label, which allows the information on a single attribute to be viewed.

In [14]:
# report only the notes labelled 'SaleType', without the date column
tr.report_notes(labels='SaleType', drop_dates=True)

Unnamed: 0,section,label,text
0,dictionary,SaleType,Type of sale
1,,,WD - Warranty Deed - Conventional
2,,,CWD - Warranty Deed - Cash
3,,,VWD - Warranty Deed - VA Loan
4,,,New - Home just constructed and sold
5,,,COD - Court Officer Deed/Estate
6,,,Con - Contract 15% Down payment regular terms
7,,,ConLw - Contract Low Down payment and low interest
8,,,ConLI - Contract Low Interest
9,,,ConLD - Contract Low Down


------------
### Selection, Filter and Typing

In [15]:
# auto clean the headers of df in place and register the step with the contract
# NOTE(review): assumes the cleaner call returns the intent parameters that
# set_cleaner records — confirm against ds_discovery
tr.set_cleaner(tr.clean.auto_clean_header(df, inplace=True))

#### Add Notes On Auto Remove
Add notes dynamically to record which columns are removed, either because a single value is predominant or because they are almost entirely null.

In [16]:
# Thresholds shared by the audit notes and the auto-remove cleaner below. They are
# defined once so the recorded notes always describe the filter actually applied.
PREDOMINANT_MAX = 0.995  # remove columns where one value exceeds this share
NULL_MIN = 0.95          # remove columns whose null share exceeds this

# (report column to filter on, threshold, note wording) for each removal criterion
auto_remove_checks = [
    ('%_Dom', PREDOMINANT_MAX, 'With >{:g}% predominance removes {}'),
    ('%_Null', NULL_MIN, 'With >{:g}% nulls removes {}'),
]

# dynamically capture what each filter will remove, before the columns are dropped,
# and record it as an 'auto_remove' attribute note
for report_header, threshold, note_text in auto_remove_checks:
    report = tr.canonical_report(df, stylise=False, report_header=report_header,
                                 condition='>{}'.format(threshold))['Attribute'].tolist()
    if report:  # only add a note when the filter actually matches something
        tr.add_notes(note_type='attribute', label='auto_remove',
                     text=note_text.format(threshold * 100, report))

# auto remove columns (in place on df) and register the step with the contract
tr.set_cleaner(tr.clean.auto_remove_columns(df, null_min=NULL_MIN, predominant_max=PREDOMINANT_MAX, inplace=True, nulls_list=['']))
# show the notes just recorded
tr.report_notes(labels='auto_remove', drop_dates=True)

Unnamed: 0,section,label,text
0,attribute,auto_remove,"With >99.5% predominance removes ['Street', 'Utilities']"
1,,,"With >95% nulls removes ['MiscFeature', 'PoolQC']"


In [17]:
# auto type categories (in place on df) and register the step with the contract
# NOTE(review): presumably converts columns with at most `unique_max` distinct values
# and a null share below `null_max` to the category dtype — confirm against ds_discovery
tr.set_cleaner(tr.clean.auto_to_category(df, unique_max=30, null_max=0.998, inplace=True))

In [18]:
# Typing Categories
# tr.set_cleaner(tr.clean.to_category_type(df, headers=['gender', 'profession'], inplace=True))

In [19]:
# Typing Dates 
# tr.set_cleaner(tr.clean.to_date_type(df, headers='start_date', inplace=True))

In [20]:
# Type boolean
# tr.set_cleaner(tr.clean.to_bool_type(df, bool_map={1: True}, headers='online', inplace=True))

In [21]:
# Type Floats: columns are selected by dtype ('float') rather than by header name,
# and the step is registered at contract level 1
tr.set_cleaner(tr.clean.to_float_type(df, dtype=['float'], precision=3, inplace=True), level=1)

In [22]:
# Type int: columns are selected by dtype ('int'); fillna=-1 is passed as the value
# for missing entries, and the step is registered at contract level 1
tr.set_cleaner(tr.clean.to_int_type(df, dtype=['int'], fillna=-1, inplace=True), level=1)

In [23]:
# Type strings: columns are selected by dtype ('object') and the step is registered
# at contract level 1
# NOTE(review): nulls_list=True appears to expand to the library's default list of
# null markers (see the to_str_type row in the cleaners report) — confirm
tr.set_cleaner(tr.clean.to_str_type(df, dtype=['object'], nulls_list=True, inplace=True), level=1)

In [24]:
# Cleaners report: list the cleaning intents registered on the contract so far,
# with their level and parameters
tr.report_cleaners()

Unnamed: 0,level,intent,parameters
0,0.0,auto_clean_header,replace_spaces=_
1,,auto_remove_columns,"null_min=0.95, predominant_max=0.995, nulls_list=['']"
2,,auto_to_category,"null_max=0.998, unique_max=30"
3,1.0,to_float_type,"dtype=['float'], exclude=False, fillna=nan, errors=coerce, precision=3"
4,,to_int_type,"dtype=['int'], exclude=False, fillna=-1, errors=coerce, precision=15"
5,,to_str_type,"dtype=['object'], exclude=False, nulls_list=['NaN', 'nan', 'null', '', 'None']"


### Report, Persist and Validation

In [25]:
# Create the Excel data dictionary; the result is assigned to `_` so that it is not
# echoed as cell output
_ = tr.canonical_report(df, to_file=True)

# persist Transitioning Contract just to ensure everything in memory is persisted to disk
tr.persist()

# save the clean file (df as modified in place by the cleaner cells above)
tr.save_clean_canonical(df)

In [26]:
# check the save worked by loading back the clean canonical
# NOTE(review): this rebinds `df`, so from here on `df` is the persisted clean
# canonical rather than the in-memory frame the cleaners were run against
df = tr.load_clean_canonical()
tr.canonical_report(df, stylise=False)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,1stFlrSF,int64,0.000,0.017,1460,753,max=4692 | min=334 | mean=1162.63
1,2ndFlrSF,int64,0.000,0.568,1460,417,max=2065 | min=0 | mean=346.99
2,3SsnPorch,int64,0.000,0.984,1460,20,max=508 | min=0 | mean=3.41
3,Alley,category,0.938,0.549,91,2,Grvl|Pave
4,BedroomAbvGr,int64,0.000,0.551,1460,8,max=8 | min=0 | mean=2.87
5,BldgType,category,0.000,0.836,1460,5,1Fam|2fmCon|Duplex|Twnhs|TwnhsE
6,BsmtCond,category,0.025,0.921,1423,4,Fa|Gd|Po|TA
7,BsmtExposure,category,0.026,0.670,1422,4,Av|Gd|Mn|No
8,BsmtFinSF1,int64,0.000,0.320,1460,637,max=5644 | min=0 | mean=443.64
9,BsmtFinSF2,int64,0.000,0.886,1460,144,max=1474 | min=0 | mean=46.55


---------
### Re-Running the Contract Pipeline

In [27]:
# load the raw canonical, run the contract pipeline, then save and return the clean canonical
df = tr.refresh_clean_canonical()
# report on the refreshed frame; the output should match the report from the manual run
tr.canonical_report(df, stylise=True)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,1stFlrSF,int64,0.0%,1.7%,1460,753,max=4692 | min=334 | mean=1162.63
1,2ndFlrSF,int64,0.0%,56.8%,1460,417,max=2065 | min=0 | mean=346.99
2,3SsnPorch,int64,0.0%,98.4%,1460,20,max=508 | min=0 | mean=3.41
3,Alley,category,93.8%,54.9%,91,2,Grvl|Pave
4,BedroomAbvGr,int64,0.0%,55.1%,1460,8,max=8 | min=0 | mean=2.87
5,BldgType,category,0.0%,83.6%,1460,5,1Fam|2fmCon|Duplex|Twnhs|TwnhsE
6,BsmtCond,category,2.5%,92.1%,1423,4,Fa|Gd|Po|TA
7,BsmtExposure,category,2.6%,67.0%,1422,4,Av|Gd|Mn|No
8,BsmtFinSF1,int64,0.0%,32.0%,1460,637,max=5644 | min=0 | mean=443.64
9,BsmtFinSF2,int64,0.0%,88.6%,1460,144,max=1474 | min=0 | mean=46.55
