In [8]:
%run ../base_setup.ipynb

Foundation: 1.02.021
Transition: 2.03.046


# Accelerated Machine learning
## Transitioning Contract

In [3]:
# create or retrieve the named Transition instance
tr = Transition.from_remote('synthetic_customer', vertical='client')

#### Reset the Contract

In [4]:
tr.reset_transition_contracts()

### Data Source

In [5]:
# Alternative S3
tr.set_source_contract(resource='data/synthetic/synthetic_customer.csv', connector_type='dsv',
                       location='ds-discovery', module_name='ds_connectors.handlers.aws_s3_handlers', handler='S3SourceHandler',
                       delimiter=',', encoding='latin1')
tr.set_persist_contract()

In [6]:
# set the source contract
# tr.set_source_contract(resource='synthetic_customer.dsv', sep=',', encoding='latin1')
# tr.set_persist_contract()

In [7]:
tr.report_source()

Unnamed: 0,param,Property Source,Data Source,Persist Source
0,connector_name,pm_data_synthetic_customer,origin_connector,persist_connector
1,resource,config_transition_datasynthetic_customer.yaml,data/synthetic/synthetic_customer.csv,transition_synthetic_customer_v0.00.pkl
2,connector_type,yaml,dsv,pickle
3,location,/Users/doatridge/code/projects/prod/discovery-transition-ds/jupyter/working/contracts/synthetic_customer,ds-discovery,/Users/doatridge/code/projects/prod/discovery-transition-ds/jupyter/working/data/1_transition
4,module_name,ds_discovery.handlers.pandas_handlers,ds_connectors.handlers.aws_s3_handlers,ds_discovery.handlers.pandas_handlers
5,handler,PandasPersistHandler,S3SourceHandler,PandasPersistHandler
6,modified,0,0,0
7,kwargs,,"delimiter=',' encoding='latin1'",


### Retrieve & Observations

In [7]:
tr.data_pm.get_connector_handler('origin_connector').supported_types()

['csv', 'dsv', 'pickle', 'parquet']

In [8]:
# load the source canonical
df = tr.load_source_canonical()

In [None]:
tr.discover.analyse_number(df[])

In [9]:
tr.canonical_report(df, stylise=True)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,age,object,15.0%,0.5%,500,423,Sample: 58.623 | 56.213 | 60.139
1,balance,object,0.0%,0.4%,500,496,Sample: 146.3 | 43.86 | 113.42
2,forename,object,0.0%,0.4%,500,499,Sample: Tamara | Ashwin | Ifan
3,gender,object,0.0%,62.2%,500,2,Sample: M | F
4,id,object,0.0%,0.2%,500,500,Sample: CU_1804130 | CU_2047146 | CU_5968492
5,last_login,object,0.0%,0.2%,500,500,Sample: 03-24-19 17:38 | 04-13-19 22:35 | 02-11-19 21:51
6,,object,100.0%,0.0%,500,1,Sample:
7,online,object,0.0%,78.8%,500,2,Sample: 1 | 0
8,profession,object,10.0%,24.2%,500,16,Sample: Assistant Manager | VP Product Management |
9,single cat,object,40.0%,100.0%,500,2,Sample: | A


In [13]:
tr.add_notes(text='The file is a synthetic customer data file created for this demonstration')

In [14]:
tr.add_notes(label='source', text='This was generated using the Discovery Behavioral Synthetic Data Generator')
tr.add_notes(label='source', text='The script to rerun the data generation can be found in the synthetic scripts folder')

In [15]:
tr.add_attribute_notes(attribute='start', text="changing this to start_date so it being a date is obvious")

In [16]:
tr.report_notes(regex='With')

KeyError: "None of [Index(['section'], dtype='object')] are in the [columns]"

<pandas.io.formats.style.Styler at 0x1a2009feb8>

------------
### Selection, Filter and Typing

In [17]:
# auto clean headers
tr.set_cleaner(tr.clean.auto_clean_header(df, rename_map={'start': 'start_date'}, inplace=True))

In [18]:
# dynamically capture what will be removed through the predominant filter
report = tr.canonical_report(df, stylise=False, report_header='%_Dom', condition=">0.90")['Attribute'].tolist()
if len(report) > 0:
    tr.add_notes(note_type='attribute', label='auto_remove', text='With >90% predominance removes {}'.format(report))

# dynamically capture what will be removed through the nulls filter
report = tr.canonical_report(df, stylise=False, report_header='%_Null', condition=">0.99")['Attribute'].tolist()
if len(report) > 0:
    tr.add_notes(note_type='attribute', label='auto_remove', text='With >99% nulls removes {}'.format(report))

# auto remove columns
tr.set_cleaner(tr.clean.auto_remove_columns(df, null_min=0.99, predominant_max=0.90,inplace=True, nulls_list=['']))

In [19]:
# auto type categories
tr.set_cleaner(tr.clean.auto_to_category(df, unique_max=20, null_max=0.7, inplace=True))

In [20]:
# Typing Catagories
tr.set_cleaner(tr.clean.to_category_type(df, headers=['gender', 'profession'], inplace=True))

In [21]:
# Typing Dates 
tr.set_cleaner(tr.clean.to_date_type(df, headers='start_date', day_first=True, inplace=True))

In [22]:
# Type boolean
tr.set_cleaner(tr.clean.to_bool_type(df, bool_map={1: True}, headers='online', inplace=True))

In [23]:
# Type Floats
tr.set_cleaner(tr.clean.to_float_type(df, dtype=['float'], precision=3, inplace=True))

In [24]:
# Type Integers
tr.set_cleaner(tr.clean.to_str_type(df, dtype=['object'], nulls_list=['', 'nan'], inplace=True))

In [25]:
# Cleaners report
tr.report_cleaners()

Unnamed: 0,level,intent,parameters
0,0.0,auto_clean_header,"rename_map={'start': 'start_date'}, replace_spaces=_"
1,,auto_remove_columns,"null_min=0.99, predominant_max=0.9, nulls_list=['']"
2,,auto_to_category,"null_max=0.7, unique_max=20"
3,,to_bool_type,"headers=online, drop=False, bool_map={1: True}"
4,,to_category_type,"headers=['gender', 'profession'], drop=False"
5,,to_date_type,"headers=start_date, drop=False, as_num=False, day_first=True, year_first=False"
6,,to_float_type,"dtype=['float'], exclude=False, fillna=nan, errors=coerce, precision=3"
7,,to_str_type,"dtype=['object'], exclude=False, nulls_list=['', 'nan']"


### Bulk upload the Attribute Dictionary

In [26]:
# load the Transitioning instance
tr_dict = Transition.from_env('synthetic_customer_dictionary')


In [27]:
tr_dict.report_source()

Unnamed: 0,param,Property Source
0,connector_name,pm_data_synthetic_customer_dictionary
1,resource,config_transition_datasynthetic_customer_dictionary.yaml
2,connector_type,yaml
3,location,/Users/doatridge/code/projects/prod/discovery-transition-ds/jupyter/working/contracts/synthetic_customer_dictionary
4,module_name,ds_discovery.handlers.pandas_handlers
5,handler,PandasPersistHandler
6,modified,0
7,kwargs,


In [28]:
# get the Transitioned Canonical
df_cust_dict = tr_dict.load_clean_canonical()
tr.canonical_report(df_cust_dict, stylise=False)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations


In [29]:
# bulk upload the dictionary into our notes
tr.upload_notes(df_cust_dict, label_header='attribute', text_header='description', note_type='dictionary', selection=df.columns.to_list())

ValueError: The label header attribute can't be found in the DataFrame

In [None]:
tr.report_notes(stylise=True)

### Report, Persist and Validation

In [30]:

# persist Transitioning Contract just to ensure everything in memory is persisted to disk
tr.persist_contract()

# save the clean file
tr.save_clean_canonical(df)

In [33]:
?tr.discover.analyse_association

[0;31mSignature:[0m [0mtr[0m[0;34m.[0m[0mdiscover[0m[0;34m.[0m[0manalyse_association[0m[0;34m([0m[0mdf[0m[0;34m:[0m[0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m,[0m [0mcolumns_list[0m[0;34m:[0m[0mlist[0m[0;34m,[0m [0mexclude_associate[0m[0;34m:[0m[0mlist[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Analyses the association of Category against Values and returns a dictionary of resulting weighting
the structure of the columns_list is a list of dictionaries with the key words
    - label: the label or name of the header in the DataFrame
    - dtype: one of category|number|date indicating the origin of the data
    - chunk_size: if the weighting pattern is over the size of the data the number of chunks
    - replace_zero: if a zero reference is returned it can optionally be replaced with a low probability
and example structure might look like:
    [{'label1':

In [31]:
# check the save worked by loading back the clean canonical
df = tr.load_clean_canonical()
tr.canonical_report(df, stylise=True)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,age,object,15.0%,0.5%,425,422,Sample: 45.393 | 79.16 | 57.258
1,balance,object,0.0%,0.4%,500,496,Sample: 100.81 | 133.6 | 119.28
2,forename,object,0.0%,0.4%,500,499,Sample: Jonas | Habiba | Iva
3,gender,category,0.0%,62.2%,500,2,F|M
4,id,object,0.0%,0.2%,500,500,Sample: CU_2313118 | CU_5601884 | CU_2349662
5,last_login,object,0.0%,0.2%,500,500,Sample: 01-22-19 16:36 | 03-29-19 06:23 | 04-17-19 20:00
6,online,bool,0.0%,100.0%,500,1,False
7,profession,category,10.0%,24.2%,500,16,|Actuary|Assistant Manager|Assistant Professor|Dental Hygienist|Design Engineer|Librarian|Nurse Prac...
8,start_date,datetime64[ns],0.0%,1.4%,500,271,max=2018-12-30 00:00:00 | min=2018-01-01 00:00:00 | yr mean= 2018
9,status,category,0.0%,54.6%,500,4,Active|Closed|Pending|Suspended


---------
### Re-Running the Pipeline Contract

In [27]:
# loads the raw canonical, run the contract pipeline, save and return the clean canonical
df = tr.refresh_clean_canonical()
tr.canonical_report(df)

Unnamed: 0,Attribute,dType,%_Null,%_Dom,Count,Unique,Observations
0,age,float64,15.0%,0.5%,425,422,max=85.381 | min=20.468 | mean=47.12
1,balance,float64,0.0%,0.4%,500,496,max=992.79 | min=31.57 | mean=187.96
2,forename,object,0.0%,0.4%,500,499,Sample: Kurtis | Jarrod | Kyan
3,gender,category,0.0%,62.2%,500,2,F|M
4,id,object,0.0%,0.2%,500,500,Sample: CU_3662615 | CU_9775428 | CU_8515098
5,last_login,object,0.0%,0.2%,500,500,Sample: 04-16-19 08:32 | 04-16-19 00:33 | 03-25-19 15:36
6,online,bool,0.0%,78.8%,500,2,True | False
7,profession,category,10.0%,24.2%,450,15,Actuary|Assistant Manager|Assistant Professor|Dental Hygienist|Design Engineer|Librarian|Nurse Pract...
8,start_date,datetime64[ns],0.0%,1.4%,500,271,max=2018-12-30 00:00:00 | min=2018-01-01 00:00:00 | yr mean= 2018
9,status,category,0.0%,54.6%,500,4,Active|Closed|Pending|Suspended
